# print data!

In [None]:
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np
import random
import csv
import string
from google.colab import drive
drive.mount('/content/drive')

# Notebook-wide settings.
# Ask the notebook front end to autosave every 60 seconds (comment kept on its
# own line so it is not passed to the magic as an argument).
%autosave 60
# When True, the encoded documents are also written out as
# medical_text-<split>.txt lines by the encoding loop further down.
doPrint=True
# Seed both NumPy's and Python's global random generators so random draws
# (e.g. the random baseline) are repeatable between runs.
np.random.seed(10)
random.seed(10)

# Read the three CSV splits into Data[split], one record per CSV row:
#   "T": tokens of the "text" column (punctuation stripped, lower-cased,
#        split on whitespace)
#   "L": the "label" column converted to int
#   "V": empty placeholder, overwritten with the count vector by the
#        encoding loop later in this cell
Sets = ["Train", "Test", "Valid"]
removePunc = str.maketrans("", "", string.punctuation)
Data = {}
for s in Sets:
    split_path = f'medical_dataset/{s.lower()}.csv'
    with open(split_path, encoding="utf8") as csv_file:
        Data[s] = [
            {
                "T": row["text"].translate(removePunc).lower().split(),
                "L": int(row["label"]),
                "V": [],
            }
            for row in csv.DictReader(csv_file)
        ]
            
# Build the vocabulary from the training split only: the Ndic most frequent
# tokens, most frequent first. Dict[i] is the token assigned to feature
# column i; Count[i] is its number of occurrences in the training split.
Ndic = 10000
All = [w for d in Data["Train"] for w in d["T"]]
V, C = np.unique(All, return_counts=True)
Arg = np.argsort(C)
Dict = list(V[Arg[::-1][:Ndic]])
Count = C[Arg][::-1][:Ndic]

# Columns written: token, 1-based frequency rank, occurrence count.
# The rank column is sized from len(Dict) instead of Ndic: when the training
# split holds fewer than Ndic distinct tokens, Dict/Count are shorter than
# Ndic and column_stack would raise on the length mismatch.
np.savetxt(
    "medical_text-vocab.txt",
    np.column_stack((Dict, range(1, len(Dict) + 1), Count)),
    delimiter="\t",
    header="Dict, Frequency, Count",
    fmt="%1s",
)

# Encode every document of every split:
#   * d["V"] becomes a length-Ndic vector of per-token counts (column i counts
#     occurrences of Dict[i]); tokens outside the vocabulary are skipped.
#   * when doPrint is set, each document is also exported as one line of
#     medical_text-<split>.txt: the 1-based vocabulary ids of its in-vocabulary
#     tokens (in reading order, space separated), a tab, then the label.

# One dict lookup per token instead of Dict.index(), which rescans the whole
# vocabulary list for every token of every document.
word_to_index = {str(w): i for i, w in enumerate(Dict)}

for s in Sets:
    encoded_lines = []
    for d in Data[s]:
        # int32 counters: with int8 a token occurring more than 127 times in a
        # single document silently wraps around to a negative count.
        d["V"] = np.zeros((Ndic), dtype=np.int32)
        NW = []
        for t in d["T"]:
            nw = word_to_index.get(t)
            if nw is None:
                # Out-of-vocabulary token: ignored on purpose (this is the only
                # case the former bare `except: pass` was meant to cover).
                continue
            d["V"][nw] += 1
            NW.append(nw)
        if doPrint:
            # Join the ids directly. Going through str(np.array(...)) abbreviates
            # arrays of more than 1000 elements with "...", which corrupted the
            # exported line for long documents.
            ids = " ".join(str(i + 1) for i in NW)
            encoded_lines.append(ids + '\t' + str(d["L"]) + '\n')

    # Only touch the output file when exporting was requested, so a run with
    # doPrint=False no longer truncates a previously written export.
    if doPrint:
        with open(f"medical_text-{s.lower()}.txt", mode='w', newline='') as Writer:
            Writer.writelines(encoded_lines)

            
def CVdata(gs):
    """Print one line per hyper-parameter candidate of a finished search.

    Each line shows the candidate's mean test score, twice its standard
    deviation, and its parameter dict; a blank line is printed at the end.

    Args:
        gs: object exposing ``cv_results_`` with the keys 'mean_test_score',
            'std_test_score' and 'params' (equal-length sequences).
    """
    results = gs.cv_results_
    per_candidate = zip(
        results['mean_test_score'],
        results['std_test_score'],
        results['params'],
    )
    for mean_score, std_score, candidate in per_candidate:
        print(f"{mean_score:.3f} (+/-{std_score*2:.3f}) for {candidate}")
    print()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Autosaving every 60 seconds


In [None]:
def _to_term_frequencies(docs):
    """Stack the documents' "V" count vectors and scale each row to sum to 1.

    Args:
        docs: list of records whose "V" entry is a 1-D count vector.

    Returns:
        2-D float64 array with one row per document. A row whose counts are
        all zero (no in-vocabulary token) is returned as zeros; dividing it by
        its zero total would yield NaN, which most scikit-learn estimators
        refuse to fit on.
    """
    counts = np.asarray([d["V"] for d in docs], dtype=np.float64)
    totals = counts.sum(axis=1, keepdims=True)
    totals[totals == 0] = 1.0  # leave all-zero rows untouched instead of 0/0
    return counts / totals


def _zero_based_labels(docs):
    """Return the documents' "L" labels shifted down by one as an array."""
    return np.array([d["L"] for d in docs]) - 1


y_t = _zero_based_labels(Data["Train"])
X_t = _to_term_frequencies(Data["Train"])

y_v = _zero_based_labels(Data["Valid"])
X_v = _to_term_frequencies(Data["Valid"])

# The searches below cross-validate on Train and Valid pooled together.
X_train = np.concatenate((X_t, X_v), axis=0)
y_train = np.concatenate((y_t, y_v), axis=0)

y_test = _zero_based_labels(Data["Test"])
X_test = _to_term_frequencies(Data["Test"])


# Alternative kept for reference: a fixed Train/Valid split instead of k-fold.
# tf = -np.ones_like(y_t)
# tf = np.append(tf, np.zeros_like(y_v), axis=0)

# cv = PredefinedSplit(tf)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'f1_macro'

In [None]:
# Two trivial baselines, both scored with macro-averaged F1 on the test set
# (the printed label previously said "Accuracy" although f1_score is reported).

# U has one row per distinct training label: [label, number of occurrences].
train_labels = [d["L"] for d in Data["Train"]]
U = np.column_stack(np.unique(train_labels, return_counts=True))

# Baseline 1: uniformly random class per test document. The class count comes
# from the training labels instead of a hard-coded 4.
# NOTE(review): assumes the raw labels are exactly 1..K, matching the "-1"
# shift used to build y_test - confirm against the dataset.
n_classes = len(U)
y_pred_random = np.random.randint(0, n_classes, size=(len(y_test)))
print(f"Macro-F1 of random approach: {f1_score(y_test, y_pred_random, average='macro')*100:.2f}%")

# Baseline 2: always predict the most frequent training label. The list holds
# the raw (unshifted) label, hence the "-1" when scoring against y_test.
most_frequent_label = U[np.argmax(U[:, 1]), 0]
y_pred_mostFreq = [most_frequent_label] * len(y_test)
print(f"Macro-F1 of most frequent approach (Class {y_pred_mostFreq[0]}): {f1_score(y_test, np.array(y_pred_mostFreq)-1, average='macro')*100:.2f}%")

Accuracy in random approach: 25.59%
Accuracy in most frequent approach (Class 1): 14.18%


In [None]:
# Gaussian naive Bayes: grid-search var_smoothing over 10 log-spaced values
# between 10**-3 and 10**-2.5, scored with `scoring` under the `cv` splitter.
gnb_exponents = np.linspace(-3, -2.5, 10)
Params_GNB = [{'var_smoothing': 10**gnb_exponents}]

gsGNB = GridSearchCV(
    GaussianNB(),
    Params_GNB,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    refit=True,
)
gsGNB.fit(X_train, y_train)

print(f"Best hyper-parameters: {gsGNB.best_params_} with Best {scoring} of {gsGNB.best_score_*100:.2f}% in cross-validation.\n")

# The refitted model was trained on X_train (Train and Valid pooled), so only
# the Test line below is measured on data the model has not seen.
for split_label, X_split, y_split in (("Train", X_t, y_t), ("Valid", X_v, y_v), ("Test ", X_test, y_test)):
    split_f1 = f1_score(y_split, gsGNB.predict(X_split), average="macro")
    print(f'f1-macro on {split_label} set: {split_f1*100:.2f}%')

print('All Parameters:')
print(Params_GNB)
# Uncomment to list the cross-validation score of every candidate:
# CVdata(gsGNB)

Best hyper-parameters: {'var_smoothing': 0.0014677992676220691} with Best f1_macro of 44.27% in cross-validation.

f1-macro on Train set: 54.26%
f1-macro on Valid set: 53.18%
f1-macro on Test  set: 44.13%
All Parameters:
[{'var_smoothing': array([0.001     , 0.00113646, 0.00129155, 0.0014678 , 0.0016681 ,
       0.00189574, 0.00215443, 0.00244844, 0.00278256, 0.00316228])}]


In [None]:
# Decision tree constrained only by structural limits (leaf size, leaf count,
# depth); ccp_alpha is not part of this grid.
# NOTE(review): the trailing numbers (81 / 50 / 10) are carried over from the
# original comments; their meaning is not recorded here.
dtc_structure_grid = {
    'min_samples_leaf': [2, 5, 10, 50],  # 81
    'max_leaf_nodes': [35, 50, 75],  # 50
    'max_depth': [10, 15, 20, 25],  # 10
    'random_state': [0],
}
params_DTC_noPruning = [dtc_structure_grid]

gsDTC_noPruning = GridSearchCV(
    DecisionTreeClassifier(),
    params_DTC_noPruning,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    refit=True,
    verbose=2,
)
gsDTC_noPruning.fit(X_train, y_train)

print(f"Best hyper-parameters: {gsDTC_noPruning.best_params_} with Best {scoring} of {gsDTC_noPruning.best_score_*100:.2f}% in cross-validation.\n")

# The refitted model was trained on X_train (Train and Valid pooled), so only
# the Test line below is measured on data the model has not seen.
for split_label, X_split, y_split in (("Train", X_t, y_t), ("Valid", X_v, y_v), ("Test ", X_test, y_test)):
    split_f1 = f1_score(y_split, gsDTC_noPruning.predict(X_split), average="macro")
    print(f'f1-macro on {split_label} set: {split_f1*100:.2f}%')

print('All Parameters:')
print(params_DTC_noPruning)
# Uncomment to list the cross-validation score of every candidate:
# CVdata(gsDTC_noPruning)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 18.1min finished


Best hyper-parameters: {'max_depth': 20, 'max_leaf_nodes': 50, 'min_samples_leaf': 2, 'random_state': 0} with Best f1_macro of 62.21% in cross-validation.

f1-macro on Train set: 67.35%
f1-macro on Valid set: 66.88%
f1-macro on Test  set: 64.47%
All Parameters:
[{'min_samples_leaf': [2, 5, 10, 50], 'max_leaf_nodes': [35, 50, 75], 'max_depth': [10, 15, 20, 25], 'random_state': [0]}]


In [None]:
# Decision tree with the entropy criterion, tuned only through ccp_alpha:
# 21 log-spaced values from 10**-4 to 10**0.
ccp_exponents = np.linspace(-4, 0, 21)
params_DTC_postPruning = [
    {
        'criterion': ['entropy'],
        'random_state': [0],
        'ccp_alpha': 10**ccp_exponents,
    }
]

gsDTC_postPruning = GridSearchCV(
    DecisionTreeClassifier(),
    params_DTC_postPruning,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    refit=True,
)
gsDTC_postPruning.fit(X_train, y_train)
print(f"Best hyper-parameters: {gsDTC_postPruning.best_params_} with Best {scoring} of {gsDTC_postPruning.best_score_*100:.2f}% in cross-validation.\n")

# The refitted model was trained on X_train (Train and Valid pooled), so only
# the Test line below is measured on data the model has not seen.
for split_label, X_split, y_split in (("Train", X_t, y_t), ("Valid", X_v, y_v), ("Test ", X_test, y_test)):
    split_f1 = f1_score(y_split, gsDTC_postPruning.predict(X_split), average="macro")
    print(f'f1-macro on {split_label} set: {split_f1*100:.2f}%')

print('All Parameters:')
print(params_DTC_postPruning)
# Uncomment to list the cross-validation score of every candidate:
# CVdata(gsDTC_postPruning)

Best hyper-parameters: {'ccp_alpha': 0.003981071705534973, 'criterion': 'entropy', 'random_state': 0} with Best f1_macro of 59.57% in cross-validation.

f1-macro on Train set: 65.49%
f1-macro on Valid set: 62.29%
f1-macro on Test  set: 63.59%
All Parameters:
[{'criterion': ['entropy'], 'random_state': [0], 'ccp_alpha': array([1.00000000e-04, 1.58489319e-04, 2.51188643e-04, 3.98107171e-04,
       6.30957344e-04, 1.00000000e-03, 1.58489319e-03, 2.51188643e-03,
       3.98107171e-03, 6.30957344e-03, 1.00000000e-02, 1.58489319e-02,
       2.51188643e-02, 3.98107171e-02, 6.30957344e-02, 1.00000000e-01,
       1.58489319e-01, 2.51188643e-01, 3.98107171e-01, 6.30957344e-01,
       1.00000000e+00])}]


In [None]:
# Logistic regression with the saga solver: one grid per penalty (l1 first,
# then l2), each over the same C values, evaluated with its own 8-fold
# stratified splitter.
cv2 = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)
params_LR = [
    {
        "solver": ['saga'],
        "penalty": [lr_penalty],
        "C": [1, 5, 10, 25, 50],
        "random_state": [0],
    }
    for lr_penalty in ('l1', 'l2')
]

gsLR = GridSearchCV(
    LogisticRegression(),
    params_LR,
    scoring=scoring,
    cv=cv2,
    n_jobs=-1,
    refit=True,
    verbose=2,
)
gsLR.fit(X_train, y_train)
print(f"Best hyper-parameters: {gsLR.best_params_} with Best {scoring} of {gsLR.best_score_*100:.2f}% in cross-validation.\n")

# The refitted model was trained on X_train (Train and Valid pooled), so only
# the Test line below is measured on data the model has not seen.
for split_label, X_split, y_split in (("Train", X_t, y_t), ("Valid", X_v, y_v), ("Test ", X_test, y_test)):
    split_f1 = f1_score(y_split, gsLR.predict(X_split), average="macro")
    print(f'f1-macro on {split_label} set: {split_f1*100:.2f}%')

print('All Parameters:')
print(params_LR)
# Uncomment to list the cross-validation score of every candidate:
# CVdata(gsLR)

Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 83.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 124.9min finished


Best hyper-parameters: {'C': 25, 'penalty': 'l1', 'random_state': 0, 'solver': 'saga'} with Best f1_macro of 48.84% in cross-validation.

f1-macro on Train set: 55.43%
f1-macro on Valid set: 52.82%
f1-macro on Test  set: 47.26%
All Parameters:
[{'solver': ['saga'], 'penalty': ['l1'], 'C': [1, 5, 10, 25, 50], 'random_state': [0]}, {'solver': ['saga'], 'penalty': ['l2'], 'C': [1, 5, 10, 25, 50], 'random_state': [0]}]


In [None]:
# a
from sklearn.svm import LinearSVC
# Linear SVM: an l2-penalty grid over both losses and an l1-penalty grid
# (squared hinge only), each over the same C values.
svc_C_values = [5, 10, 20, 35, 50, 75, 120]
params_SVC = [
    {
        'penalty': ['l2'],
        'loss': ['hinge', 'squared_hinge'],
        "C": list(svc_C_values),
        'random_state': [0]
    },
    {
        'penalty': ['l1'],
        'loss': ['squared_hinge'],
        # liblinear only solves the l1 / squared_hinge problem in its primal
        # form. Without dual=False every candidate of this grid raises during
        # fit on releases where LinearSVC defaults to dual=True, so the l1
        # penalty was never actually evaluated by the search.
        'dual': [False],
        "C": list(svc_C_values),
        'random_state': [0]
    }]

gsSVC = GridSearchCV(LinearSVC(), params_SVC, scoring=scoring, cv=cv, n_jobs=-1, refit=True, verbose= 2)
gsSVC.fit(X_train, y_train)
print(f"Best hyper-parameters: {gsSVC.best_params_} with Best {scoring} of {gsSVC.best_score_*100:.2f}% in cross-validation.\n")

# The refitted model was trained on X_train (Train and Valid pooled), so only
# the Test line below is measured on data the model has not seen.
for split_label, X_split, y_split in (("Train", X_t, y_t), ("Valid", X_v, y_v), ("Test ", X_test, y_test)):
    split_f1 = f1_score(y_split, gsSVC.predict(X_split), average="macro")
    print(f'f1-macro on {split_label} set: {split_f1*100:.2f}%')

print('All Parameters:')
print(params_SVC)
# Uncomment to list the cross-validation score of every candidate:
# CVdata(gsSVC)


Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:  4.5min finished


Best hyper-parameters: {'C': 20, 'loss': 'squared_hinge', 'penalty': 'l2', 'random_state': 0} with Best f1_macro of 44.77% in cross-validation.

f1-macro on Train set: 59.06%
f1-macro on Valid set: 57.21%
f1-macro on Test  set: 40.24%
All Parameters:
[{'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'C': [5, 10, 20, 35, 50, 75, 120], 'random_state': [0]}, {'penalty': ['l1'], 'loss': ['squared_hinge'], 'C': [5, 10, 20, 35, 50, 75, 120], 'random_state': [0]}]
