In [50]:
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np
import random
import csv
import string
# from google.colab import drive
# drive.mount('/content/drive')

%autosave 60
doPrint=True
np.random.seed(10)
random.seed(10)

Sets = ["Train", "Test", "Valid"]
Data = {s: [] for s in Sets}
removePunc = str.maketrans("","", string.punctuation)
for s in Sets:
    with open(f'medical_dataset/{s.lower()}.csv', encoding="utf8") as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        for row in csv_reader:
            T = row["text"].translate(removePunc).lower().split()
            Data[s].append({"T": T,
                            "L": int(row["label"]),
                            "V": []})            
            
Ndic = 10000
All=[]
for d in Data["Train"]:
    for w in d["T"]:
        All.append(w)
V,C = np.unique(All,return_counts=True)
Arg = np.argsort(C)
Dict = list(V[Arg[::-1][:Ndic]])
Count= C[Arg][::-1][:Ndic]
np.savetxt("medical_text-vocab.txt", np.column_stack((Dict,range(1, Ndic+1), Count)),delimiter="\t",header="Dict, Frequency, Count", fmt="%1s")

for s in Sets:
    with open(f"medical_text-{s.lower()}.txt", mode='w', newline='') as Writer:
        if doPrint:
            W = csv.writer(Writer)
        for d in Data[s]:
            d["V"] = np.zeros((Ndic), dtype=np.int8)
            NW = []
            for t in d["T"]:
                try:
                    nw = Dict.index(t)
                    d["V"][nw] = d["V"][nw]+1
                    NW.append(nw)
                except:
                    pass
            if doPrint:
                Writer.write(" ".join(str(np.array(NW)+1)[1:-1].split())+'\t'+str(d["L"])+'\n')

            
def CVdata(gs):
    means, stds, params = gs.cv_results_['mean_test_score'], gs.cv_results_['std_test_score'], gs.cv_results_['params']
    for m, s, p in zip(means, stds, params):
        print(f"{m:.3f} (+/-{s*2:.3f}) for {p}")
    print()    


Autosaving every 60 seconds


In [49]:
" ".join(str(np.array(NW)+1)[1:-1].split())+'\t'+str(1)+'\n'

'2782 637 888 ... 2187 4681 1227\t1\n'

In [None]:
y_t = np.array([d["L"] for d in Data["Train"]])-1
X_t = np.array([d["V"] for d in Data["Train"]])>0

y_v = np.array([d["L"] for d in Data["Valid"]])-1
X_v = np.array([d["V"] for d in Data["Valid"]])>0

X_train = np.append(X_t, X_v, axis=0)
y_train = np.append(y_t, y_v, axis=0)

y_test =  np.array([d["L"] for d in Data["Test"]])-1
X_test = np.array([d["V"] for d in Data["Test"]])>0

# tf = -np.ones_like(y_t)
# tf = np.append(tf, np.zeros_like(y_v), axis=0)

# cv = PredefinedSplit(tf)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'f1_macro'

In [None]:
y_pred_random = np.random.randint(0,4,size=(len(y_test)))
print(f"Accuracy in random approach: {f1_score(y_test,y_pred_random, average='macro')*100:.2f}%")

U = np.column_stack(np.unique([d["L"] for d in Data["Train"] ], return_counts=True))
y_pred_mostFreq = [max(U, key=lambda x:x[1])[0]] * len(y_test)
print(f"Accuracy in most frequent approach (Class{y_pred_mostFreq[0]}): {f1_score(y_test,np.array(y_pred_mostFreq)-1, average='macro')*100:.2f}%")

Accuracy in random approach: 25.59%
Accuracy in most frequent approach (Class1): 14.18%


In [None]:
Params_BNB = [
    {
        'alpha': 10**np.linspace(0,1,21)
    }
]
gsBNB = GridSearchCV(BernoulliNB(), Params_BNB, scoring=scoring, cv=cv, n_jobs=-1, refit=True)
gsBNB.fit(X_train, y_train)

print(f"Best hyper-parameters: {gsBNB.best_params_} with Best {scoring} of {gsBNB.best_score_*100:.2f}% in cross-validation.\n")
print(f'f1-macro on Train set: {f1_score(y_t, gsBNB.predict(X_t), average="macro")*100:.2f}%')
print(f'f1-macro on Valid set: {f1_score(y_v, gsBNB.predict(X_v), average="macro")*100:.2f}%')
print(f'f1-macro on Test  set: {f1_score(y_test, gsBNB.predict(X_test), average="macro")*100:.2f}%')

print('All Parameters:')
print(Params_BNB)
# CVdata(gsBNB)

Best hyper-parameters: {'alpha': 1.2589254117941673} with Best f1_macro of 46.51% in cross-validation.

f1-macro on Train set: 51.71%
f1-macro on Valid set: 50.55%
f1-macro on Test  set: 45.95%
All Parameters:
[{'alpha': array([ 1.        ,  1.12201845,  1.25892541,  1.41253754,  1.58489319,
        1.77827941,  1.99526231,  2.23872114,  2.51188643,  2.81838293,
        3.16227766,  3.54813389,  3.98107171,  4.46683592,  5.01187234,
        5.62341325,  6.30957344,  7.07945784,  7.94328235,  8.91250938,
       10.        ])}]


In [None]:
# a
params_DTC_noPruning = [
    {
        'min_samples_leaf': [2, 10, 50],  # 81
        'max_leaf_nodes': [50, 75, 100],  # 50
        'max_depth': [20, 25, 35, 60],  # 10
        'random_state': [0]
    }

]
gsDTC_noPruning = GridSearchCV(DecisionTreeClassifier(), params_DTC_noPruning, scoring=scoring, cv=cv, n_jobs=-1, refit=True, verbose=2)
gsDTC_noPruning.fit(X_train, y_train)

print(f"Best hyper-parameters: {gsDTC_noPruning.best_params_} with Best {scoring} of {gsDTC_noPruning.best_score_*100:.2f}% in cross-validation.\n")
print(f'f1-macro on Train set: {f1_score(y_t, gsDTC_noPruning.predict(X_t), average="macro")*100:.2f}%')
print(f'f1-macro on Valid set: {f1_score(y_v, gsDTC_noPruning.predict(X_v), average="macro")*100:.2f}%')
print(f'f1-macro on Test  set: {f1_score(y_test, gsDTC_noPruning.predict(X_test), average="macro")*100:.2f}%')


print('All Parameters:')
print(params_DTC_noPruning)
# CVdata(gsDTC_noPruning)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 12.5min finished


Best hyper-parameters: {'max_depth': 25, 'max_leaf_nodes': 75, 'min_samples_leaf': 2, 'random_state': 0} with Best f1_macro of 61.37% in cross-validation.

f1-macro on Train set: 67.71%
f1-macro on Valid set: 64.78%
f1-macro on Test  set: 62.01%
All Parameters:
[{'min_samples_leaf': [2, 10, 50], 'max_leaf_nodes': [50, 75, 100], 'max_depth': [20, 25, 35, 60], 'random_state': [0]}]


In [None]:
params_DTC_postPruning = [
    {
        'criterion': ['entropy'], 
        'random_state': [0],
        'ccp_alpha': 10**np.linspace(-3,-1,21)
    }
]
gsDTC_postPruning = GridSearchCV(DecisionTreeClassifier(), params_DTC_postPruning, scoring=scoring, cv=cv, n_jobs=-1, refit=True)
gsDTC_postPruning.fit(X_train, y_train)
print(f"Best hyper-parameters: {gsDTC_postPruning.best_params_} with Best {scoring} of {gsDTC_postPruning.best_score_*100:.2f}% in cross-validation.\n")
print(f'f1-macro on Train set: {f1_score(y_t, gsDTC_postPruning.predict(X_t), average="macro")*100:.2f}%')
print(f'f1-macro on Valid set: {f1_score(y_v, gsDTC_postPruning.predict(X_v), average="macro")*100:.2f}%')
print(f'f1-macro on Test  set: {f1_score(y_test, gsDTC_postPruning.predict(X_test), average="macro")*100:.2f}%')


print('All Parameters:')
print(params_DTC_postPruning)
# CVdata(gsDTC_postPruning)

Best hyper-parameters: {'ccp_alpha': 0.0031622776601683794, 'criterion': 'entropy', 'random_state': 0} with Best f1_macro of 59.70% in cross-validation.

f1-macro on Train set: 65.79%
f1-macro on Valid set: 64.41%
f1-macro on Test  set: 60.79%
All Parameters:
[{'criterion': ['entropy'], 'random_state': [0], 'ccp_alpha': array([0.001     , 0.00125893, 0.00158489, 0.00199526, 0.00251189,
       0.00316228, 0.00398107, 0.00501187, 0.00630957, 0.00794328,
       0.01      , 0.01258925, 0.01584893, 0.01995262, 0.02511886,
       0.03162278, 0.03981072, 0.05011872, 0.06309573, 0.07943282,
       0.1       ])}]


In [None]:
cv2 = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)
params_LR = [
    {
        "solver": ['saga'],
        "penalty":['l1'],
        "C": [1, 5, 10, 25, 50],
        "random_state": [0]
     },
     {
        "solver": ['saga'],
        "penalty":['l2'],
        "C": [1, 5, 10, 25, 50],
        "random_state": [0]
     }
]
gsLR = GridSearchCV(LogisticRegression(), params_LR, scoring=scoring, cv=cv2,n_jobs=-1, refit=True, verbose= 2) ######## 0.6 test 'accuracy' .8075 valid
gsLR.fit(X_train, y_train)
print(f"Best hyper-parameters: {gsLR.best_params_} with Best {scoring} of {gsLR.best_score_*100:.2f}% in cross-validation.\n")
print(f'f1-macro on Train set: {f1_score(y_t, gsLR.predict(X_t), average="macro")*100:.2f}%')
print(f'f1-macro on Valid set: {f1_score(y_v, gsLR.predict(X_v), average="macro")*100:.2f}%')
print(f'f1-macro on Test  set: {f1_score(y_test, gsLR.predict(X_test), average="macro")*100:.2f}%')


print('All Parameters:')
print(params_LR)
# CVdata(gsLR)

Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 103.3min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 166.5min finished


Best hyper-parameters: {'C': 1, 'penalty': 'l1', 'random_state': 0, 'solver': 'saga'} with Best f1_macro of 52.70% in cross-validation.

f1-macro on Train set: 79.77%
f1-macro on Valid set: 79.77%
f1-macro on Test  set: 52.12%
All Parameters:
[{'solver': ['saga'], 'penalty': ['l1'], 'C': [1, 5, 10, 25, 50], 'random_state': [0]}, {'solver': ['saga'], 'penalty': ['l2'], 'C': [1, 5, 10, 25, 50], 'random_state': [0]}]


In [None]:
# a
from sklearn.svm import LinearSVC
params_SVC = [
     {
        'penalty': ['l2'],
        'loss': ['hinge', 'squared_hinge'],
        "C": [5, 10, 20, 35, 50, 75, 120],
        'random_state': [0]
    },
    {
        'penalty': ['l1'],
        'loss': ['squared_hinge'],
        "C": [5, 10, 20, 35, 50, 75, 120],
        'random_state': [0]
    }]

gsSVC = GridSearchCV(LinearSVC(), params_SVC, scoring=scoring, cv=cv, n_jobs=-1, refit=True, verbose= 2)
gsSVC.fit(X_train, y_train)
print(f"Best hyper-parameters: {gsSVC.best_params_} with Best {scoring} of {gsSVC.best_score_*100:.2f}% in cross-validation.\n")
print(f'f1-macro on Train set: {f1_score(y_t, gsSVC.predict(X_t), average="macro")*100:.2f}%')
print(f'f1-macro on Valid set: {f1_score(y_v, gsSVC.predict(X_v), average="macro")*100:.2f}%')
print(f'f1-macro on Test  set: {f1_score(y_test, gsSVC.predict(X_test), average="macro")*100:.2f}%')


print('All Parameters:')
print(params_SVC)
# CVdata(gsSVC)


Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:  4.8min finished


Best hyper-parameters: {'C': 10, 'loss': 'squared_hinge', 'penalty': 'l2', 'random_state': 0} with Best f1_macro of 54.73% in cross-validation.

f1-macro on Train set: 79.81%
f1-macro on Valid set: 78.66%
f1-macro on Test  set: 51.18%
All Parameters:
[{'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'C': [5, 10, 20, 35, 50, 75, 120], 'random_state': [0]}, {'penalty': ['l1'], 'loss': ['squared_hinge'], 'C': [5, 10, 20, 35, 50, 75, 120], 'random_state': [0]}]


