In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import nltk
from nltk.corpus import stopwords

from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

from sklearn import metrics

In [2]:
# Load the annotation table; the first CSV column becomes the row index.
all_annotation = pd.read_csv(
    './data/annotation_final.csv',
    index_col=0,
)

In [3]:
# Class balance of the 'Perceived_susceptibility' label column.
susceptibility_labels = all_annotation['Perceived_susceptibility']
Counter(susceptibility_labels)

Counter({0: 4515, 1: 485})

In [4]:
# Print the value counts of every label column, one "<title> <Counter>" entry per line.
label_columns = (
    ('HBM related', 'HBM_related'),
    ('Perceived barriers', 'Perceived_barriers'),
    ('Perceived benefits', 'Perceived_benefits'),
    ('Perceived severity', 'Perceived_severity'),
    ('Perceived susceptibility', 'Perceived_susceptibility'),
)
summary_lines = [
    f'{title} {Counter(all_annotation[column])}'
    for title, column in label_columns
]
# Entries are separated by a space followed by a newline.
print(' \n'.join(summary_lines))

HBM related Counter({1: 2739, 0: 2261}) 
Perceived barriers Counter({0: 3838, 1: 1162}) 
Perceived benefits Counter({0: 3899, 1: 1101}) 
Perceived severity Counter({0: 4512, 1: 488}) 
Perceived susceptibility Counter({0: 4515, 1: 485})


Train-test split

In [5]:
# Features come from the 'read_text_clean2' column, the target from 'HBM_related'.
texts = all_annotation['read_text_clean2']
hbm_labels = all_annotation['HBM_related']

# 20% of the rows are held out for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    hbm_labels,
    test_size=0.2,
    random_state=99,
)

Count vectorization (bag-of-words features)

In [6]:
# Bag-of-words features built from the training texts only.
# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# newer scikit-learn releases validate that type strictly, so the NLTK stop
# words are passed as a (sorted, de-duplicated) list instead of a set.
english_stop_words = sorted(set(stopwords.words('english')))

count_vect = CountVectorizer(
    stop_words=english_stop_words,
    min_df=0.01,   # ignore terms that appear in fewer than 1% of the documents
    max_df=0.99,   # ignore terms that appear in more than 99% of the documents
    # Tokens are runs of two or more ASCII letters (same pattern as before,
    # written as a raw string so the backslashes need no escaping).
    token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b',
)

# The vocabulary is learned on the training split and reused for the test split.
X_train_count = count_vect.fit_transform(X_train)
X_test_count = count_vect.transform(X_test)

X_train_count.shape, X_test_count.shape

((4000, 183), (1000, 183))

In [7]:
# Vocabulary terms in column order of the count matrix.
# `get_feature_names()` was deprecated and later removed from scikit-learn;
# prefer `get_feature_names_out()` when the installed version provides it and
# fall back to the old accessor otherwise. The result is kept as a list.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_name = list(count_vect.get_feature_names_out())
else:
    feature_name = count_vect.get_feature_names()

TF-IDF

In [8]:
# Learn the IDF weights from the training counts only ...
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_count)

# ... then re-weight both splits with that fitted transformer.
X_train_tfidf = tfidf_transformer.transform(X_train_count)
X_test_tfidf = tfidf_transformer.transform(X_test_count)

In [9]:
# Convert the TF-IDF matrices to dense arrays and the label Series to plain
# arrays; these module-level names are what the helper functions below read.
X_train_final, X_test_final = (
    matrix.toarray() for matrix in (X_train_tfidf, X_test_tfidf)
)
y_train_final, y_test_final = y_train.values, y_test.values

In [10]:
def find_best_param(clf, name, param_test):
    """Grid-search ``clf`` on the training split and return the winning parameters.

    Parameters
    ----------
    clf : estimator
        Classifier instance handed to ``GridSearchCV``.
    name : str
        Label of the classifier; accepted for symmetry with the other helpers
        but not used here.
    param_test : dict or None
        Parameter grid. A falsy value (``None`` or an empty dict) means there
        is nothing to tune.

    Returns
    -------
    dict or None
        ``best_params_`` of the 5-fold grid search fitted on the module-level
        ``X_train_final`` / ``y_train_final``, or ``None`` when no grid was given.
    """
    # Nothing to tune: skip the search entirely.
    if not param_test:
        return None

    search = GridSearchCV(estimator=clf, param_grid=param_test, n_jobs=-1, cv=5)
    search.fit(X_train_final, y_train_final)
    return search.best_params_

In [11]:
def benchmark(clf,name):
    """Fit ``clf`` on the training split and score it on that same split.

    The module-level ``X_train_final`` / ``y_train_final`` are used for both
    fitting and scoring, so the metrics are training-set metrics.

    Parameters
    ----------
    clf : estimator
        Unfitted classifier. For the names listed in ``calibrated_names`` it is
        wrapped in ``CalibratedClassifierCV`` before fitting; otherwise it is
        fitted directly (and therefore mutated).
    name : str
        Label of the classifier; selects the branch above and is echoed back.

    Returns
    -------
    tuple
        ``(name, accuracy, aucscore, precision_binary, recall_binary,
        fmeasure_binary, f1_micro, f1_macro, sensitivity, specificity,
        youden_index, threshold)``.
    """
    # Classifiers that are wrapped in CalibratedClassifierCV so that
    # predict_proba can be called on them (presumably because these linear
    # models do not provide predict_proba themselves).
    calibrated_names = {
        'Ridge_Classifier', 'LinearSVC_L1', 'LinearSVC_L2', 'Perceptron',
        'Passive_Aggressive', 'SGDClassifier_L1', 'SGDClassifier_L2',
        'Elastic_Net_penalty',
    }

    print('_' * 80)
    print("Training: ")
    print(clf)

    model = CalibratedClassifierCV(clf) if name in calibrated_names else clf
    model.fit(X_train_final, y_train_final)
    predictions = model.predict(X_train_final)
    # Computed once and reused (the positive-class column is proba[:, 1]).
    proba = model.predict_proba(X_train_final)

    # Probability cut-off derived from the training scores.
    # NOTE(review): Find_Optimal_Cutoff is not defined in the visible part of
    # this notebook; it must be defined or imported before this is called.
    threshold = Find_Optimal_Cutoff(y_train_final, proba[:,1])

    # NOTE(review): every metric below is computed from `predictions`
    # (model.predict), not from predictions thresholded at `threshold`;
    # the threshold is only returned. Confirm this is intended.
    accuracy = metrics.accuracy_score(y_train_final, predictions)
    aucscore = metrics.roc_auc_score(y_train_final, proba[:,1])

    precision_binary = metrics.precision_score(y_train_final, predictions, average='binary')
    recall_binary = metrics.recall_score(y_train_final, predictions, average='binary')
    fmeasure_binary = metrics.f1_score(y_train_final, predictions, average='binary')
    f1_micro = metrics.f1_score(y_train_final, predictions, average='micro')
    f1_macro = metrics.f1_score(y_train_final, predictions, average='macro')

    tn, fp, fn, tp = metrics.confusion_matrix(y_train_final, predictions).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    youden_index = sensitivity+specificity-1

    print()
    return name, accuracy, aucscore, precision_binary, recall_binary, fmeasure_binary, f1_micro, f1_macro, sensitivity, specificity, youden_index, threshold

In [12]:
def heldout(clf,name,thre):
    """Fit ``clf`` on the training split and score it on the held-out test split.

    Fitting uses the module-level ``X_train_final`` / ``y_train_final``;
    scoring uses ``X_test_final`` / ``y_test_final``. A classification report
    for the test split is printed.

    Parameters
    ----------
    clf : estimator
        Unfitted classifier. For the names listed in ``calibrated_names`` it is
        wrapped in ``CalibratedClassifierCV`` before fitting; otherwise it is
        fitted directly (and therefore mutated).
    name : str
        Label of the classifier; selects the branch above and is echoed back.
    thre : float
        Probability cut-off for this classifier. It is accepted for interface
        compatibility but not applied to any reported metric (see the note in
        the body).

    Returns
    -------
    tuple
        ``(name, accuracy, aucscore, precision_binary, recall_binary,
        fmeasure_binary, f1_micro, f1_macro, sensitivity, specificity,
        youden_index)``.
    """
    # Classifiers that are wrapped in CalibratedClassifierCV so that
    # predict_proba can be called on them (presumably because these linear
    # models do not provide predict_proba themselves).
    calibrated_names = {
        'Ridge_Classifier', 'LinearSVC_L1', 'LinearSVC_L2', 'Perceptron',
        'Passive_Aggressive', 'SGDClassifier_L1', 'SGDClassifier_L2',
        'Elastic_Net_penalty',
    }

    print('_' * 80)
    print("Test: ")
    print(clf)

    model = CalibratedClassifierCV(clf) if name in calibrated_names else clf
    model.fit(X_train_final, y_train_final)
    predictions = model.predict(X_test_final)
    proba = model.predict_proba(X_test_final)

    # NOTE(review): the previous version built `proba[:, 1] >= thre` into a
    # local that was never read, so `thre` had no effect on the results. The
    # metrics below keep using `predictions` (model.predict) so the reported
    # numbers are unchanged; decide whether they should use the thresholded
    # predictions instead.
    accuracy = metrics.accuracy_score(y_test_final, predictions)
    aucscore = metrics.roc_auc_score(y_test_final, proba[:,1])

    precision_binary = metrics.precision_score(y_test_final, predictions, average='binary')
    recall_binary = metrics.recall_score(y_test_final, predictions, average='binary')
    fmeasure_binary = metrics.f1_score(y_test_final, predictions, average='binary')
    f1_micro = metrics.f1_score(y_test_final, predictions, average='micro')
    f1_macro = metrics.f1_score(y_test_final, predictions, average='macro')

    tn, fp, fn, tp = metrics.confusion_matrix(y_test_final, predictions).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    youden_index = sensitivity+specificity-1

    print()
    print(metrics.classification_report(y_test_final, predictions, digits=4))
    return name, accuracy, aucscore, precision_binary, recall_binary, fmeasure_binary, f1_micro, f1_macro, sensitivity, specificity, youden_index

Cross-validated grid search to find the best hyperparameters

In [None]:
# Grid-search every classifier that has a parameter grid.
# The order of this tuple matters: best_params[i] belongs to entry i, and the
# training/testing cells read the results by position (e.g. best_params[4] is
# the Random Forest). Entries with a grid of None yield None.
#
# random_state=99 is set on Perceptron, PassiveAggressiveClassifier and
# LinearSVC as well, matching the estimators built in the training and testing
# cells, so that the selected parameters are reproducible across runs.
C_GRID = [0.01, 0.1, 1, 10, 100]

cv_candidates = (
    (RidgeClassifier(tol=1e-2, solver='sag', max_iter=1000, class_weight='balanced', random_state=99),
     "Ridge_Classifier", None),
    (Perceptron(max_iter=1000, class_weight='balanced', random_state=99),
     "Perceptron", None),
    (PassiveAggressiveClassifier(max_iter=1000, random_state=99),
     "Passive_Aggressive", {'C': C_GRID}),
    (KNeighborsClassifier(n_neighbors=10),
     "kNN", {'n_neighbors': np.linspace(1, 100, 10, dtype=int), 'p': [1, 2]}),
    (RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=99),
     "Random_forest", {'n_estimators': np.linspace(20, 300, 10, dtype=int),
                       'max_depth': np.linspace(1, 21, 5, dtype=int)}),
    (LinearSVC(penalty="l1", dual=False, tol=1e-3, class_weight='balanced', max_iter=10000, random_state=99),
     "LinearSVC_L1", {'C': C_GRID}),
    (LinearSVC(penalty="l2", dual=False, tol=1e-3, class_weight='balanced', random_state=99),
     "LinearSVC_L2", {'C': C_GRID}),
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty="l1", class_weight='balanced', random_state=99),
     "SGDClassifier_L1", None),
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty="l2", class_weight='balanced', random_state=99),
     "SGDClassifier_L2", None),
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty="elasticnet", class_weight='balanced', random_state=99),
     "Elastic_Net_penalty", None),
    (MultinomialNB(alpha=.01), "MultinomialNB", None),
    (BernoulliNB(alpha=.01), "BernoulliNB", None),
    (LogisticRegression(C=10, class_weight='balanced', max_iter=10000, random_state=99),
     "Logistic_Regression", {'C': C_GRID}),
    (SVC(C=1, class_weight='balanced', kernel='rbf', probability=True, random_state=99),
     'SVC_rbf', {'C': C_GRID}),
    (SVC(C=1, class_weight='balanced', kernel='poly', probability=True, random_state=99),
     'SVC_poly', {'C': C_GRID}),
    (SVC(C=1, class_weight='balanced', kernel='sigmoid', probability=True, random_state=99),
     'SVC_sigmoid', {'C': C_GRID}),
)

best_params = []
for clf, name, param_test in cv_candidates:
    best_param = find_best_param(clf, name, param_test)
    best_params.append(best_param)

Training

In [67]:
# Build the classifiers with the grid-searched parameters (read from
# best_params by position) and collect their training-split metrics.
sgd_train_models = [
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty=penalty, class_weight='balanced', random_state=99), label)
    for penalty, label in (
        ("l1", "SGDClassifier_L1"),
        ("l2", "SGDClassifier_L2"),
        ("elasticnet", "Elastic_Net_penalty"),
    )
]
svc_train_models = [
    (SVC(C=best_params[position]['C'], class_weight='balanced', kernel=kernel, probability=True, random_state=99), label)
    for position, kernel, label in (
        (13, 'rbf', 'SVC_rbf'),
        (14, 'poly', 'SVC_poly'),
        (15, 'sigmoid', 'SVC_sigmoid'),
    )
]

train_models = [
    (RidgeClassifier(tol=1e-2, solver='sag', max_iter=1000, class_weight='balanced', random_state=99),
     "Ridge_Classifier"),
    (Perceptron(max_iter=1000, class_weight='balanced', random_state=99),
     "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=1000, C=best_params[2]['C'], random_state=99),
     "Passive_Aggressive"),
    (KNeighborsClassifier(n_neighbors=best_params[3]['n_neighbors'], p=best_params[3]['p']),
     "kNN"),
    (RandomForestClassifier(class_weight='balanced', max_depth=best_params[4]['max_depth'],
                            n_estimators=best_params[4]['n_estimators'], random_state=99),
     "Random_forest"),
    (LinearSVC(max_iter=10000, penalty="l1", dual=False, tol=1e-3, class_weight='balanced',
               C=best_params[5]['C'], random_state=99),
     "LinearSVC_L1"),
    (LinearSVC(penalty="l2", dual=False, tol=1e-3, class_weight='balanced',
               C=best_params[6]['C'], random_state=99),
     "LinearSVC_L2"),
    *sgd_train_models,
    (MultinomialNB(alpha=.01), "MultinomialNB"),
    (BernoulliNB(alpha=.01), "BernoulliNB"),
    (LogisticRegression(class_weight='balanced', C=best_params[12]['C'], max_iter=1000, random_state=99),
     "Logistic_Regression"),
    *svc_train_models,
]

results = []
for clf, name in train_models:
    print('=' * 80)
    print(name)
    results.append(benchmark(clf,name))

# Transpose: one list per metric instead of one tuple per classifier.
results = [list(column) for column in zip(*results)]
(names, accuracy, aucscore, precision_binary, recall_binary, fmeasure_binary,
 f1_micro, f1_macro, sensitivity, specificity, youden_index, threshold) = results



Ridge_Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight='balanced', copy_X=True,
                fit_intercept=True, max_iter=1000, normalize=False,
                random_state=99, solver='sag', tol=0.01)

Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight='balanced', early_stopping=False,
           eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5,
           n_jobs=None, penalty=None, random_state=99, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

Passive_Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=0.01, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1

Testing

In [87]:
# Re-create the tuned classifiers and score them on the held-out test split.
# The list order matches the training cell, so threshold[i] (the cut-off found
# for classifier i during training) belongs to entry i here. The threshold is
# looked up by position instead of by hand-written index: previously the last
# four models were passed threshold[10]..threshold[13] (the Naive Bayes and
# first two of their own cut-offs) instead of threshold[12]..threshold[15].
test_models = (
    (RidgeClassifier(tol=1e-2, solver='sag', max_iter=1000, class_weight='balanced', random_state=99),
     "Ridge_Classifier"),
    (Perceptron(max_iter=1000, class_weight='balanced', random_state=99),
     "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=1000, C=best_params[2]['C'], random_state=99),
     "Passive_Aggressive"),
    (KNeighborsClassifier(n_neighbors=best_params[3]['n_neighbors'], p=best_params[3]['p']),
     "kNN"),
    (RandomForestClassifier(class_weight='balanced', max_depth=best_params[4]['max_depth'],
                            n_estimators=best_params[4]['n_estimators'], random_state=99),
     "Random_forest"),
    (LinearSVC(penalty="l1", dual=False, tol=1e-3, class_weight='balanced',
               C=best_params[5]['C'], max_iter=10000, random_state=99),
     "LinearSVC_L1"),
    (LinearSVC(penalty="l2", dual=False, tol=1e-3, class_weight='balanced',
               C=best_params[6]['C'], max_iter=1000, random_state=99),
     "LinearSVC_L2"),
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty="l1", class_weight='balanced', random_state=99),
     "SGDClassifier_L1"),
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty="l2", class_weight='balanced', random_state=99),
     "SGDClassifier_L2"),
    (SGDClassifier(alpha=.0001, max_iter=1000, penalty="elasticnet", class_weight='balanced', random_state=99),
     "Elastic_Net_penalty"),
    (MultinomialNB(alpha=.01), "MultinomialNB"),
    (BernoulliNB(alpha=.01), "BernoulliNB"),
    (LogisticRegression(class_weight='balanced', C=best_params[12]['C'], max_iter=1000, random_state=99),
     "Logistic_Regression"),
    (SVC(class_weight='balanced', C=best_params[13]['C'], kernel='rbf', probability=True, random_state=99),
     'SVC_rbf'),
    (SVC(C=best_params[14]['C'], class_weight='balanced', kernel='poly', probability=True, random_state=99),
     'SVC_poly'),
    (SVC(C=best_params[15]['C'], class_weight='balanced', kernel='sigmoid', probability=True, random_state=99),
     'SVC_sigmoid'),
)

results2 = []
for position, (clf, name) in enumerate(test_models):
    thre = threshold[position]
    print('=' * 80)
    print(name)
    results2.append(heldout(clf,name,thre))

# Transpose: one list per metric instead of one tuple per classifier.
results2 = [[x[i] for x in results2] for i in range(11)]
names2, accuracy2, aucscore2, precision_binary2, recall_binary2, fmeasure_binary2, f1_micro2, f1_macro2, sensitivity2, specificity2, youden_index2  = results2


Ridge_Classifier
________________________________________________________________________________
Test: 
RidgeClassifier(alpha=1.0, class_weight='balanced', copy_X=True,
                fit_intercept=True, max_iter=1000, normalize=False,
                random_state=99, solver='sag', tol=0.01)

              precision    recall  f1-score   support

           0     0.8534    0.7831    0.8167       461
           1     0.8267    0.8850    0.8548       539

    accuracy                         0.8380      1000
   macro avg     0.8401    0.8340    0.8358      1000
weighted avg     0.8390    0.8380    0.8373      1000

Perceptron
________________________________________________________________________________
Test: 
Perceptron(alpha=0.0001, class_weight='balanced', early_stopping=False,
           eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5,
           n_jobs=None, penalty=None, random_state=99, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, war


              precision    recall  f1-score   support

           0     0.7863    0.8460    0.8150       461
           1     0.8591    0.8033    0.8303       539

    accuracy                         0.8230      1000
   macro avg     0.8227    0.8247    0.8227      1000
weighted avg     0.8255    0.8230    0.8233      1000

Logistic_Regression
________________________________________________________________________________
Test: 
LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=99, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

              precision    recall  f1-score   support

           0     0.8352    0.8134    0.8242       461
           1     0.8439    0.8627    0.8532       539

    accuracy                         0.8400      1000
   macro avg    

Test results

In [97]:
# Collect the held-out metrics into one table with a row per classifier.
metric_lists = (
    accuracy2, aucscore2, precision_binary2, recall_binary2, fmeasure_binary2,
    f1_micro2, f1_macro2, sensitivity2, specificity2, youden_index2,
)

# First entry is the classifier names; every metric list is turned into a
# one-column frame and averaged row-wise, giving one value per classifier.
table_rows = [names2]
table_rows.extend(pd.DataFrame(values).mean(axis=1) for values in metric_lists)

# The entries are stacked as rows, so transpose to get one column per metric.
BoW_distant_test_gridSearch = pd.DataFrame(table_rows).T
BoW_distant_test_gridSearch.columns = [
    'clf_names', 'accuracy', 'aucscore', 'precision_binary', 'recall_binary',
    'f1_binary', 'f1_micro', 'f1_macro', 'sensitivity', 'specificity',
    'youden_index',
]

BoW_distant_test_gridSearch

Unnamed: 0,clf_names,accuracy,aucscore,precision_binary,recall_binary,f1_binary,f1_micro,f1_macro,sensitivity,specificity,youden_index
0,Ridge_Classifier,0.838,0.898225,0.82669,0.884972,0.854839,0.838,0.83579,0.884972,0.78308,0.668052
1,Perceptron,0.826,0.889286,0.813036,0.879406,0.84492,0.826,0.823371,0.879406,0.763557,0.642964
2,Passive_Aggressive,0.851,0.886437,0.846975,0.883117,0.864668,0.851,0.849464,0.883117,0.813449,0.696566
3,kNN,0.707,0.713447,0.78341,0.630798,0.698869,0.707,0.706786,0.630798,0.796095,0.426893
4,Random_forest,0.841,0.903799,0.847985,0.858998,0.853456,0.841,0.839843,0.858998,0.819957,0.678955
5,LinearSVC_L1,0.828,0.900861,0.818024,0.875696,0.845878,0.828,0.825654,0.875696,0.772234,0.64793
6,LinearSVC_L2,0.842,0.89443,0.834798,0.881262,0.857401,0.842,0.840135,0.881262,0.796095,0.677357
7,SGDClassifier_L1,0.85,0.894015,0.837088,0.896104,0.865591,0.85,0.847954,0.896104,0.796095,0.692199
8,SGDClassifier_L2,0.85,0.894603,0.84063,0.890538,0.864865,0.85,0.848163,0.890538,0.802603,0.693141
9,Elastic_Net_penalty,0.85,0.893625,0.838261,0.894249,0.86535,0.85,0.848025,0.894249,0.798265,0.692513


Parameters for the best classifier (Random Forest in this case)

In [98]:
# Grid-searched tree depth of the Random Forest (entry 4 of best_params).
random_forest_best = best_params[4]
random_forest_best['max_depth']

21

In [99]:
# Grid-searched number of trees of the Random Forest (entry 4 of best_params).
random_forest_best = best_params[4]
random_forest_best['n_estimators']

82