In [1]:
import numpy as np
import seaborn as sn
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, matthews_corrcoef
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import sys
import os.path

In [2]:
# Load the two labelled abstract collections.
#
# The datasets/ folder is still resolved against sys.path[0], but through
# os.path.join instead of string concatenation: with an empty sys.path[0]
# the old `sys.path[0] + "/datasets/..."` silently became the absolute path
# "/datasets/..." at the filesystem root. os.path.join yields the same path
# for a non-empty base and a working-directory-relative path for an empty one.
DATASET_DIR = os.path.join(sys.path[0], "datasets")

# train_test.csv is split into train / held-out portions by the modelling
# cells; validation.csv is scored as a separate set ("Cancer Results").
data_set = pd.read_csv(os.path.join(DATASET_DIR, "train_test.csv"))
cancer_set = pd.read_csv(os.path.join(DATASET_DIR, "validation.csv"))

# Raw abstract texts, used as classifier inputs.
data_abstracts = list(data_set["Abstract.Note"])
cancer_abstracts = list(cancer_set["Abstract.Note"])

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Hyper-parameter search: for each label column, tune a
# CountVectorizer -> TfidfTransformer -> MultinomialNB pipeline with a grid
# search, then score the selected model on the held-out split and on the
# separate cancer set.
for label_class in ["Resilience", "Biomarkers", "Stressors", "Conditions"]:
    print(label_class)

    # Hold out 10% of the abstracts; the fixed seed keeps the split repeatable.
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        data_abstracts,
        list(data_set[label_class]),
        random_state=42,
        test_size=0.1,
    )

    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-2, 1e-3, 1e-4),
    }
    grid_search = GridSearchCV(clf, parameters, n_jobs=-1)
    grid_search.fit(train_inputs, train_labels)

    # NOTE: best_score_ is GridSearchCV's mean cross-validated score for the
    # winning parameter set, not a score on the full training data.
    print("Best training score", grid_search.best_score_)
    print("Best Parameters", grid_search.best_params_)

    # With the default refit=True, predict() uses the best estimator refit
    # on train_inputs.
    y_pred = grid_search.predict(validation_inputs)
    cancer_pred = grid_search.predict(cancer_abstracts)

    print("Test Results for", label_class)
    held_out_eval = dict(y_true=validation_labels, y_pred=y_pred)
    print("Accuracy", accuracy_score(**held_out_eval))
    print("MM", matthews_corrcoef(**held_out_eval))
    print(classification_report(**held_out_eval))

    print("Cancer Results for", label_class)
    cancer_labels = list(cancer_set[label_class])
    cancer_eval = dict(y_true=cancer_labels, y_pred=cancer_pred)
    print("Accuracy", accuracy_score(**cancer_eval))
    print("MM", matthews_corrcoef(**cancer_eval))
    print(classification_report(**cancer_eval))

Resilience
Best training score 0.7578918845684867
Best Parameters {'clf__alpha': 0.0001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 3)}
Test Results for Resilience
Accuracy 0.7697929354445798
MM 0.46544424369741444
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       567
           1       0.62      0.64      0.63       254

   micro avg       0.77      0.77      0.77       821
   macro avg       0.73      0.73      0.73       821
weighted avg       0.77      0.77      0.77       821

Cancer Results for Resilience
Accuracy 0.7591549295774648
MM 0.19388144602491975
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       589
           1       0.32      0.36      0.34       121

   micro avg       0.76      0.76      0.76       710
   macro avg       0.59      0.60      0.60       710
weighted avg       0.77      0.76      0.77       710

Biomarkers
Best training score 0.75436932664

In [5]:
# Fixed-configuration run: the same vectorizer -> TF-IDF -> naive Bayes
# pipeline, but with hand-set hyper-parameters instead of a grid search.
for label_class in ["Resilience", "Biomarkers", "Stressors", "Conditions"]:
    print(label_class)

    # Same 10% hold-out and seed as the grid-search cell.
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        data_abstracts,
        list(data_set[label_class]),
        random_state=42,
        test_size=0.1,
    )

    clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', MultinomialNB(alpha=1e-3)),
    ])
    clf.fit(train_inputs, train_labels)

    y_pred = clf.predict(validation_inputs)
    cancer_pred = clf.predict(cancer_abstracts)

    print("Test Results for", label_class)
    held_out_eval = dict(y_true=validation_labels, y_pred=y_pred)
    print("Accuracy", accuracy_score(**held_out_eval))
    print("MM", matthews_corrcoef(**held_out_eval))
    print(classification_report(**held_out_eval))

    print("Cancer Results for", label_class)
    cancer_labels = list(cancer_set[label_class])
    cancer_eval = dict(y_true=cancer_labels, y_pred=cancer_pred)
    print("Accuracy", accuracy_score(**cancer_eval))
    print("MM", matthews_corrcoef(**cancer_eval))
    print(classification_report(**cancer_eval))

Resilience
Test Results for Resilience
Accuracy 0.7503045066991474
MM 0.46582922591303416
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       567
           1       0.58      0.74      0.65       254

   micro avg       0.75      0.75      0.75       821
   macro avg       0.72      0.75      0.73       821
weighted avg       0.78      0.75      0.76       821

Cancer Results for Resilience
Accuracy 0.7253521126760564
MM 0.19718048975792785
              precision    recall  f1-score   support

           0       0.87      0.78      0.83       589
           1       0.30      0.45      0.36       121

   micro avg       0.73      0.73      0.73       710
   macro avg       0.58      0.61      0.59       710
weighted avg       0.77      0.73      0.75       710

Biomarkers
Test Results for Biomarkers
Accuracy 0.7454323995127893
MM 0.4980117289293051
              precision    recall  f1-score   support

           0       0.80      0.