In [1]:
import pandas as pd

In [2]:
# read the data into a pandas dataframe
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfnonpro = data2df('HealthProNonPro/NonPro/', 0) 
dfpro = data2df('HealthProNonPro/Pro/', 1) 

df = pd.concat([dfnonpro,dfpro], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
1372,ans802.txt,It seems that previously you had an intercosta...,1
771,a61367.txt,the best way to relax is to put on quiet music...,0
1428,ans70.txt,Achilles tendonitis is an inflammation of the ...,1
133,ans377.txt,Home pregnancy test and especially EPTs are fa...,1
331,ans1100.txt,Taking stool softeners and milk of magnesia at...,1
1649,ans1493.txt,The symptoms that you are experiencing such as...,1
1112,ans1143.txt,Thank you for your question. Given your condit...,1
1329,ans1431.txt,The pain in one side of the tongue could be a ...,1
1055,a24793.txt,There is quite a history on them. Please check...,0
803,a31632.txt,Already Answered,0


In [3]:
# setup the data
X, y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
def preprocess(text):
    # replace one or more white-space characters with a space
    regex = re.compile(r"\s+")                               
    text = regex.sub(' ', text)    
    # lower case
    text = text.lower()          
    # remove digits and punctuation
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub(' ', text)           
    # remove stop words
    sw = stopwords.words('english')
    text = text.split()                                              
    text = ' '.join([w for w in text if w not in sw]) 
    # remove short words
    ' '.join([w for w in text.split() if len(w) >= 2])
    # stem
    # text = ' '.join([(PorterStemmer()).stem(w) for w in text.split()])
    # lemmatize
    text = ' '.join([(WordNetLemmatizer()).lemmatize(w) for w in text.split()]) 
    return text

Xtrain = [preprocess(text) for text in Xtrain]
Xtest = [preprocess(text) for text in Xtest]

In [5]:

def custom_tokenizer(doc):

    # clean up text
    tokens = [token.lemma_.lower() # lemmatize and lower-case 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are 2 or more characters long
                                    #token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve specific pos
                                    #token.text in nlp.vocab and # check if token in vocab
                                    #token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop # get rid of tokens that are stop words
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
nlpXtrain = nlp.pipe(Xtrain)
nlpXtest = nlp.pipe(Xtest)
Xtrain = [custom_tokenizer(doc) for doc in nlpXtrain]
Xtest = [custom_tokenizer(doc) for doc in nlpXtest]


In [6]:
# setup the preprocessing->model pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

  
clf = Pipeline(steps=[
    ('tfidf',TfidfVectorizer()),
    ('nb', MultinomialNB())
])


In [7]:
# setup grid search
import numpy as np
from sklearn.model_selection import GridSearchCV
param_grid = {
    'tfidf__sublinear_tf': [True, False], 
    'nb__alpha':np.linspace(1.0,1.5)
}
gscv = GridSearchCV(clf, param_grid, cv=4, return_train_score=False)

In [8]:
gscv.fit(Xtrain, ytrain)

print ("-"*100)
print(gscv.best_estimator_, "\n")
print ("-"*100)
print(gscv.best_score_, "\n")
print ("-"*100)
print(gscv.best_params_, "\n")
print ("-"*100)
print(gscv.cv_results_, "\n")
print ("-"*100)

----------------------------------------------------------------------------------------------------
Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.01020408

In [9]:
# predict and evaluate best_estimator_ on test data

ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9290586630286494
[[312  51]
 [  1 369]]
              precision    recall  f1-score   support

           0       1.00      0.86      0.92       363
           1       0.88      1.00      0.93       370

    accuracy                           0.93       733
   macro avg       0.94      0.93      0.93       733
weighted avg       0.94      0.93      0.93       733



In [13]:
from sklearn import metrics

TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()
overall_accuracy=round((TN+TP)/(TN+FP+FN+TP),2)
print("Overall Accuracy:",overall_accuracy)

precision1=round(TP/(TP+FP),2)
print("Precision for class 1:",precision1)

precision0=round(TN/(TN+FN),2)
print("Precision for class 0:",precision0)

recall1=round(TP/(TP+FN),2)
print("Recall for class 1:",recall1)

recall0=round(TN/(TN+FP),2)
print("Recall for class 0:",recall0)

F1Score1=round((2*precision1*recall1)/(precision1+recall1),2)
print("F1 Score for class 1:",F1Score1)

F1Score0=round((2*precision0*recall0)/(precision0+recall0),2)
print("F1 Score for class 0:",F1Score0)


Overall Accuracy: 0.93
Precision for class 1: 0.88
Precision for class 0: 1.0
Recall for class 1: 1.0
Recall for class 0: 0.86
F1 Score for class 1: 0.94
F1 Score for class 0: 0.92
