#### imports for the project

In [15]:
import nltk
from nltk.corpus import movie_reviews
import random
from prettytable import PrettyTable
import textwrap 
import numpy as np
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Import excel with pandas Emotions

In [45]:
import pandas as pd
df_emotions = pd.read_excel('Diabetes-classification.xlsx', sheet_name ='Emotions')

# Preparing dataset
x_emotion = df_emotions.loc[:,'discussion_text']
y_emotion = df_emotions.loc[:,'Label']
new_dataframe = list(zip(y_emotion, x_emotion))
new_df = pd.DataFrame(new_dataframe, columns=['Label', 'Discussion_text'])

# removes all duplicates from list 
Labels_emotion = list(dict.fromkeys(y_emotion)) 
# puts discussion_text to a str and tokenize it
raw_text_emotion = df_emotions['discussion_text'].str.cat()
tokens_emotion = nltk.word_tokenize(raw_text_emotion)
tokens_emotion_filtered = [word for word in tokens_emotion if word.isalnum()]
text_emotion = nltk.Text(tokens_emotion_filtered)

#### Multinominal NB classifer for Emotions

In [5]:
# the reviews will be stored as document pairs of words and category
X_list_of_words = [sentence.split(" ") for sentence in x_emotion]
documents = list(zip(X_list_of_words, y_emotion))

#give random order to the documents
random.shuffle(documents)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents[0:2]:
    feats = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Sadness
Joy
+------------------------------------------+---------+
|             Discussion text              | Emotion |
+------------------------------------------+---------+
| I,want,to,echo,Lyndol's,suggestion,to,ta | Sadness |
| lk,to,your,doctor,about,metformin--that' |         |
| s,what,most,T2s,start,out,with..,Do,you, |         |
| know,if,there's,a,particular,reason,that |         |
| ,your,doctor,prescribed,glucotrol?.,Here |         |
| 's,a,link,to,a,chart,with,info,about,var |         |
| ious,T2,oral,meds:,http://www.joslin.org |         |
| /info/oral_diabetes_medicatio,ns_summary |         |
|         _chart.html,I,don't,know         |         |
|                                          |         |
|                                          |         |
| For,anyone,who,tells,you,that,the,brain, |   Joy   |
| needs,carbohydrates,as,its,source,of,ene |         |
| rgy,,tell,them,that,research,has,shown,t |         |
| hat,most,of,the,brain,can,switch,to,usin |         

In [12]:
print('total words from emotion corpus: ', len(text_emotion))

# load all the words in freq distribution
all_words = nltk.FreqDist(w.lower() for w in text_emotion)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words = all_words.most_common(10000)
print('most freq words: ', most_freq_words[500:600])

word_features = [word for (word, count) in most_freq_words]
print('word_features[:25]: ', word_features[:25])


total words from emotion corpus:  248961
most freq words:  [('women', 58), ('needs', 58), ('true', 57), ('plan', 57), ('appointment', 57), ('cure', 57), ('8', 57), ('major', 56), ('oral', 56), ('energy', 56), ('active', 56), ('stay', 56), ('newly', 56), ('above', 56), ('instead', 56), ('water', 56), ('real', 55), ('love', 55), ('kind', 55), ('site', 55), ('mg', 55), ('grains', 54), ('further', 54), ('night', 54), ('become', 54), ('mother', 54), ('learn', 54), ('factor', 54), ('add', 53), ('goes', 53), ('example', 53), ('readings', 53), ('taken', 53), ('current', 53), ('reducing', 53), ('personal', 53), ('prescribed', 53), ('mom', 53), ('doc', 52), ('members', 52), ('causing', 52), ('questions', 52), ('interesting', 52), ('bmi', 52), ('thyroid', 52), ('cases', 52), ('association', 51), ('kidney', 51), ('beta', 51), ('asked', 51), ('must', 51), ('50', 51), ('hospital', 51), ('small', 51), ('huge', 51), ('talk', 50), ('longer', 50), ('generally', 50), ('habits', 50), ('nhs', 50), ('syndro

In [13]:
def get_document_features(document, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(document)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function
#words_doc = movie_reviews.words('pos/cv957_8737.txt')
words_doc = text_emotion

feat_dict = get_document_features(words_doc, word_features)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(i)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(my)': True, 'contains(that)': True, 'contains(have)': True, 'contains(in)': True, 'contains(it)': True, 'contains(with)': True, 'contains(for)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(not)': True, 'contains(but)': True, 'contains(this)': True, 'contains(are)': True}


In [14]:
#obtain feature sets for all movie reviews
featuresets = [(get_document_features(d,word_features), c) for (d,c) in documents]

#split into train and test set (you can experiment with distribution here) 100 - 100 og
train_set, test_set = featuresets[150:], featuresets[:150]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set)) 

classifier.show_most_informative_features(20)


0.18
Most Informative Features
         contains(bacon) = True            Anger : Trust  =     23.4 : 1.0
          contains(hold) = True            Anger : Trust  =     23.4 : 1.0
         contains(lover) = True            Anger : Trust  =     23.4 : 1.0
    contains(production) = True            Anger : Trust  =     23.4 : 1.0
          contains(rare) = True            Anger : Trust  =     23.4 : 1.0
         contains(shift) = True            Anger : Trust  =     23.4 : 1.0
        contains(stated) = True             Fear : Trust  =     20.9 : 1.0
           contains(2nd) = True            Anger : Antici =     18.8 : 1.0
    contains(incredibly) = True            Anger : Antici =     18.8 : 1.0
    contains(motivation) = True            Anger : Antici =     18.8 : 1.0
    contains(responding) = True            Anger : Antici =     18.8 : 1.0
          contains(bike) = True             Fear : Antici =     16.7 : 1.0
            contains(ie) = True             Fear : Antici =     16.7 

In [7]:

sample_review = "I think i got rid of my sickness and feel happy"

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))

result of sample review:  Joy


#### loads in Patient Journey labels

In [8]:
import pandas as pd
df_patient = pd.read_excel('Diabetes-classification.xlsx', sheet_name='Patient-journey') # Reads in excel

# Preparing dataset
x_journey = df_patient.loc[:,'discussion_text']
y_journey = df_patient.loc[:,'Label']
# removes all duplicates from list 
Labels_journey = list(dict.fromkeys(y_journey)) 
# puts discussion_text to a str and tokenize it
raw_text_journey = df_patient['discussion_text'].str.cat()
tokens_journey = nltk.word_tokenize(raw_text_journey)
text_journey = nltk.Text(tokens_journey)

In [9]:
# the reviews will be stored as document pairs of words and category
X_list_of_words_journey = [sentence.split(" ") for sentence in x_journey]
documents_journey = list(zip(X_list_of_words_journey, y_journey))

#give random order to the documents
random.shuffle(documents_journey)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents_journey[0:2]:
    feats = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Clinical Treatment
Clinical Treatment
+------------------------------------------+--------------------+
|             Discussion text              |      Emotion       |
+------------------------------------------+--------------------+
| Hello,i,am,new,here..,I,want,to,tell,you | Clinical Treatment |
| ,that,2,members,in,my,family,diagonesed, |                    |
| by,type,2,diabetes..,what,precautions,we |                    |
| ,need,to,concern,along,with,medical,pres |                    |
|                cription,?                |                    |
|                                          |                    |
|                                          |                    |
| Hello,everyone,,My,name,is,Nate,,I,am,a, | Clinical Treatment |
| 25,year,old,Male,and,I,was,diagnosed,wit |                    |
| h,Type,2,Diabetes,when,I,was,14..,I,was, |                    |
| fairly,heavy,at,that,age,(250Lbs),and,ha |                    |
| d,an,A1C,around,12..,I,was,put,on,in

In [10]:
print('total words from emotion corpus: ', len(text_journey))

# load all the words in freq distribution
all_words_journey = nltk.FreqDist(w.lower() for w in text_journey)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words_journey = all_words_journey.most_common(6000)
print('most freq words: ', most_freq_words_journey[100:110])

word_features_journey = [word for (word, count) in most_freq_words_journey]
print('word_features[:25]: ', word_features_journey[:25])


total words from emotion corpus:  116721
most freq words:  [('his', 165), ('exercise', 165), ('only', 159), ('she', 159), ('time', 159), ('well', 157), ('any', 157), ('glucose', 156), ('then', 155), ('disease', 154)]
word_features[:25]:  ['..', 'i', 'and', 'the', 'to', 'a', 'of', 'diabetes', '2', 'type', 'is', 'in', 'my', 'that', 'with', 'have', 'for', 'it', 'you', 'was', 'on', 'as', 'are', ')', '.']


In [11]:
def get_document_features_journey(documents_journey, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(documents_journey)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function
#words_doc = movie_reviews.words('pos/cv957_8737.txt')
words_doc = text_journey

feat_dict = get_document_features(words_doc, word_features_journey)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(..)': True, 'contains(i)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(in)': True, 'contains(my)': True, 'contains(that)': True, 'contains(with)': True, 'contains(have)': True, 'contains(for)': True, 'contains(it)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(are)': True, 'contains())': True, 'contains(.)': True}


In [12]:
#obtain feature sets for all movie reviews
featuresets_journey = [(get_document_features_journey(d,word_features_journey), c) for (d,c) in documents_journey]

#split into train and test set (you can experiment with distribution here) 100 - 100 og
train_set_journey, test_set_journey = featuresets_journey[200:], featuresets_journey[:100]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set_journey)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set_journey)) 

classifier.show_most_informative_features(20)


0.47
Most Informative Features
      contains(exercise) = True           Living : Living =     46.0 : 1.0
    contains(colleagues) = True           Altern : Living =     35.7 : 1.0
       contains(studies) = True           Altern : Living =     29.4 : 1.0
    contains(treatments) = True           Altern : Living =     29.4 : 1.0
            contains(35) = True           Altern : Living =     25.5 : 1.0
        contains(accept) = True           Altern : Living =     25.5 : 1.0
        contains(acidic) = True           Altern : Living =     25.5 : 1.0
      contains(addition) = True           Altern : Living =     25.5 : 1.0
    contains(additional) = True           Altern : Living =     25.5 : 1.0
        contains(agents) = True           Altern : Living =     25.5 : 1.0
      contains(approved) = True           Altern : Living =     25.5 : 1.0
        contains(bitter) = True           Altern : Living =     25.5 : 1.0
         contains(bones) = True           Altern : Living =     25.5 

In [13]:
sample_review = "My doctor told me to start running and go on a diet"

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features_journey(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))

result of sample review:  Living with diabetes - Nutrition
