#### imports for the project

In [1]:
import nltk
from nltk.corpus import movie_reviews
import random
from prettytable import PrettyTable
import textwrap 
import numpy as np

#### Import excel with pandas Emotions

In [8]:
import pandas as pd
df_emotions = pd.read_excel('Diabetes-classification.xlsx', sheet_name ='Emotions')

# Preparing dataset
X = df_emotions.loc[:,'discussion_text']
y = df_emotions.loc[:,'Label']
# removes all duplicates from list 
Labels = list(dict.fromkeys(y)) 
# puts discussion_text to a str and tokenize it
raw_text = df_emotions['discussion_text'].str.cat()
tokens = nltk.word_tokenize(raw_text)
text = nltk.Text(tokens)

#### Multinominal NB classifer for Emotions

In [3]:
# the reviews will be stored as document pairs of words and category
X_list_of_words = [sentence.split(" ") for sentence in X]
documents = list(zip(X_list_of_words, y))

#give random order to the documents
random.shuffle(documents)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents[0:2]:
    feats = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Sadness
Trust
+------------------------------------------+---------+
|             Discussion text              | Emotion |
+------------------------------------------+---------+
| I'm,such,a,geek..,Ever,since,I,watched,a | Sadness |
| ,video,describing,the,biochemistry,of,fr |         |
| uctose,in,the,body,I,rarely,give,juice.. |         |
| ,Fructose,triggers,a,cascade,of,reaction |         |
| s,that,lead,to,obesity,,cardiovascular,d |         |
| isease,,type,2,diabetes,,among,other,thi |         |
|  ngs....,and,juice,has,way,too,much,of   |         |
|                                          |         |
|                                          |         |
| But,when,foods,are,high,in,sugar,,you're |  Trust  |
| ,body,can't,use,it,all,at,once..,Sugar,g |         |
| ets,broken,down,too,quickly..,Your,pancr |         |
| eas,releases,extra,insulin,,which,you,be |         |
| come,desensitized,to,over,time,(type,2,d |         |
| iabetes)..,You,get,a,sugar,rush,,and,the |       

In [10]:
print('total words from emotion corpus: ', len(text))

# load all the words in freq distribution
all_words = nltk.FreqDist(w.lower() for w in text)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words = all_words.most_common(2000)
print('most freq words: ', most_freq_words[100:110])

word_features = [word for (word, count) in most_freq_words]
print('word_features[:25]: ', word_features[:25])


total words from emotion corpus:  276377
most freq words:  [('think', 398), ('other', 398), ('does', 393), ('many', 389), ('only', 387), ('day', 385), ('time', 383), ('much', 380), ('help', 379), ('risk', 376)]
word_features[:25]:  ['i', '..', 'and', 'the', 'to', 'a', 'of', 'diabetes', '2', 'type', 'is', 'my', 'that', 'have', 'in', 'it', 'with', 'for', 'you', 'was', 'on', 'as', 'not', 'but', ')']


In [11]:
def get_document_features(document, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(document)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function
#words_doc = movie_reviews.words('pos/cv957_8737.txt')
words_doc = text

feat_dict = get_document_features(words_doc, word_features)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(i)': True, 'contains(..)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(my)': True, 'contains(that)': True, 'contains(have)': True, 'contains(in)': True, 'contains(it)': True, 'contains(with)': True, 'contains(for)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(not)': True, 'contains(but)': True, 'contains())': True}


In [12]:
#obtain feature sets for all movie reviews
featuresets = [(get_document_features(d,word_features), c) for (d,c) in documents]

#split into train and test set (you can experiment with distribution here)
train_set, test_set = featuresets[100:], featuresets[:100]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set)) 

classifier.show_most_informative_features(20)


0.18
Most Informative Features
         contains(bacon) = True            Anger : Trust  =     23.2 : 1.0
          contains(hold) = True            Anger : Trust  =     23.2 : 1.0
    contains(production) = True            Anger : Trust  =     23.2 : 1.0
          contains(rare) = True            Anger : Trust  =     23.2 : 1.0
         contains(crazy) = True             Fear : Trust  =     20.9 : 1.0
        contains(stated) = True             Fear : Trust  =     20.9 : 1.0
         contains(story) = True            Anger : Antici =     18.5 : 1.0
            contains(ie) = True             Fear : Antici =     16.6 : 1.0
  contains(intervention) = True             Fear : Antici =     16.6 : 1.0
       contains(suppose) = True             Fear : Trust  =     14.9 : 1.0
          contains(2006) = True            Anger : Trust  =     13.9 : 1.0
            contains(32) = True            Anger : Trust  =     13.9 : 1.0
            contains(49) = True            Anger : Trust  =     13.9 

In [13]:

sample_review = 'I think I got rid of my sicknesss and feel happy'

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))




result of sample review:  Joy


#### loads in Patient Journey labels

In [None]:
import pandas as pd
df_patient = pd.read_excel('Diabetes-classification.xlsx', sheet_name='Patient-journey') # Reads in excel
#print(df_patient.head(), df_patient.tail())