#### imports for the project

In [1]:
import nltk
from nltk.corpus import movie_reviews
import random
from prettytable import PrettyTable
import textwrap 
import numpy as np

#### Import excel with pandas Emotions

In [2]:
import pandas as pd
df_emotions = pd.read_excel('Diabetes-classification.xlsx', sheet_name ='Emotions')

# Preparing dataset
x_emotion = df_emotions.loc[:,'discussion_text']
y_emotion = df_emotions.loc[:,'Label']
# removes all duplicates from list 
Labels_emotion = list(dict.fromkeys(y_emotion)) 
# puts discussion_text to a str and tokenize it
raw_text_emotion = df_emotions['discussion_text'].str.cat()
tokens_emotion = nltk.word_tokenize(raw_text_emotion)
text_emotion = nltk.Text(tokens_emotion)

#### Multinominal NB classifer for Emotions

In [3]:
# the reviews will be stored as document pairs of words and category
X_list_of_words = [sentence.split(" ") for sentence in x_emotion]
documents = list(zip(X_list_of_words, y_emotion))

#give random order to the documents
random.shuffle(documents)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents[0:2]:
    feats = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Anticipation
Surprise
+------------------------------------------+--------------+
|             Discussion text              |   Emotion    |
+------------------------------------------+--------------+
| If,that's,the,case,,insulin,deficiency,i | Anticipation |
| sn't,their,problem,,so,more,insulin,shou |              |
| ldn't,be,the,solution..,Adding,insulin,i |              |
| s,no,way,to,deal,with,insulin,resistance |              |
| ,,it's,just,throwing,fuel,on,the,fire.., |              |
| Type,2,diabetes,is,characterized,by,insu |              |
| lin,resistance,actually,,and,that's,very |              |
|                 ,widely                  |              |
|                                          |              |
|                                          |              |
| I,recorded,the,programme,and,finally,wat |   Surprise   |
| ched,it,last,night..,I,was,annoyed,that, |              |
| he,kept,saying,that,T2,diabetes,is,due,t |              |
| o,lifestyle,choi

In [4]:
print('total words from emotion corpus: ', len(text_emotion))

# load all the words in freq distribution
all_words = nltk.FreqDist(w.lower() for w in text_emotion)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words = all_words.most_common(3000)
print('most freq words: ', most_freq_words[100:110])

word_features = [word for (word, count) in most_freq_words]
print('word_features[:25]: ', word_features[:25])


total words from emotion corpus:  276377
most freq words:  [('think', 398), ('other', 398), ('does', 393), ('many', 389), ('only', 387), ('day', 385), ('time', 383), ('much', 380), ('help', 379), ('risk', 376)]
word_features[:25]:  ['i', '..', 'and', 'the', 'to', 'a', 'of', 'diabetes', '2', 'type', 'is', 'my', 'that', 'have', 'in', 'it', 'with', 'for', 'you', 'was', 'on', 'as', 'not', 'but', ')']


In [5]:
def get_document_features(document, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(document)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function
#words_doc = movie_reviews.words('pos/cv957_8737.txt')
words_doc = text_emotion

feat_dict = get_document_features(words_doc, word_features)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(i)': True, 'contains(..)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(my)': True, 'contains(that)': True, 'contains(have)': True, 'contains(in)': True, 'contains(it)': True, 'contains(with)': True, 'contains(for)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(not)': True, 'contains(but)': True, 'contains())': True}


In [6]:
#obtain feature sets for all movie reviews
featuresets = [(get_document_features(d,word_features), c) for (d,c) in documents]

#split into train and test set (you can experiment with distribution here) 100 - 100 og
train_set, test_set = featuresets[200:], featuresets[:100]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set)) 

classifier.show_most_informative_features(20)


0.2
Most Informative Features
         contains(bacon) = True            Anger : Trust  =     22.5 : 1.0
          contains(hold) = True            Anger : Trust  =     22.5 : 1.0
    contains(production) = True            Anger : Trust  =     22.5 : 1.0
          contains(rare) = True            Anger : Trust  =     22.5 : 1.0
         contains(shift) = True            Anger : Trust  =     22.5 : 1.0
            contains(38) = True             Fear : Trust  =     20.7 : 1.0
            contains(ie) = True             Fear : Trust  =     20.7 : 1.0
           contains(2nd) = True            Anger : Antici =     18.0 : 1.0
         contains(story) = True            Anger : Antici =     18.0 : 1.0
        contains(stated) = True           Surpri : Trust  =     17.7 : 1.0
  contains(intervention) = True             Fear : Antici =     16.6 : 1.0
            contains(bc) = True             Fear : Trust  =     14.8 : 1.0
         contains(crazy) = True             Fear : Trust  =     14.8 :

In [13]:

sample_review = "I think i got rid of my sickness and feel happy"

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))

result of sample review:  Joy


#### loads in Patient Journey labels

In [15]:
import pandas as pd
df_patient = pd.read_excel('Diabetes-classification.xlsx', sheet_name='Patient-journey') # Reads in excel

# Preparing dataset
x_journey = df_patient.loc[:,'discussion_text']
y_journey = df_patient.loc[:,'Label']
# removes all duplicates from list 
Labels_journey = list(dict.fromkeys(y_journey)) 
# puts discussion_text to a str and tokenize it
raw_text_journey = df_patient['discussion_text'].str.cat()
tokens_journey = nltk.word_tokenize(raw_text_journey)
text_journey = nltk.Text(tokens_journey)

In [16]:
# the reviews will be stored as document pairs of words and category
X_list_of_words_journey = [sentence.split(" ") for sentence in x_journey]
documents_journey = list(zip(X_list_of_words_journey, y_journey))

#give random order to the documents
random.shuffle(documents_journey)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents_journey[0:2]:
    feats = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Clinical Treatment
Clinical Treatment
+------------------------------------------+--------------------+
|             Discussion text              |      Emotion       |
+------------------------------------------+--------------------+
| Here,is,a,link,to,a,chart,that,explains, | Clinical Treatment |
| the,different,oral,meds,used,to,treat,Ty |                    |
| pe,2,Diabetes:,http://www.joslin.org/inf |                    |
| o/oral_diabetes_medications_summary_char |                    |
| t.html,You,will,most,likely,be,started,o |                    |
| n,metformin..,If,you,are,worried,about,s |                    |
| tomach,issues,,ask,for,the,extended,rele |                    |
|         ase,(ER),version..,Carol         |                    |
|                                          |                    |
|                                          |                    |
| Health,care,professionals,who,are,intere | Clinical Treatment |
| sted,in,educating,the,public,about,b

In [17]:
print('total words from emotion corpus: ', len(text_journey))

# load all the words in freq distribution
all_words_journey = nltk.FreqDist(w.lower() for w in text_journey)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words_journey = all_words_journey.most_common(6000)
print('most freq words: ', most_freq_words_journey[100:110])

word_features_journey = [word for (word, count) in most_freq_words_journey]
print('word_features[:25]: ', word_features_journey[:25])


total words from emotion corpus:  116721
most freq words:  [('his', 165), ('exercise', 165), ('only', 159), ('she', 159), ('time', 159), ('well', 157), ('any', 157), ('glucose', 156), ('then', 155), ('disease', 154)]
word_features[:25]:  ['..', 'i', 'and', 'the', 'to', 'a', 'of', 'diabetes', '2', 'type', 'is', 'in', 'my', 'that', 'with', 'have', 'for', 'it', 'you', 'was', 'on', 'as', 'are', ')', '.']


In [11]:
def get_document_features_journey(documents_journey, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(documents_journey)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function
#words_doc = movie_reviews.words('pos/cv957_8737.txt')
words_doc = text_journey

feat_dict = get_document_features(words_doc, word_features_journey)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(i)': True, 'contains(..)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(my)': True, 'contains(that)': True, 'contains(have)': True, 'contains(in)': True, 'contains(it)': True, 'contains(with)': True, 'contains(for)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(not)': True, 'contains(but)': True, 'contains())': True}


In [18]:
#obtain feature sets for all movie reviews
featuresets_journey = [(get_document_features_journey(d,word_features_journey), c) for (d,c) in documents_journey]

#split into train and test set (you can experiment with distribution here) 100 - 100 og
train_set_journey, test_set_journey = featuresets_journey[200:], featuresets_journey[:100]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set_journey)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set_journey)) 

classifier.show_most_informative_features(20)


0.41
Most Informative Features
      contains(exercise) = True           Living : Living =     45.0 : 1.0
    contains(colleagues) = True           Altern : Living =     42.0 : 1.0
    contains(treatments) = True           Altern : Living =     35.0 : 1.0
            contains(35) = True           Altern : Living =     30.0 : 1.0
        contains(acidic) = True           Altern : Living =     30.0 : 1.0
    contains(additional) = True           Altern : Living =     30.0 : 1.0
         contains(adobe) = True           Altern : Living =     30.0 : 1.0
        contains(agents) = True           Altern : Living =     30.0 : 1.0
        contains(bitter) = True           Altern : Living =     30.0 : 1.0
        contains(bought) = True           Altern : Living =     30.0 : 1.0
        contains(called) = True           Altern : Living =     30.0 : 1.0
    contains(conclusion) = True           Altern : Living =     30.0 : 1.0
   contains(development) = True           Altern : Living =     30.0 

In [20]:
sample_review = "My doctor told me to start running and go on a diet"

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features_journey(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))

result of sample review:  Living with diabetes - Nutrition
