#### imports for the project

In [1]:
import nltk
import random
from prettytable import PrettyTable
import textwrap 
import numpy as np
import string
from nltk.corpus import stopwords

from nltk.classify import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier

nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Import excel with pandas Emotions

In [2]:
import pandas as pd
df_emotions = pd.read_excel('Diabetes-classification.xlsx', sheet_name ='Emotions')

# Preparing dataset
x_emotion = df_emotions.loc[:,'discussion_text']
y_emotion = df_emotions.loc[:,'Label']

# removes all duplicates from list 
Labels_emotion = list(dict.fromkeys(y_emotion))

#Remove stopwords
lim_punc = [char for char in string.punctuation if char in "&#^_"]
nopunc = [char for char in x_emotion if char not in lim_punc]
nopunc = ''.join(nopunc)

other_stop=['•','...in','...the','...you\'ve','–','—','-','⋆','...','....','..','C.','c','|','...The','...The','...When','...A','C','+','1','2','3','4','5','6','7','8','9','10', '2016',  'speak','also', 'seen','[5].',  'using', 'get',  'instead',  "that's",  '......','may', 'e', '...it', 'puts', '...over', '[✯]','happens', "they're",'hwo',  '...a', 'called',  '50s','c;', '20',  'per', 'however,','it,', 'yet', 'one', 'bs,', 'ms,', 'sr.',  '...taking',  'may', '...of', 'course,', 'get', 'likely', 'no,']

ext_stopwords=stopwords.words('english')+other_stop
clean_words = [word for word in nopunc.split() if word.lower() not in ext_stopwords]
# puts discussion_text to a str and tokenize it
#raw_text_emotion = df_emotions['discussion_text'].str.cat()
raw_text_emotion = df_emotions['discussion_text'].str.cat()

tokens_emotion = nltk.word_tokenize(raw_text_emotion)
tokens_emotion_filtered = [clean_words for clean_words in tokens_emotion if clean_words]
text_emotion = nltk.Text(tokens_emotion_filtered)

#### Multinominal NB classifer for Emotions

In [3]:
# the reviews will be stored as document pairs of words and category
X_list_of_words = [sentence.split(" ") for sentence in x_emotion]
documents = list(zip(X_list_of_words, y_emotion))

#give random order to the documents
random.Random(5).shuffle(documents)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents[0:2]:
    feats = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Anticipation
Surprise
+------------------------------------------+--------------+
|             Discussion text              |   Emotion    |
+------------------------------------------+--------------+
| ,@GuidoGuy202,Cutting,back,on,carbs,shou | Anticipation |
| ld,help,you,manage,your,diabetes,better, |              |
| but,as,a,t1,on,insulin,you,have,to,be,ca |              |
| reful,to,do,this,gradually,reducing,your |              |
| ,insulin,as,you,go,along,otherwise,you,r |              |
| un,the,risk,of,hypoglycaemic,attacks..,T |              |
|       here,are,a,number,of,t1's,on       |              |
|                                          |              |
|                                          |              |
| Hi,there,,I,have,had,type,2,diabetes,for |   Surprise   |
| ,about,20,years..,Not,always,in,control, |              |
|    ..,but,doing,better,with,insulin.     |              |
|                                          |              |
|                 

In [4]:
print('total words from emotion corpus: ', len(text_emotion))

# load all the words in freq distribution
all_words = nltk.FreqDist(w.lower() for w in text_emotion)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words = all_words.most_common(6000)
print('most freq words: ', most_freq_words[100:110])

word_features = [word for (word, count) in most_freq_words]
print('word_features[:25]: ', word_features[:25])


total words from emotion corpus:  276377
most freq words:  [('think', 398), ('other', 398), ('does', 393), ('many', 389), ('only', 387), ('day', 385), ('time', 383), ('much', 380), ('help', 379), ('risk', 376)]
word_features[:25]:  ['i', '..', 'and', 'the', 'to', 'a', 'of', 'diabetes', '2', 'type', 'is', 'my', 'that', 'have', 'in', 'it', 'with', 'for', 'you', 'was', 'on', 'as', 'not', 'but', ')']


In [5]:
def get_document_features(document, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(document)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


# test code for the above function
#words_doc = movie_reviews.words('pos/cv957_8737.txt')
words_doc = text_emotion

feat_dict = get_document_features(words_doc, word_features)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(i)': True, 'contains(..)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(my)': True, 'contains(that)': True, 'contains(have)': True, 'contains(in)': True, 'contains(it)': True, 'contains(with)': True, 'contains(for)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(not)': True, 'contains(but)': True, 'contains())': True}


In [6]:
#obtain feature set
featuresets = [(get_document_features(d,word_features), c) for (d,c) in documents]

#split into train and test set (you can experiment with distribution here) 100 - 100 og
train_set, test_set = featuresets[100:1000], featuresets[:100]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set)) 

classifier.show_most_informative_features(20)


0.32
Most Informative Features
          contains(half) = True            Anger : Trust  =     22.2 : 1.0
         contains(bacon) = True            Anger : Antici =     16.3 : 1.0
      contains(glycemic) = True            Anger : Antici =     16.3 : 1.0
     contains(increases) = True            Anger : Antici =     16.3 : 1.0
        contains(joined) = True            Anger : Antici =     16.3 : 1.0
        contains(pretty) = True            Anger : Trust  =     13.3 : 1.0
      contains(response) = True            Anger : Trust  =     13.3 : 1.0
         contains(stone) = True            Anger : Trust  =     13.3 : 1.0
        contains(type-2) = True            Anger : Trust  =     13.3 : 1.0
            contains(34) = True            Anger : Trust  =     13.3 : 1.0
            contains(44) = True            Anger : Trust  =     13.3 : 1.0
            contains(48) = True            Anger : Trust  =     13.3 : 1.0
        contains(agency) = True            Anger : Trust  =     13.3 

In [11]:
from collections import defaultdict
refsets = defaultdict(set)
testsets = defaultdict(set)
labels = []
tests = []
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
    labels.append(label)
    tests.append(observed)

print(nltk.ConfusionMatrix(labels, tests))

             |     A                   |
             |     n                   |
             |     t                   |
             |     i                   |
             |     c              S    |
             |     i  D        S  u    |
             |     p  i        a  r    |
             |  A  a  s        d  p  T |
             |  n  t  g  F     n  r  r |
             |  g  i  u  e  J  e  i  u |
             |  e  o  s  a  o  s  s  s |
             |  r  n  t  r  y  s  e  t |
-------------+-------------------------+
       Anger | <.> .  .  .  .  .  .  1 |
Anticipation |  . <9> .  .  1  2  . 20 |
     Disgust |  .  5 <.> .  .  .  .  5 |
        Fear |  .  2  . <.> .  .  .  1 |
         Joy |  .  1  .  1 <.> .  .  5 |
     Sadness |  .  3  .  .  . <.> .  6 |
    Surprise |  .  2  .  .  .  . <.> 3 |
       Trust |  .  9  2  .  .  2  .<20>|
-------------+-------------------------+
(row = reference; col = test)



In [7]:

sample_review = "My sickness got worse, and the doctors won't do anything"

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))

result of sample review:  Joy


#### loads in Patient Journey labels

In [7]:
import pandas as pd
df_patient = pd.read_excel('Diabetes-classification.xlsx', sheet_name='Patient-journey') # Reads in excel

# Preparing dataset
x_journey = df_patient.loc[:,'discussion_text']
y_journey = df_patient.loc[:,'Label']
# removes all duplicates from list 
Labels_journey = list(dict.fromkeys(y_journey)) 
#stopwords
lim_punc_patient = [char for char in string.punctuation if char in "&#^_"]
nopunc_patient = [char for char in x_journey if char not in lim_punc_patient]
nopunc_patient = ''.join(nopunc_patient)

ext_stopwords_patient=stopwords.words('english')+other_stop
clean_words = [word for word in nopunc_patient.split() if word.lower() not in ext_stopwords_patient]

# puts discussion_text to a str and tokenize it
raw_text_journey = df_patient['discussion_text'].str.cat()
tokens_journey = nltk.word_tokenize(raw_text_journey)
tokens_emotion_filtered = [clean_words for clean_words in tokens_journey if clean_words.isalnum()]
text_journey = nltk.Text(tokens_journey)

In [8]:
# the reviews will be stored as document pairs of words and category
X_list_of_words_journey = [sentence.split(" ") for sentence in x_journey]
documents_journey = list(zip(X_list_of_words_journey, y_journey))

#give random order to the documents
random.shuffle(documents_journey)

tab = PrettyTable(['Discussion text', 'Emotion'])
tab.horizontal_char = '-'

for (doc, cat) in documents_journey[0:2]:
    feats_journey = textwrap.fill(','.join(doc[:50]), width=40)
    tab.add_row([ feats_journey, cat])
    tab.add_row([ '\n', '\n'])
    print(cat)

print(tab)


Living with diabetes - Exercise
Undiagnosed
+------------------------------------------+---------------------------------+
|             Discussion text              |             Emotion             |
+------------------------------------------+---------------------------------+
| 38,Help,-,I,am,really,struggling,to,cont | Living with diabetes - Exercise |
| rol,my,Type,2,with,diet,alone,,and,am,in |                                 |
| ,a,wheelchair,so,can't,exercise,much..,M |                                 |
| y,last,Hba1c,was,36,(5.4)..,I,know,this, |                                 |
| is,a,good,reading,,but,my,weight,keeps,g |                                 |
|             oing,up,even,so.             |                                 |
|                                          |                                 |
|                                          |                                 |
| His,claims,to,curing,diabetes,aren't,bac |           Undiagnosed           |
| ked,up

In [9]:
print('total words from emotion corpus: ', len(text_journey))

# load all the words in freq distribution
all_words_journey = nltk.FreqDist(w.lower() for w in text_journey)

#construct a list of the 2000 most frequent words in the overall corpus (you can try with other numbers as well)
most_freq_words_journey = all_words_journey.most_common(6000)
print('most freq words: ', most_freq_words_journey[100:110])

word_features_journey = [word for (word, count) in most_freq_words_journey]
print('word_features[:25]: ', word_features_journey[:25])


total words from emotion corpus:  116721
most freq words:  [('his', 165), ('exercise', 165), ('only', 159), ('she', 159), ('time', 159), ('well', 157), ('any', 157), ('glucose', 156), ('then', 155), ('disease', 154)]
word_features[:25]:  ['..', 'i', 'and', 'the', 'to', 'a', 'of', 'diabetes', '2', 'type', 'is', 'in', 'my', 'that', 'with', 'have', 'for', 'it', 'you', 'was', 'on', 'as', 'are', ')', '.']


In [10]:
def get_document_features_journey(documents_journey, doc_features):
    """
        This function will convert given document into a feature set.
        Note that we need to add the feature set that is relevant to the document we are inputting
        
    """
    #checking whether a word occurs in a set is much faster than checking whether it occurs in a list 
    document_words = set(documents_journey)
    features = {}
    
    #the feaures dict will consist of words as keys and boolean value of whether they exist in the document
    for word in doc_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

words_doc = text_journey

feat_dict = get_document_features(words_doc, word_features_journey)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}
print('transformed document features, printing the first 25 features \n\n', feat_dict_25)



transformed document features, printing the first 25 features 

 {'contains(..)': True, 'contains(i)': True, 'contains(and)': True, 'contains(the)': True, 'contains(to)': True, 'contains(a)': True, 'contains(of)': True, 'contains(diabetes)': True, 'contains(2)': True, 'contains(type)': True, 'contains(is)': True, 'contains(in)': True, 'contains(my)': True, 'contains(that)': True, 'contains(with)': True, 'contains(have)': True, 'contains(for)': True, 'contains(it)': True, 'contains(you)': True, 'contains(was)': True, 'contains(on)': True, 'contains(as)': True, 'contains(are)': True, 'contains())': True, 'contains(.)': True}


In [11]:
#obtain feature sets for all movie reviews
featuresets_journey = [(get_document_features_journey(d,word_features_journey), c) for (d,c) in documents_journey]

#split into train and test set (you can experiment with distribution here) 100 - 100 og
train_set_journey, test_set_journey = featuresets_journey[100:1000], featuresets_journey[:100]

#instantiate classifier
classifier = nltk.NaiveBayesClassifier.train(train_set_journey)

#print accuracy and most informative features
print(nltk.classify.accuracy(classifier, test_set_journey)) 

classifier.show_most_informative_features(20)


0.5
Most Informative Features
      contains(exercise) = True           Living : Living =     50.2 : 1.0
       contains(ability) = True           Altern : Living =     25.9 : 1.0
        contains(adding) = True           Altern : Living =     25.9 : 1.0
         contains(adobe) = True           Altern : Living =     25.9 : 1.0
        contains(agents) = True           Altern : Living =     25.9 : 1.0
      contains(approved) = True           Altern : Living =     25.9 : 1.0
        contains(bitter) = True           Altern : Living =     25.9 : 1.0
        contains(bloods) = True           Altern : Living =     25.9 : 1.0
        contains(charge) = True           Altern : Living =     25.9 : 1.0
    contains(colleagues) = True           Altern : Living =     25.9 : 1.0
      contains(download) = True           Altern : Living =     25.9 : 1.0
       contains(failing) = True           Altern : Living =     25.9 : 1.0
          contains(file) = True           Altern : Living =     25.9 :

In [19]:
from collections import defaultdict
refsets = defaultdict(set)
testsets = defaultdict(set)
labels = []
tests = []
for i, (feats, label) in enumerate(test_set_journey):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
    labels.append(label)
    tests.append(observed)

print(nltk.ConfusionMatrix(labels, tests))

                                                           |           L          |
                                                           |           i          |
                                                           |           v          |
                                                           |           i          |
                                                           |           n          |
                                                           |           g          |
                                                           |                      |
                                                           |           w          |
                                                           |           i          |
                                                           |           t          |
                                                           |           h          |
                                                           |                

In [13]:
sample_review = "My doctor told me to start running and go on a diet"

#get features specific to the input text
sample_features = {word:True for word in sample_review.split()}


sample_review_doc_feats = get_document_features_journey(sample_review.split(),sample_features)


#print('Sample review features: \n\n',sample_review_doc_feats)

print('result of sample review: ', classifier.classify(sample_review_doc_feats))

result of sample review:  Living with diabetes - Nutrition
