# A Naive Implementation of a Naive Bayes Classifier for Sentiment Analysis of WebMD Drug Reviews

This notebook uses the NLTK Naive Bayes classifier.  The code is adapted from the tutorial at https://www.twilio.com/blog/2017/09/sentiment-analysis-python-messy-data-nltk.html.

In [3]:
## Import packages we need

import string
from nltk.classify import NaiveBayesClassifier
import nltk.classify.util
import nltk
import math
import csv

In [4]:
## define needed functions

def format_sentence(sent, stopwords=None):
    """Turn a raw sentence into a bag-of-words feature dict.

    Punctuation is removed and the text is lower-cased, then the text is
    split on whitespace.  Every word that survives filtering becomes a
    ``{word: True}`` entry, the feature format the classifier is fed.

    Args:
        sent: The raw text to featurize.
        stopwords: Optional collection of words to drop.  When ``None``,
            no filtering is applied and every word is kept.

    Returns:
        A dict mapping each kept word to ``True`` (empty for empty input).
    """
    # Strip punctuation before lower-casing so "Great!" and "great" end up
    # as the same feature.
    cleaned = sent.translate(str.maketrans("", "", string.punctuation)).lower()
    words = cleaned.split()

    # Only filter when a stopword collection was supplied; previously the
    # no-stopwords case fell through and produced an empty feature dict.
    if stopwords is not None:
        words = [word for word in words if word not in stopwords]

    return {word: True for word in words}


In [5]:
### Import reviews from csv file

# Load every review as a {'comment': ..., 'rating': ...} record.
# newline='' is what the csv module expects from its input file; without it,
# newlines embedded inside quoted fields are not interpreted correctly.
with open('citalopram_effectivness.csv', newline='') as commentfile:
    reader = csv.DictReader(commentfile)
    my_list = [
        {'comment': row['comment'], 'rating': row['rating']}
        for row in reader
    ]

In [6]:
## split reviews up into a positive, negative, and neutral list

# Load the stopword list once, up front: it is identical for every review,
# so there is no need to re-open and re-parse the file on each iteration.
# Punctuation is stripped from the entries so they can match the
# punctuation-free tokens that format_sentence() compares them against.
with open('./stopwords_long') as raw:
    stopwords = raw.read().translate(str.maketrans("", "", string.punctuation)).splitlines()

pos_list = []
neg_list = []
neu_list = []
for c in my_list:
    tmp_com = c['comment']
    tmp_rating = c['rating']

    # Reviews with no text are dropped from all three lists.
    if tmp_com == '':
        continue

    if tmp_rating in ['1', '2']:
        neg_list.append((format_sentence(tmp_com, stopwords), 'neg'))
    elif tmp_rating in ['4', '5']:
        pos_list.append((format_sentence(tmp_com, stopwords), 'pos'))
    else:
        # Any other rating is kept as raw, unlabelled text.
        neu_list.append(tmp_com)

print("Neg:"+str(len(neg_list))+"\nPos:"+str(len(pos_list))+"\nNeutral:"+str(len(neu_list)))
pos_list[0]


Neg:152
Pos:431
Neutral:146


({'fact': True,
  'great': True,
  'other': True,
  'sleepy': True,
  'than': True,
  'very': True},
 'pos')

In [7]:
## print neutral entries to a file

#with open('neutral.txt', 'w') as output_file:
#    for i in neu_list:
#        output_file.write(i+"\n")

In [16]:
### create training and test sets

## set the cutoffs
# Three quarters of each class (rounded down) go to training.
negcutoff = len(neg_list) * 3 // 4
poscutoff = len(pos_list) * 3 // 4

# Head of each class list is the training portion, tail is the test portion.
train = [*neg_list[:negcutoff], *pos_list[:poscutoff]]
test = [*neg_list[negcutoff:], *pos_list[poscutoff:]]

print('train on %d instances, test on %d instances' % (len(train), len(test)))
print('negcutoff %d instances, poscutoff %d instances' % (negcutoff, poscutoff))



train on 437 instances, test on 146 instances
negcutoff 114 instances, poscutoff 323 instances


In [20]:
## creates randomly chosen training and test sets

import random

# Randomly pick `negcutoff` / `poscutoff` indices per class for training;
# every index that was not picked goes to the test set.
neg_idx_train = sorted(random.sample(range(len(neg_list)), negcutoff))
neg_train = [neg_list[i] for i in neg_idx_train]

neg_idx_test = set(range(len(neg_list))) - set(neg_idx_train)
# Iterate in sorted order so the held-out order does not depend on set
# iteration order.
neg_test = [neg_list[i] for i in sorted(neg_idx_test)]


pos_idx_train = sorted(random.sample(range(len(pos_list)), poscutoff))
pos_train = [pos_list[i] for i in pos_idx_train]

pos_idx_test = set(range(len(pos_list))) - set(pos_idx_train)
pos_test = [pos_list[i] for i in sorted(pos_idx_test)]

train = neg_train + pos_train
test = neg_test + pos_test

# Persist both splits.  The files are opened in `with` blocks so they are
# flushed and closed deterministically, and with newline='' as the csv
# module expects for files it writes to.
with open('citalopram_train.csv', 'w', newline='') as train_file:
    wtrain = csv.writer(train_file, delimiter=',', lineterminator='\n')
    wtrain.writerows(train)

with open('citalopram_test.csv', 'w', newline='') as test_file:
    wtest = csv.writer(test_file, delimiter=',', lineterminator='\n')
    wtest.writerows(test)


print('train on %d instances, test on %d instances' % (len(train), len(test)))
print('neg_idx_train %d instances, pos_idx_train %d instances' % (len(neg_idx_train), len(pos_idx_train)))

train on 437 instances, test on 146 instances
neg_idx_train 114 instances, pos_idx_train 323 instances


In [18]:
# Fit the Naive Bayes model on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(train)

In [19]:
# Score the trained classifier against the held-out test pairs.
print('accuracy:', nltk.classify.util.accuracy(classifier, test))


accuracy: 0.7328767123287672


In [13]:
# Print the features with the largest neg:pos / pos:neg likelihood ratios.
classifier.show_most_informative_features()

Most Informative Features
                 stomach = True              neg : pos    =     10.3 : 1.0
                cymbalta = True              neg : pos    =      6.6 : 1.0
                  hoping = True              neg : pos    =      6.6 : 1.0
                    quit = True              neg : pos    =      6.2 : 1.0
                   since = True              pos : neg    =      5.6 : 1.0
                   years = True              pos : neg    =      5.4 : 1.0
                terrible = True              neg : pos    =      5.1 : 1.0
              absolutely = True              neg : pos    =      5.1 : 1.0
                   happy = True              pos : neg    =      4.8 : 1.0
                   bouts = True              neg : pos    =      4.7 : 1.0


In [38]:
## Import neutral reviews

# readlines() keeps each line's trailing newline, so printing an entry
# later emits an extra blank line after it.
with open('neutral.txt') as file:
    toclass = file.readlines()
len(toclass)

154

## Examples of possible mis-classifications

In [33]:
## Negation
# Show the raw text, then featurize it once and reuse the features for
# both the display and the prediction.
example = toclass[9]
print(example)
example_features = format_sentence(example, stopwords)
print(example_features)
predicted = classifier.classify(example_features)
print('Classification: '+predicted)

I can see not improvement

{'improvement': True}
Classification: pos


In [35]:
## Semantics
# Show the raw text, then featurize it once and reuse the features for
# both the display and the prediction.
example = toclass[29]
print(example)
example_features = format_sentence(example, stopwords)
print(example_features)
predicted = classifier.classify(example_features)
print('Classification: '+predicted)

still very depressed.

{'very': True, 'depressed': True}
Classification: pos


In [36]:
## Confusion of symptoms and end outcome
# Show the raw text, then featurize it once and reuse the features for
# both the display and the prediction.
example = toclass[32]
print(example)
example_features = format_sentence(example, stopwords)
print(example_features)
predicted = classifier.classify(example_features)
print('Classification: '+predicted)

I have had rapid heart beat ,loss of appetite ,weight loss ,loss of sleep ,stomach pains, cramps in my feet or muscle cramps, frequent urination,  but it has helped my mood

{'rapid': True, 'heart': True, 'beat': True, 'loss': True, 'appetite': True, 'weight': True, 'sleep': True, 'stomach': True, 'pains': True, 'cramps': True, 'feet': True, 'or': True, 'muscle': True, 'frequent': True, 'urination': True, 'helped': True, 'mood': True}
Classification: neg


In [37]:
## Neutral question, not opinion
# Show the raw text, then featurize it once and reuse the features for
# both the display and the prediction.
example = toclass[55]
print(example)
example_features = format_sentence(example, stopwords)
print(example_features)
predicted = classifier.classify(example_features)
print('Classification: '+predicted)

does this cause mild stomach pain?

{'mild': True, 'stomach': True, 'pain': True}
Classification: neg


## Other examples
The second and third actually change their predicted class depending on whether or not you remove stopwords.

In [26]:
# Mixed review: the drug worked, but with persistent drowsiness.
sent1 = "I took the medication for depression, it did a great job, however I was always tired. I couldn't stay awake on my job, couldn't stay awake while driving. I was just putting myself in harm by taking that medicine. The sleepiness, never went away."
sent1_features = format_sentence(sent1, stopwords)
print(classifier.classify(sent1_features))

pos


In [27]:
# Review reporting tiredness and no benefit from the drug.
sent2 = "Can't stay awake always tired and doesn't seem to work for me I've had up to 40mg need to try a different tablet"
sent2_features = format_sentence(sent2, stopwords)
print(classifier.classify(sent2_features))

neg


In [28]:
# Long review mixing initial improvement with harsh side effects.
sent3 = "I recently had to kick my son out of the house. I could hardly bare it. I couldn't get out of bed and cried every day. I was talked into taking Celexa by my DR. After 3-4 weeks, I did stop crying and could get out of bed. But seeing a counselor helped me the most and have come to terms with my crisis event. After 2.5 months, I have stopped taking Celexa. I talked to my pharmacist and the side effects (for me) were multiple and harsh. Nausea, dizziness, diarrhea and the worst was the trembling in my hands. I thought I was getting Parkinsons! Good luck to all with finding your happiness."
sent3_features = format_sentence(sent3, stopwords)
print(classifier.classify(sent3_features))

neg


## Domain Swap
The classifier above was trained on Citalopram drug reviews.  Let's test how it performs on Gilenya drug reviews.

In [40]:
## Import Gilenya reviews

# Load every Gilenya review as a {'comment': ..., 'rating': ...} record.
# newline='' is what the csv module expects from its input file; without it,
# newlines embedded inside quoted fields are not interpreted correctly.
with open('gilenya_effectivness.csv', newline='') as commentfile:
    reader = csv.DictReader(commentfile)
    gilenya_list = [
        {'comment': row['comment'], 'rating': row['rating']}
        for row in reader
    ]
len(gilenya_list)

75

In [47]:
# Build labelled (features, label) pairs from the Gilenya reviews and score
# the Citalopram-trained classifier on them.
# NOTE(review): the labelling here differs from the Citalopram split above --
# rating '3' is counted as 'pos' (there it is set aside as neutral) and empty
# comments are not skipped -- so the two accuracy figures are not directly
# comparable.  Confirm whether that is intended.
d_list = []
for review in gilenya_list:
    tmp_c = review['comment']
    tmp_r = review['rating']

    if tmp_r in ['1', '2']:
        d_list.append((format_sentence(tmp_c, stopwords), 'neg'))
    elif tmp_r in ['3', '4', '5']:
        d_list.append((format_sentence(tmp_c, stopwords), 'pos'))
    # Any other rating value is ignored.

nltk.classify.util.accuracy(classifier, d_list)
       

0.8

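Interestingly, the classifier performs better on the Gilenya reviews than it did on the Citalopram reviews.  Note, however, that the comparison is not like-for-like: 3-star Gilenya reviews are counted as positive here, whereas 3-star Citalopram reviews were set aside as neutral and excluded from both training and testing.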