##### Basic Sentiment Analysis on Movie Reviews Database Using Naive Bayes #####

In [6]:
import os
import nltk
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [7]:
# Read every review file and tokenize it.
# `data` ends up with one row per review: its relative path ('Ids'), its
# label ('Sentiment': 1 for pos/, -1 for neg/) and its token list ('Data').
data_dir = os.getcwd() + "/datasets/movie_reviews/" #get the data directory
# os.listdir returns names in arbitrary order; sorting makes the row order
# (and therefore the positional train/test split done later) reproducible.
posids = sorted(os.listdir(data_dir+"pos/"))
negids = sorted(os.listdir(data_dir+"neg/"))
df1 = pd.DataFrame({'Ids' : ["pos/" + name for name in posids], 'Sentiment' : 1, 'Data' : ''})
df2 = pd.DataFrame({'Ids' : ["neg/" + name for name in negids], 'Sentiment' : -1, 'Data' : ''})
# positives first, then negatives; reset_index gives every row a unique label
data = pd.concat([df1,df2])
data = data.reset_index()
df1 = None; df2 = None

tokenized = []
for review_id in data.Ids:
    # 'with' closes the handle even if reading or tokenizing raises
    with open(data_dir + review_id) as f:
        tokenized.append(nltk.word_tokenize(f.read())) #tokenize the data
# Assign the whole column at once instead of the removed DataFrame.set_value;
# the list is in row order, so it lines up positionally with `data`.
data['Data'] = tokenized

In [8]:
def remove_from_list(the_list, val):
    """Return a new list with the lowercased items of the_list, minus val.

    Items are compared against val before they are lowercased; every item
    that differs from val is kept (in its original order) in lowercase form.
    """
    kept = []
    for item in the_list:
        if item != val:
            kept.append(item.lower())
    return kept

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the WordNet POS constant the lemmatizer expects.

    The decision is made on the first character of the tag:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [9]:
# Normalize every tokenized review:
#   * remove tokens such as (, ), -, ', ", : and ``
#   * lowercase everything
#   * convert n't to not, and the verb contractions 's / 're / 've to is / are / have
#   * get the parts of speech and lemmatize each token based on its tag

# NOTE(review): NLTK's Treebank-style tokenizer typically emits '' (two single
# quotes) for a closing double quote; that token is not in this set, so it is
# kept - confirm whether it should be removed as well.
SYMBOLS = frozenset(['(', ')', '-', '\'', '\"', ':', '``'])
VERB_CONTRACTIONS = {"'s": 'is', "'re": 'are', "'ve": 'have'}
lmtzr = WordNetLemmatizer()  # built once instead of once per token


def _normalize_tokens(tokens):
    """Return a new cleaned, lowercased and lemmatized copy of `tokens`.

    tokens: list of string tokens for one review.
    The input list is not modified.
    """
    # One pass with a set lookup replaces one full pass per symbol.
    tokens = [tok.lower() for tok in tokens if tok not in SYMBOLS]

    # Tags are computed before the contractions are expanded and are reused
    # unchanged for lemmatization.
    postags = nltk.pos_tag(tokens)

    for i, (_, tag) in enumerate(postags):
        if tokens[i] == "n't":
            tokens[i] = 'not'
        elif tag.startswith('V'): #only expand 's / 're / 've when tagged as a verb
            tokens[i] = VERB_CONTRACTIONS.get(tokens[i], tokens[i])

    return [lmtzr.lemmatize(tok, get_wordnet_pos(tag))
            for tok, (_, tag) in zip(tokens, postags)]


# NOTE: an experimental negation-marking step (prefixing each token that
# follows 'not' with 'NOT_' up to the next ',', '.', ';' or '?') was present
# here only as commented-out code in the original; it is not applied.

# Replace the whole column at once instead of the removed DataFrame.set_value;
# iterating data.index keeps the results in row order.
data['Data'] = [_normalize_tokens(data.Data[ind]) for ind in data.index]

In [10]:
#construct the features vector

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def word_feats(words):
    """Build a bag-of-words feature dict: every distinct word maps to True.

    Repeated words collapse into a single key.
    """
    return dict.fromkeys(words, True)
 
# One (feature dict, sentiment label) pair per review, in row order.
feats = []
for ind in data.index:
    feats.append((word_feats(data.Data[ind]), data.Sentiment[ind]))

In [11]:
#divide the data into training set and test set:
#the first 3/4 of each class is used for training, the remaining 1/4 for testing.
#feats is ordered positives-first (that is how `data` is concatenated), so the
#class boundary is the number of positive examples rather than a hard-coded 1000.
num_pos = sum(1 for _, label in feats if label == 1)
num_neg = len(feats) - num_pos
#'//' keeps the cutoffs integers on Python 3 as well; plain '/' would produce
#a float there, which cannot be used as a slice index.
pos_cutoff = num_pos * 3 // 4
neg_cutoff = num_neg * 3 // 4
#np.random.shuffle(feats)  #left disabled: the slicing below relies on the positives-first order

trainfeats = feats[:pos_cutoff] + feats[num_pos:num_pos + neg_cutoff]
testfeats = feats[pos_cutoff:num_pos] + feats[num_pos + neg_cutoff:]
#a single pre-formatted argument prints identically on Python 2 and Python 3
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

train on 1500 instances, test on 500 instances


In [12]:
#apply the Naive Bayes Classifier and report accuracy (percent, rounded to 2 decimals)
classifier = NaiveBayesClassifier.train(trainfeats)
train_accuracy = round(nltk.classify.util.accuracy(classifier, trainfeats)*100,2)
test_accuracy = round(nltk.classify.util.accuracy(classifier, testfeats)*100,2)
#each print gets one pre-formatted string so the output is the same on
#Python 2 and Python 3 (a parenthesized pair would print as a tuple on Python 2)
print('training accuracy: %s' % train_accuracy)
print('test accuracy: %s' % test_accuracy)
#classifier.show_most_informative_features()

training accuracy: 97.73
test accuracy: 70.6
