# Sentiment_analysis_example_GRA

### Jinghong Zeng

This document introduces how to build a supervised model for binary sentiment classfication on movie reviews.

## Import data and functions

In [1]:
from nltk.corpus import movie_reviews as mr, stopwords # mr is data
from nltk.classify import NaiveBayesClassifier # classifier
from nltk.probability import FreqDist
from nltk.sentiment.util import *
from itertools import chain, islice
import string
import pandas as pd

## Manipulate data

In [2]:
# An example movie review
example = open('cv000_29416.txt', "r")
{"review": example.read(800).replace("\n", ""), 
 "tokens": mr.words('neg/cv000_29416.txt'), 
 "category": mr.categories('neg/cv000_29416.txt')}

{'review': 'plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what\'s the deal ? watch the movie and " sorta " find out . . . critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems',
 'tokens': ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...],
 'category': ['neg']}

In [3]:
example.close()

In [4]:
mr.categories()

['neg', 'pos']

In [5]:
# Create a list of (words of review, category of review)
# Remove stop words and punctuations
# Stop words are frequent words with no significant meaning, 
# eg., pronouns, articles
stop = stopwords.words('english')
document = [([w.lower() for w in mr.words(file_id) if w.lower() not in stop and w.lower() not in string.punctuation+"--"], category) for file_id in mr.fileids() for category in mr.categories(file_id)]
(document[0][0][:10], document[0][1])

(['plot',
  'two',
  'teen',
  'couples',
  'go',
  'church',
  'party',
  'drink',
  'drive',
  'get'],
 'neg')

## Create features

In [6]:
word_freq = FreqDist(chain(*[i for i,j in document]))
word_freq_sorted = dict(sorted(word_freq.items(), key=lambda item: item[1], 
                               reverse = True))
len(word_freq_sorted.keys())

39585

In [7]:
# Pick 500 most frequent words as features
features = list(word_freq_sorted.keys())[:500]

In [8]:
features[:10]

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much']

In [9]:
# Create features in data
# Each feature is a binary variable, True - in this review, False - not in this review
# Label is category
document_pos = [({i:(i in tokens) for i in features}, category) 
                for tokens, category in document if category == "pos"]
document_neg = [({i:(i in tokens) for i in features}, category) 
                for tokens, category in document if category == "neg"]

In [10]:
dict(islice(document_pos[0][0].items(), 10))

{'film': True,
 'one': True,
 'movie': False,
 'like': True,
 'even': True,
 'good': True,
 'time': True,
 'story': False,
 'would': True,
 'much': True}

In [11]:
# Create training and test data by 9:1
train_pos = document_pos[:int(len(document_pos)*0.9)]
test_pos = document_pos[int(len(document_pos)*0.9):]
train_neg = document_neg[:int(len(document_neg)*0.9)]
test_neg = document_neg[int(len(document_neg)*0.9):]
train_data = train_pos + train_neg
test_data = test_pos + test_neg

## Train a model

In [12]:
# Use Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_data)

## Predict

In [13]:
prediction = [classifier.classify(x[0]) for x in test_data]
result = pd.DataFrame(test_data, columns=['Features', 'Label'])
result["Prediction"] = prediction
result

Unnamed: 0,Features,Label,Prediction
0,"{'film': True, 'one': True, 'movie': True, 'li...",pos,pos
1,"{'film': True, 'one': True, 'movie': True, 'li...",pos,neg
2,"{'film': True, 'one': True, 'movie': True, 'li...",pos,pos
3,"{'film': True, 'one': True, 'movie': True, 'li...",pos,pos
4,"{'film': True, 'one': False, 'movie': True, 'l...",pos,neg
...,...,...,...
195,"{'film': True, 'one': True, 'movie': False, 'l...",neg,neg
196,"{'film': True, 'one': True, 'movie': True, 'li...",neg,neg
197,"{'film': True, 'one': True, 'movie': True, 'li...",neg,neg
198,"{'film': True, 'one': True, 'movie': False, 'l...",neg,neg


## Evaluate

In [14]:
# Accuracy
nltk.classify.accuracy(classifier, test_data)

0.785

In [15]:
# 10 most informative features
classifier.show_most_informative_features(10)

Most Informative Features
                   worst = True              neg : pos    =      4.2 : 1.0
                  stupid = True              neg : pos    =      4.2 : 1.0
                  boring = True              neg : pos    =      3.3 : 1.0
                   worse = True              neg : pos    =      2.4 : 1.0
                supposed = True              neg : pos    =      2.3 : 1.0
                 perfect = True              pos : neg    =      2.3 : 1.0
                   oscar = True              pos : neg    =      2.2 : 1.0
                    none = True              neg : pos    =      2.1 : 1.0
                  others = True              pos : neg    =      2.0 : 1.0
                  strong = True              pos : neg    =      2.0 : 1.0
