## Background

This project will perform sentiment analysis on statements to determine if they are positive or negative reviews of a movie. 

The dataset used for training and testing is a subset of the Stanford Sentiment Treebank containing 400 phrases, split equally between positive and negative.


## 1. Load training data

In [None]:
# This section is for loading the training data from the google drive directory
# It imports google.colab, so it only runs inside Google Colab; skip this cell
# when train.txt sits next to the notebook.
import os

from google.colab import drive
drive.mount('/content/drive')
# Folder that holds the assignment files; adjust to your own Drive layout.
dir_root = '/content/drive/MyDrive/Colab Notebooks/Assignment_1'
# Full path of the training file inside dir_root.
train_file = os.path.join(dir_root, 'train.txt')

In [3]:
# load training data
# replace 'train.txt' with train_file if using similar directory setup
with open('train.txt', 'r') as f:  # context manager closes the file even if reading fails
    train_lines = f.readlines()

# split the data into data and labels
# each line is expected to look like "<phrase>|<label>" with an integer label
train_data, train_label = [], []
for line in train_lines:
    if not line.strip():
        continue  # tolerate blank lines (e.g. an empty last line)
    # split on the LAST '|' so a phrase that itself contains '|' stays intact
    phrase, _, sentiment = line.rpartition('|')
    train_data.append(phrase)
    train_label.append(int(sentiment))  # int() ignores the trailing newline

#preview some data here (for testing)
preview = 10 # change preview # to see more or less data
# slicing caps the preview at the number of phrases actually loaded
for phrase, sentiment in zip(train_data[:preview], train_label[:preview]):
    print(f'Phrase \"{phrase}\" has the sentiment {sentiment}')

Phrase "Astonishingly skillful and moving" has the sentiment 1
Phrase "are incredibly beautiful to look at" has the sentiment 1
Phrase "as the most magical and most fun family fare of this or any recent holiday season" has the sentiment 1
Phrase "It shows that some studios firmly believe that people have lost the ability to think and will forgive any shoddy product as long as there 's a little girl-on-girl action ." has the sentiment -1
Phrase "Will assuredly rank as one of the cleverest , most deceptively amusing comedies of the year ." has the sentiment 1
Phrase "disintegrates into a dreary , humorless soap opera" has the sentiment -1
Phrase "The editing is chaotic , the photography grainy and badly focused , the writing unintentionally hilarious , the direction unfocused ," has the sentiment -1
Phrase "The film is often filled with a sense of pure wonderment and excitement not often seen in today 's cinema du sarcasm" has the sentiment 1
Phrase "is as appalling as any ` comedy ' to 

## 2. Manually inspect word frequencies

In [4]:
def word_frequency(data, label):
    """Count how often each lower-cased word occurs overall and per sentiment.

    Each phrase is split on single spaces and every piece is lower-cased.
    A phrase whose label equals 1 contributes to 'pos'; any other label
    contributes to 'neg'.

    Args:
        data: Sequence of phrase strings.
        label: Sequence of labels, where label[i] belongs to data[i].

    Returns:
        Dict mapping word -> {'total': int, 'pos': int, 'neg': int}, with
        words in order of first appearance.
    """
    counts = {}
    for idx, phrase in enumerate(data):
        tokens = phrase.split(" ")
        # every token of this phrase is credited to the same sentiment bucket
        bucket = 'pos' if label[idx] == 1 else 'neg'
        for token in tokens:
            entry = counts.setdefault(token.lower(), {'total': 0, 'pos': 0, 'neg': 0})
            entry['total'] += 1
            entry[bucket] += 1
    return counts

# Count words over the training set, then sanity-check the entry for 'best'.
words = word_frequency(train_data, train_label)
print(all(field in words['best'] for field in ('total', 'pos', 'neg'))) # True
print(words['best']) # {'total': 12, 'pos': 12, 'neg': 0}

# Rank by share of positive occurrences; ties go to the larger total count.
# display() is presumably the notebook (IPython) rich-output helper.
print('Top 20 positive words (largest ratio in positive posts)')
display(sorted(
    words.items(),
    key=lambda entry: (entry[1]['pos'] / entry[1]['total'], entry[1]['total']),
    reverse=True,
)[:20])
# [('best', {'total': 12, 'pos': 12, 'neg': 0}),
# ('brilliant', {'total': 6, 'pos': 6, 'neg': 0}), ...

# Same ranking, using the share of negative occurrences instead.
print('Top 20 negative words (largest ratio in negative posts)')
display(sorted(
    words.items(),
    key=lambda entry: (entry[1]['neg'] / entry[1]['total'], entry[1]['total']),
    reverse=True,
)[:20])
# [('i', {'total': 11, 'pos': 0, 'neg': 11}),
#  ('bad', {'total': 10, 'pos': 0, 'neg': 10}), ...

True
{'total': 12, 'pos': 12, 'neg': 0}
Top 20 positive words (largest ratio in positive posts)


[('best', {'total': 12, 'pos': 12, 'neg': 0}),
 ('brilliant', {'total': 6, 'pos': 6, 'neg': 0}),
 ('films', {'total': 6, 'pos': 6, 'neg': 0}),
 ('work', {'total': 5, 'pos': 5, 'neg': 0}),
 ('first', {'total': 4, 'pos': 4, 'neg': 0}),
 ('love', {'total': 4, 'pos': 4, 'neg': 0}),
 ('their', {'total': 4, 'pos': 4, 'neg': 0}),
 ('recent', {'total': 3, 'pos': 3, 'neg': 0}),
 ('often', {'total': 3, 'pos': 3, 'neg': 0}),
 ('easily', {'total': 3, 'pos': 3, 'neg': 0}),
 ('performances', {'total': 3, 'pos': 3, 'neg': 0}),
 ('deserves', {'total': 3, 'pos': 3, 'neg': 0}),
 ('them', {'total': 3, 'pos': 3, 'neg': 0}),
 ('funny', {'total': 3, 'pos': 3, 'neg': 0}),
 ('well', {'total': 3, 'pos': 3, 'neg': 0}),
 ('triumph', {'total': 3, 'pos': 3, 'neg': 0}),
 ('adventure', {'total': 3, 'pos': 3, 'neg': 0}),
 ('moving', {'total': 2, 'pos': 2, 'neg': 0}),
 ('beautiful', {'total': 2, 'pos': 2, 'neg': 0}),
 ('season', {'total': 2, 'pos': 2, 'neg': 0})]

Top 20 negative words (largest ratio in negative posts)


[('i', {'total': 11, 'pos': 0, 'neg': 11}),
 ('bad', {'total': 10, 'pos': 0, 'neg': 10}),
 ('worst', {'total': 6, 'pos': 0, 'neg': 6}),
 ('my', {'total': 4, 'pos': 0, 'neg': 4}),
 ('if', {'total': 4, 'pos': 0, 'neg': 4}),
 ('when', {'total': 4, 'pos': 0, 'neg': 4}),
 ('product', {'total': 3, 'pos': 0, 'neg': 3}),
 ('into', {'total': 3, 'pos': 0, 'neg': 3}),
 ('unlikable', {'total': 3, 'pos': 0, 'neg': 3}),
 ('utterly', {'total': 3, 'pos': 0, 'neg': 3}),
 ('pathetic', {'total': 3, 'pos': 0, 'neg': 3}),
 ('dull', {'total': 3, 'pos': 0, 'neg': 3}),
 (':', {'total': 3, 'pos': 0, 'neg': 3}),
 ('pointless', {'total': 3, 'pos': 0, 'neg': 3}),
 ('character', {'total': 3, 'pos': 0, 'neg': 3}),
 ('ugly', {'total': 3, 'pos': 0, 'neg': 3}),
 ('theater', {'total': 3, 'pos': 0, 'neg': 3}),
 ('he', {'total': 3, 'pos': 0, 'neg': 3}),
 ('barely', {'total': 3, 'pos': 0, 'neg': 3}),
 ('people', {'total': 2, 'pos': 0, 'neg': 2})]

## 3. Create the classifier

The handcrafted classifier for the sentiment analysis model was built by trial and error, using the `evaluate` function on the training data to check each change.

In [7]:
def sentiment_analysis_model(phrase):
    """Classify a phrase as positive (1) or negative (-1) using fixed cue lists.

    The lower-cased phrase is checked against the positive cues first; any hit
    returns 1. Otherwise any negative cue returns -1. With no hit at all the
    phrase defaults to positive.

    NOTE(review): cues are matched as substrings, not whole words, so 'i'
    matches any phrase containing the letter "i" and 'well' matches "dwell".
    Kept as is because the reported accuracies were obtained with this
    behaviour.

    Args:
        phrase: The text to classify.

    Returns:
        1 for positive, -1 for negative.
    """
    # handcrafted classifier
    positive_cues = (
        'best', 'brilliant', 'films', 'work', 'first', 'love', 'recent',
        'often', 'easily', 'performances', 'deserves', 'funny', 'well',
        'triumph', 'moving', 'beautiful', 'season', 'dazzling',
    )
    negative_cues = (
        'i', 'if', 'bad', 'worst', 'unlikable', 'utterly', 'pathetic', 'dull',
        'pointless', 'ugly', 'barely', 'incompetent', 'waste', 'product',
        'annoying', 'stale', 'shoddy',
    )

    text = phrase.lower()

    # positive cues take precedence over negative ones
    if any(cue in text for cue in positive_cues):
        return 1
    if any(cue in text for cue in negative_cues):
        return -1

    return 1 # default to positive

def evaluate(func, data, label):
    """Return the fraction of samples in ``data`` that ``func`` labels correctly.

    Args:
        func: Callable that takes one element of ``data`` and returns a
            predicted label.
        data: Sequence of samples.
        label: Sequence of expected labels, where ``label[i]`` belongs to
            ``data[i]``.

    Returns:
        Accuracy as a float between 0 and 1. Empty ``data`` yields 0.0
        instead of raising ZeroDivisionError.
    """
    # len() rather than truthiness so array-like inputs are handled too
    total = len(data)
    if total == 0:
        return 0.0

    correct = sum(
        1 for idx, sample in enumerate(data) if func(sample) == label[idx]
    )
    return correct / total

# use training data to evaluate the model
# (these are the same phrases the cue lists were tuned on, so this is not a held-out estimate)
train_acc = evaluate(sentiment_analysis_model, train_data, train_label)
print(f"Your method has the training accuracy of {train_acc*100}%")

Your method has the training accuracy of 78.5%


## 4. Evaluate

In [None]:
# This section is for loading the test data from the google drive directory
# It relies on `os` and `dir_root` from the Drive-mount cell in section 1.
import sys
sys.path.append(dir_root)
# Despite the name, test_dir is the path of the test.npy file, not a directory.
test_dir = os.path.join(dir_root, 'test.npy')

In [8]:
import numpy as np

# function that compares model output to test data
def test(test_dir, func):
    """Return the accuracy of ``func`` on the test set stored at ``test_dir``.

    Args:
        test_dir: Path handed to ``np.load``. The loaded array must unpack
            into exactly two parallel sequences: the samples and their labels.
        func: Callable that takes one sample and returns a prediction
            convertible with ``int()``.

    Returns:
        Fraction of samples whose integer prediction equals the integer
        label. An empty test set yields 0.0 instead of raising
        ZeroDivisionError.
    """
    test_data, test_label = np.load(test_dir)

    total = len(test_data)
    if total == 0:
        return 0.0

    # both sides go through int() so differing label representations compare equal
    score = sum(
        1
        for idx, sample in enumerate(test_data)
        if int(func(sample)) == int(test_label[idx])
    )
    return score / total

# Score the handcrafted classifier on the test set.
test_acc = test('test.npy', sentiment_analysis_model) # replace 'test.npy' with test_dir if using similar directory setup
print(f"Your method has the test accuracy of {test_acc*100}%")

Your method has the test accuracy of 61.0%
