## Background

This project will perform sentiment analysis on statements to determine if they are positive or negative reviews of a movie. 

The dataset used for training and testing is a subset of the Stanford Sentiment Treebank containing 400 phrases, split equally between positive and negative.


## 1. Load training data

In [None]:
# This section is for loading the training data from the google drive directory.
# It only applies when the notebook runs inside Google Colab, where the
# google.colab package is available.
import os

from google.colab import drive
# Mount the user's Google Drive at /content/drive.
drive.mount('/content/drive')
# Folder on the mounted drive that holds the assignment files.
dir_root = '/content/drive/MyDrive/Colab Notebooks/Assignment_1'
# Full path to the training file; the loading cell can open this instead of 'train.txt'.
train_file = os.path.join(dir_root, 'train.txt')

In [None]:
# load training data
train_path = 'train.txt' # replace 'train.txt' with train_file if using similar directory setup
# The context manager closes the file even if reading raises.
with open(train_path, 'r', encoding='utf-8') as f:
    train_lines = f.readlines()

# split the data into data and labels
# Each line has the form "<phrase>|<label>": the text before the first '|'
# is the phrase and the field after it is the integer sentiment label.
train_data, train_label = [], []
for line in train_lines:
    if not line.strip():
        continue  # tolerate blank lines, e.g. a trailing newline at end of file
    phrase, sentiment = line.split('|')[:2]
    train_data.append(phrase)
    train_label.append(int(sentiment))

#preview some data here (for testing)
preview = 10 # change preview # to see more or less data
# Slicing keeps the preview from failing when fewer than `preview` phrases were loaded.
for phrase, sentiment in zip(train_data[:preview], train_label[:preview]):
    print(f'Phrase "{phrase}" has the sentiment {sentiment}')

## 2. Manually inspect word frequencies

In [None]:
def word_frequency(data, label):
    """Count how often each word appears in positive and negative phrases.

    Args:
        data: Sequence of phrases (strings).
        label: Sequence of sentiment labels aligned with ``data``. A label
            equal to 1 counts as positive; any other value counts as negative.

    Returns:
        A dict mapping each lower-cased word to
        ``{'total': n, 'pos': p, 'neg': q}``, where ``n == p + q``.
    """
    words = {}
    for idx, phrase in enumerate(data):
        # Every word in a phrase is credited with that phrase's sentiment.
        bucket = 'pos' if label[idx] == 1 else 'neg'
        # split() with no argument collapses runs of whitespace and ignores
        # leading/trailing whitespace, so no empty-string "word" is counted.
        for token in phrase.split():
            counts = words.setdefault(token.lower(), {'total': 0, 'pos': 0, 'neg': 0})
            counts['total'] += 1
            counts[bucket] += 1
    return words

def _top_words(freqs, polarity, limit=20):
    """Return the `limit` entries with the largest share of `polarity` counts.

    Entries are ordered by the ratio freqs[word][polarity] / total, and then
    by total occurrences, both descending.
    """
    def rank(entry):
        stats = entry[1]
        return (stats[polarity] / stats['total'], stats['total'])

    ranked = sorted(freqs.items(), key=rank, reverse=True)
    return ranked[:limit]


words = word_frequency(train_data, train_label)

# Sanity check on one known word: all three counters must be present.
best_counts = words['best']
print(all(field in best_counts for field in ('total', 'pos', 'neg'))) # True
print(best_counts) # {'total': 12, 'pos': 12, 'neg': 0}

print('Top 20 positive words (largest ratio in positive posts)')
display(_top_words(words, 'pos'))
# [('best', {'total': 12, 'pos': 12, 'neg': 0}),
# ('brilliant', {'total': 6, 'pos': 6, 'neg': 0}), ...
print('Top 20 negative words (largest ratio in negative posts)')
display(_top_words(words, 'neg'))
# [('i', {'total': 11, 'pos': 0, 'neg': 11}),
#  ('bad', {'total': 10, 'pos': 0, 'neg': 10}), ...

## 3. Create the classifier

The handcrafted classifier for the sentiment analysis model was built by trial and error, using the `evaluate` function on the training data.

In [None]:
def sentiment_analysis_model(phrase):
    """Classify a phrase as positive (1) or negative (-1) using keyword lists.

    The phrase is lower-cased and checked against the positive keywords
    first, then the negative ones; the first list with a hit decides the
    result. A phrase that hits neither list is treated as positive.

    NOTE(review): keywords are matched as substrings of the whole phrase, not
    as separate words, so e.g. 'i' matches any phrase containing the letter
    "i". The lists were tuned against this behaviour, so it is kept as is.

    Args:
        phrase: The text to classify.

    Returns:
        1 for positive sentiment, -1 for negative sentiment.
    """
    positive_cues = (
        'best', 'brilliant', 'films', 'work', 'first', 'love', 'recent',
        'often', 'easily', 'performances', 'deserves', 'funny', 'well',
        'triumph', 'moving', 'beautiful', 'season', 'dazzling',
    )
    negative_cues = (
        'i', 'if', 'bad', 'worst', 'unlikable', 'utterly', 'pathetic',
        'dull', 'pointless', 'ugly', 'barely', 'incompetent', 'waste',
        'product', 'annoying', 'stale',
    )

    text = phrase.lower()
    if any(cue in text for cue in positive_cues):
        return 1
    if any(cue in text for cue in negative_cues):
        return -1
    return 1  # default to positive

def evaluate(func, data, label):
    """Return the fraction of phrases that ``func`` classifies correctly.

    Args:
        func: Callable that takes one phrase and returns a predicted label.
        data: Sequence of phrases.
        label: Sequence of expected labels aligned with ``data``.

    Returns:
        The accuracy as a float between 0 and 1, or 0.0 when ``data`` is
        empty.
    """
    total = len(data)
    # Compare the length explicitly (rather than `not data`) so that array
    # inputs work too, and bail out early to avoid dividing by zero.
    if total == 0:
        return 0.0

    correct = sum(
        1 for idx, phrase in enumerate(data) if func(phrase) == label[idx]
    )
    return correct / total

# measure how well the classifier does on the phrases it was tuned on
train_acc = evaluate(func=sentiment_analysis_model, data=train_data, label=train_label)
print(f"Your method has the training accuracy of {train_acc*100}%")

## 4. Evaluate

In [None]:
# This section is for loading the test data from the google drive directory
import sys
# Add the assignment folder (dir_root, set in the Colab setup cell) to the module search path.
sys.path.append(dir_root)
# Full path to the test set; pass this to test() instead of 'test.npy' when using the Drive layout.
test_dir = os.path.join(dir_root, 'test.npy')

In [None]:
import numpy as np

# function that compares model output to test data
def test(test_dir, func):

    (test_data, test_label) = np.load(test_dir)

    score = 0
    for idx in range(len(test_data)):
        if int(func(test_data[idx])) == int(test_label[idx]):
            score += 1

    return score / len(test_data)

test_path = 'test.npy' # replace 'test.npy' with test_dir if using similar directory setup
test_acc = test(test_path, sentiment_analysis_model)
print(f"Your method has the test accuracy of {test_acc*100}%")