In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter

### Transform Raw texts into training and development data

In [2]:
data_dev = pd.read_csv('dev.tsv', sep='\t')
data_test = pd.read_csv('test.tsv', sep='\t')
data_train = pd.read_csv('train.tsv', sep='\t')

data_dev_phrases = list(data_dev['Phrase'])
data_test_phrases = list(data_test['Phrase'])
data_train_phrases = list(data_train['Phrase'])

data_dev_sentiments_5 = list(data_dev['Sentiment'])
data_train_sentiments_5 = list(data_train['Sentiment'])

### Map to 3-value Sentiment Scale

In [3]:
def map_to_3_value(sentiments):
    value_scale = {
        0: 0,
        1: 0,
        2: 1,
        3: 2,
        4: 2,
    }
    
    return [value_scale[sentiment] for sentiment in sentiments]

In [4]:
data_dev_sentiments_3 = map_to_3_value(data_dev_sentiments_5)
data_train_sentiments_3 = map_to_3_value(data_train_sentiments_5)

print(data_train_sentiments_5[:20])
print(data_train_sentiments_3[:20])

[1, 4, 1, 3, 1, 4, 1, 3, 1, 1, 1, 1, 4, 3, 3, 3, 3, 2, 1, 2]
[0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 0, 1]


### Text Pre-Processing

1. Tokenisation
2. Remove stop words

In [5]:
default_stop_words = {
    'a', 'ad', 'after', 'again', 'all', 'also', 'am', 'an', 'and', 'any',
    'are', 'as', 'at', 'be', 'because', 'been', 'being', 'between', 'both',
    'but', 'by', 'can', 'could', 'does', 'each', 'ed', 'eg', 'either', 'etc',
    'even', 'ever', 'every', 'for', 'from', 'had', 'has', 'have', 'he', 'her',
    'hers', 'herself', 'him', 'himself', 'his', 'i', 'ie', 'if', 'in', 'inc',
    'into', 'is', 'it', 'its', 'itself', 'li', 'll', 'ltd', 'may', 'maybe',
    'me', 'might', 'mine', 'minute', 'minutes', 'must', 'my', 'myself',
    'neither', 'nor', 'now', 'of', 'on', 'only', 'or', 'other', 'our', 'ours',
    'ourselves', 'own', 'same', 'seem', 'seemed', 'shall', 'she', 'some',
    'somehow', 'something', 'sometimes', 'somewhat', 'somewhere', 'spoiler',
    'spoilers', 'such', 'suppose', 'that', 'the', 'their', 'theirs', 'them',
    'themselves', 'there', 'these', 'they', 'this', 'those', 'thus', 'to',
    'today', 'tomorrow', 'us', 've', 'vs', 'was', 'we', 'were', 'what',
    'whatever', 'when', 'where', 'which', 'who', 'whom', 'whose', 'will',
    'with', 'yesterday', 'you', 'your', 'yours', 'yourself', 'yourselves'
}

### Unigram extraction from a phrase

In [6]:
def extract_unigrams(phrase):
    return [
        word.lower() for word in re.findall(r'\b[A-Za-z]{2,}\b', phrase)
        if word.lower() not in default_stop_words
    ]

### Create a vocabulary of unigrams

In [7]:
def get_vocab(phrases):
    return {
        word.lower() for phrase in phrases 
        for word in re.findall(r'\b[A-Za-z]{2,}\b', phrase)
        if word.lower() not in default_stop_words
    }

In [8]:
vocab = { unigram for phrase in data_train_phrases 
         for unigram in extract_unigrams(phrase) }

print(list(vocab)[:50])

['accompanies', 'existential', 'wretchedly', 'partners', 'providing', 'friggin', 'times', 'susan', 'dual', 'pressed', 'available', 'ramblings', 'plainness', 'text', 'tightrope', 'load', 'agreeably', 'underdeveloped', 'dismissive', 'walls', 'screens', 'infatuation', 'plucky', 'bruce', 'giddy', 'red', 'inquisitive', 'thurman', 'flops', 'absorption', 'joan', 'detention', 'simmer', 'profane', 'schumacher', 'topple', 'beers', 'plunging', 'whirlwind', 'bardem', 'entranced', 'accentuating', 'rejected', 'enthusiasts', 'directors', 'turkey', 'snappy', 'cars', 'thoughtlessly', 'butthead']


#### Create vocabulary id -> word and word -> id dictionaries for reference:

In [9]:
vocab_id_to_word = dict(enumerate(vocab))

word_to_vocab_id = {v: k for k, v in vocab_id_to_word.items()}

#### Extract unigrams for each phrase in development, train, and test dataset

In [10]:
data_dev_unigrams = [extract_unigrams(phrase) for phrase in data_dev_phrases]

data_train_unigrams = [extract_unigrams(phrase) for phrase in data_train_phrases]

data_test_unigrams = [extract_unigrams(phrase) for phrase in data_test_phrases]

### Calculate the prior probability of each class

In [11]:
def calculate_prior_probability(sentiments):    
    counts = Counter(sentiments)
    
    total = len(sentiments)
    
    return { k: v / total for (k, v) in sorted(counts.items()) }

In [12]:
data_dev_5_prob = calculate_prior_probability(data_dev_sentiments_5)
data_train_5_prob = calculate_prior_probability(data_train_sentiments_5)

data_dev_3_prob = calculate_prior_probability(data_dev_sentiments_3)
data_train_3_prob = calculate_prior_probability(data_train_sentiments_3)

### Calculate the probability of each word in vocabulary

In [13]:
def calculate_word_probability(total_vocab, data_unigrams, sentiments):
    array = np.zeros((total_vocab, len(set(sentiments))))
    
    for i, unigrams in enumerate(data_unigrams):
        for word, count in Counter(unigrams).items():
            array[word_to_vocab_id[word]][sentiments[i]] += count

    # With Laplace smoothing
    return (array[:] + 1) / (array.sum(axis=0) + total_vocab)

#### 3-value Sentiment scale

In [14]:
word_prob_3 = calculate_word_probability(len(vocab), data_train_unigrams, data_train_sentiments_3)

word_prob_3

array([[2.31921703e-05, 7.13546684e-05, 4.28862442e-05],
       [4.63843406e-05, 3.56773342e-05, 1.07215611e-04],
       [4.63843406e-05, 3.56773342e-05, 2.14431221e-05],
       ...,
       [4.63843406e-05, 3.56773342e-05, 2.14431221e-05],
       [2.31921703e-05, 7.13546684e-05, 2.14431221e-05],
       [9.27686813e-05, 3.56773342e-05, 4.28862442e-05]])

#### 5-value Sentiment scale

In [15]:
word_prob_5 = calculate_word_probability(len(vocab), data_train_unigrams, data_train_sentiments_5)

word_prob_5

array([[4.27953952e-05, 2.96068214e-05, 7.13546684e-05, 5.67391983e-05,
        3.93530361e-05],
       [4.27953952e-05, 5.92136428e-05, 3.56773342e-05, 1.41847996e-04,
        3.93530361e-05],
       [4.27953952e-05, 5.92136428e-05, 3.56773342e-05, 2.83695991e-05,
        3.93530361e-05],
       ...,
       [4.27953952e-05, 5.92136428e-05, 3.56773342e-05, 2.83695991e-05,
        3.93530361e-05],
       [4.27953952e-05, 2.96068214e-05, 7.13546684e-05, 2.83695991e-05,
        3.93530361e-05],
       [8.55907904e-05, 8.88204642e-05, 3.56773342e-05, 5.67391983e-05,
        3.93530361e-05]])

### Predict development sentiments

In [16]:
def predict_sentiments(prior_prob, word_prob, data_unigrams, true_sentiments):
    total_sentiments = len(set(true_sentiments))
    
    confusion_matrix = np.zeros((total_sentiments, total_sentiments))
    
    # X-axis is predicted sentiments, Y-axis is true sentiments
    
    
    return confusion_matrix

#### 3-value Sentiment scale

In [17]:
pred_sentiments_3 = predict_sentiments(data_train_3_prob, word_prob_3, data_dev_unigrams, data_dev_sentiments_3)

pred_sentiments_3

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

#### 5-value Sentiment scale

In [18]:
pred_sentiments_5 = predict_sentiments(data_train_5_prob, word_prob_5, data_dev_unigrams, data_dev_sentiments_5)

pred_sentiments_5

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

### Predict test sentiments

#### 3-value Sentiment scale

#### 5-value Sentiment scale