In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.metrics import confusion_matrix

import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/jun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Transform Raw texts into training and development data

In [2]:
data_dev = pd.read_csv('dev.tsv', sep='\t')
data_test = pd.read_csv('test.tsv', sep='\t')
data_train = pd.read_csv('train.tsv', sep='\t')

data_dev_phrases = list(data_dev['Phrase'])
data_test_phrases = list(data_test['Phrase'])
data_train_phrases = list(data_train['Phrase'])

data_dev_sentiments_5 = list(data_dev['Sentiment'])
data_train_sentiments_5 = list(data_train['Sentiment'])

## Map to 3-value Sentiment Scale

In [3]:
def map_to_3_value(sentiments):
    value_scale = {
        0: 0,
        1: 0,
        2: 1,
        3: 2,
        4: 2,
    }
    
    return np.array([value_scale[sentiment] for sentiment in sentiments])

In [4]:
data_dev_sentiments_3 = map_to_3_value(data_dev_sentiments_5)
data_train_sentiments_3 = map_to_3_value(data_train_sentiments_5)

print(data_train_sentiments_5[:20])
print(data_train_sentiments_3[:20])

[1, 4, 1, 3, 1, 4, 1, 3, 1, 1, 1, 1, 4, 3, 3, 3, 3, 2, 1, 2]
[0 2 0 2 0 2 0 2 0 0 0 0 2 2 2 2 2 1 0 1]


## Text Pre-Processing

1. Tokenisation
2. Remove stop words

In [5]:
default_stop_words = {
    'a', 'ad', 'after', 'again', 'all', 'also', 'am', 'an', 'and', 'any',
    'are', 'as', 'at', 'be', 'because', 'been', 'being', 'between', 'both',
    'but', 'by', 'can', 'could', 'does', 'each', 'ed', 'eg', 'either', 'etc',
    'even', 'ever', 'every', 'for', 'from', 'had', 'has', 'have', 'he', 'her',
    'hers', 'herself', 'him', 'himself', 'his', 'i', 'ie', 'if', 'in', 'inc',
    'into', 'is', 'it', 'its', 'itself', 'li', 'll', 'ltd', 'may', 'maybe',
    'me', 'might', 'mine', 'minute', 'minutes', 'must', 'my', 'myself',
    'neither', 'nor', 'now', 'of', 'on', 'only', 'or', 'other', 'our', 'ours',
    'ourselves', 'own', 'same', 'seem', 'seemed', 'shall', 'she', 'some',
    'somehow', 'something', 'sometimes', 'somewhat', 'somewhere', 'spoiler',
    'spoilers', 'such', 'suppose', 'that', 'the', 'their', 'theirs', 'them',
    'themselves', 'there', 'these', 'they', 'this', 'those', 'thus', 'to',
    'today', 'tomorrow', 'us', 've', 'vs', 'was', 'we', 'were', 'what',
    'whatever', 'when', 'where', 'which', 'who', 'whom', 'whose', 'will',
    'with', 'yesterday', 'you', 'your', 'yours', 'yourself', 'yourselves'
}

### Unigram extraction from a phrase

In [6]:
def extract_unigrams(phrase):
    return [
        word.lower() for word in re.findall(r'\b[A-Za-z]{2,}\b', phrase)
        if word.lower() not in default_stop_words
    ]

### Create a vocabulary of unigrams

In [7]:
vocab = { unigram for phrase in data_train_phrases 
         for unigram in extract_unigrams(phrase) }

print(list(vocab)[:50])

['horribly', 'forgotten', 'pepper', 'threefold', 'streaks', 'bullet', 'housing', 'nurtured', 'parts', 'diverges', 'childish', 'ratliff', 'bohos', 'acceptable', 'trumpet', 'discomfort', 'conquer', 'gamely', 'surveillance', 'vengefulness', 'reeses', 'serious', 'nancy', 'sinuously', 'giants', 'most', 'geriatric', 'frazzled', 'hearty', 'cosby', 'singh', 'intoxication', 'indistinct', 'limb', 'luscious', 'exposes', 'cyber', 'details', 'subliminally', 'violinist', 'bronze', 'perkiness', 'space', 'groen', 'develop', 'laundry', 'william', 'degree', 'reasonable', 'stolid']


#### Create vocabulary id -> word and word -> id dictionaries for reference:

In [8]:
vocab_id_to_word = dict(enumerate(vocab))

word_to_vocab_id = {v: k for k, v in vocab_id_to_word.items()}

#### Extract unigrams for each phrase in development, train, and test dataset

In [9]:
data_dev_unigrams = [extract_unigrams(phrase) for phrase in data_dev_phrases]

data_train_unigrams = [extract_unigrams(phrase) for phrase in data_train_phrases]

data_test_unigrams = [extract_unigrams(phrase) for phrase in data_test_phrases]

## Vectorise phrases

Vectorise the dataset into an array with dimentionality $N \times |vocab|$, where $N$ is the number of phrases and $|vocab|$ is the size of the vocabulary

In [10]:
def vectorise(data_unigrams, vocab):
    vec = []
    
    for unigrams in data_unigrams:
        counter = Counter(unigrams)
        vec.append([counter[v] for v in vocab])
    
    return np.array(vec)

In [11]:
data_dev_vec = vectorise(data_dev_unigrams, vocab)

data_train_vec = vectorise(data_train_unigrams, vocab)

data_test_vec = vectorise(data_test_unigrams, vocab)

## Naive Bayes Classification

### Calculate the prior probability of each class

In [12]:
def calculate_prior_probability(sentiments):    
    counter = Counter(sentiments)

    return np.array([v for (_, v) in sorted(counter.items())]) / len(sentiments)

### Calculate the probability of each word in vocabulary

In [13]:
def calculate_word_probability(total_vocab, data_unigrams, sentiments):
    probs = np.zeros((total_vocab, len(set(sentiments))))
    
    for i, unigrams in enumerate(data_unigrams):
        for word, count in Counter(unigrams).items():
            probs[word_to_vocab_id[word]][sentiments[i]] += count

    # With Laplace smoothing
    return np.log10(probs[:] + 1) / (probs.sum(axis=0) + total_vocab)

#### 3-value Sentiment scale

In [14]:
data_train_3_prob = calculate_prior_probability(data_train_sentiments_3)
word_prob_3 = calculate_word_probability(len(vocab), data_train_unigrams, data_train_sentiments_3)

print(data_train_3_prob, '\n')
print(word_prob_3)

[0.38331784 0.19577633 0.42090583] 

[[1.10654774e-05 0.00000000e+00 6.45502296e-06]
 [1.80470163e-05 1.07399478e-05 1.29100459e-05]
 [0.00000000e+00 1.07399478e-05 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 1.29100459e-05]
 [6.98153893e-06 0.00000000e+00 0.00000000e+00]
 [1.10654774e-05 1.07399478e-05 1.02309693e-05]]


#### 5-value Sentiment scale

In [15]:
data_train_5_prob = calculate_prior_probability(data_train_sentiments_5)
word_prob_5 = calculate_word_probability(len(vocab), data_train_unigrams, data_train_sentiments_5)

print(data_train_5_prob, '\n')
print(word_prob_5)

[0.12471776 0.25860008 0.19577633 0.27068668 0.15021915] 

[[2.04185927e-05 0.00000000e+00 0.00000000e+00 8.54010031e-06
  0.00000000e+00]
 [1.28826976e-05 2.06942801e-05 1.07399478e-05 1.35357387e-05
  1.18464443e-05]
 [0.00000000e+00 0.00000000e+00 1.07399478e-05 0.00000000e+00
  0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 8.54010031e-06
  1.87761700e-05]
 [0.00000000e+00 8.91254132e-06 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [0.00000000e+00 1.41260438e-05 1.07399478e-05 8.54010031e-06
  1.18464443e-05]]


### Predict development sentiments

In [16]:
def predict_sentiments(prior_prob, word_prob, data_vec):
    return np.argmax(data_vec.dot(prior_prob * word_prob), axis=1)

#### 3-value Sentiment scale

In [17]:
pred_dev_sentiments_3 = predict_sentiments(data_train_3_prob, word_prob_3, data_dev_vec)

confusion_matrix(data_dev_sentiments_3, pred_dev_sentiments_3)

array([[253,   0, 133],
       [ 71,   0, 110],
       [ 53,   0, 380]])

#### 5-value Sentiment scale

In [18]:
pred_dev_sentiments_5 = predict_sentiments(data_train_5_prob, word_prob_5, data_dev_vec)

confusion_matrix(data_dev_sentiments_5, pred_dev_sentiments_5)

array([[  1,  95,   2,  35,   0],
       [  0, 171,   3,  79,   0],
       [  1,  85,   4,  91,   0],
       [  3,  65,   1, 214,   0],
       [  0,  20,   0, 129,   1]])

### Predict test sentiments

#### 3-value Sentiment scale

In [19]:
pred_test_sentiments_3 = predict_sentiments(data_train_3_prob, word_prob_3, data_test_vec)

pred_test_sentiments_3

array([2, 2, 2, ..., 2, 0, 0])

#### 5-value Sentiment scale

In [20]:
pred_test_sentiments_5 = predict_sentiments(data_train_5_prob, word_prob_5, data_test_vec)

pred_test_sentiments_5

array([3, 3, 1, ..., 3, 1, 1])