# Sentiment analysis of movie revies

## Attempt 1 - Capturing word correclation in input data
 - Using one-hot-encoding

In [2]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the', 'cat', 'sat']
x = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]

print("Sent Encoding: " + str(x))


Sent Encoding: [1 1 0 1]


In [3]:
import numpy as np
import re
import pandas as pd

In [5]:
df = pd.read_csv("IMDB_Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Using methods found on
https://github.com/Akramz/grokking-deep-learning-notebooks/blob/master/11.NNs_that_Understand_Language.ipynb

In [7]:
import re
import numpy as np
import pandas as pd
from collections import Counter

df = pd.read_csv("IMDB_Dataset.csv")
df = df[df['sentiment'].isin(['negative', 'positive'])]
all_reviews_text = " ".join(df.review.tolist())

# get unique tokens
all_tokens = all_reviews_text.split(" ")
unique_tokens = [v for (v, _) in Counter(all_tokens).most_common(10000)]
len(all_tokens), len(unique_tokens)

(11557297, 10000)

In [9]:
# function to get unique tokens
def get_tokens(text):
    return list(set(text.split(" ")))

# one-hot representation of tokens
word_to_index, index_to_word = {}, {}
for i,word in enumerate(unique_tokens):
    word_to_index[word], index_to_word[i] = i, word
    
df['words_count'] = df['review'].apply(lambda x: len(x.split(" ")))

In [10]:
df.describe()

Unnamed: 0,words_count
count,50000.0
mean,231.14594
std,171.326419
min,4.0
25%,126.0
50%,173.0
75%,280.0
max,2470.0


In [17]:
test_idx = int(len(df) * (1-0.2))
train, test = df.iloc[:test_idx], df.iloc[test_idx:]
train.shape, test.shape

((40000, 3), (10000, 3))

In [18]:
# delete columns not interested in
train = train.drop(columns=['words_count'])

# transform label into numbers 1 and 0 instead of 'positive' and 'negative'
train['y'] = train['sentiment'].replace({'negative': 0, 'positive': 1})
train = train.drop('sentiment', axis=1)

In [19]:
# shuffle train
train = train.sample(frac=1).reset_index(drop=True)

In [20]:
x, y = [], []
for _, r in train.iterrows():
    review, label = r['review'], r['y']
    one_hot = np.zeros(10000)
    tokens = get_tokens(review)
    for token in tokens:
        if token in word_to_index:
            one_hot[word_to_index[token]] = int(1)
    x.append(one_hot)
    y.append(label)
    
x, y = np.array(x), np.array(y)
x.shape, y.shape

((40000, 10000), (40000,))

# Back to grokking

In [1]:
# Convert dataframe into list to later convert into text file
import pandas as pd

df = pd.read_csv('IMDB_Dataset.csv', encoding='ISO-8859-1')

review_list = list(df['review'])
review_txt = '\n'.join(str(i) for i in review_list)


In [3]:
reviews_file = open('reviews.txt', 'w', encoding='utf-8')
reviews_file.write(review_txt)
reviews_file.close()

In [4]:
labels_list = list(df['sentiment'])
labels_txt = '\n'.join(str(i) for i in labels_list)

labels_file = open('labels.txt', 'w', encoding='utf-8')
labels_file.write(labels_txt)
labels_file.close()

In [5]:
# Organize data so input and outputs are one-hot encoded
import sys

f = open('reviews.txt', encoding='utf-8')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt', encoding='utf-8')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word) > 0):
            vocab.add(word)       
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))
    
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [6]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1/(1+np.exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

correct, total = (0,0)
for iter in range(iterations):
    
    for i in range(len(input_dataset) - 1000):
        
        x,y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter: ' + str(iter) + \
                             ' Progress: ' + progress[2:4] + \
                             '.' + progress[4:6] + \
                             '% Training Accuracy: ' + str(correct/float(total)) + '%')
            
    print()

correct, total = (0,0)
for i in range(len(input_dataset) - 1000, len(input_dataset)):
    
    x = input_dataset[i]
    y = target_dataset[i]
    
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
    
    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1

print("Test Accuracy: " + str(correct/float(total)))

Iter: 0 Progress: 97.99% Training Accuracy: 0.8373061224489796%%
Iter: 1 Progress: 97.99% Training Accuracy: 0.871969387755102%%
Test Accuracy: 0.878


In [7]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)

In [8]:
print(similar('beautiful'))

[('beautiful', -0.0), ('brilliant', -0.7376469448980678), ('flaws', -0.7426303919054603), ('seen', -0.7513922646377912), ('images', -0.7545795362360261), ('touched', -0.7584021301887172), ('Very', -0.7586325897655651), ('greatest', -0.7630659090876402), ('human', -0.7689779874584327), ('entertaining.', -0.7749484374965475)]


In [9]:
print(similar('terrible'))

[('terrible', -0.0), ('dull', -0.7525054527332602), ('poor', -0.7771535579800782), ('wasted', -0.8032560501151421), ('terrible.', -0.8034405695879778), ('boring', -0.8096819951667252), ('save', -0.8188097348448568), ('annoying', -0.8282798200104838), ('lacks', -0.834074802624816), ('awful', -0.834399468282829)]


# Filling in blanks
 - Method: take a 5-word phrase, remove 1 word (the focus term) and train network to figure out identity of missing word.
 - Using negative sampling to make network train faster

In [11]:
import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)

f = open('reviews.txt', encoding='utf-8')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab), hidden_size)*0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]
    
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]] + list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        
        left_context = review[max(0, target_i - window):target_i]
        right_context = review[target_i + 1:min(len(review), target_i + window)]
        
        layer_1 = np.mean(weights_0_1[left_context + right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context + right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha
        
    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress: ' + str(rev_i / float(len(input_dataset) * iterations)) + "   " + str(similar('terrible')))
    sys.stdout.write('\rProgress: ' + str(rev_i / float(len(input_dataset) * iterations)))
    
print(similar('terrible'))

Progress: 0.99999  [('terrible', -0.0), ('horrible', -2.775671217230461), ('dreadful', -3.203804975766528), ('fantastic', -3.580835224666613), ('lousy', -3.7011184386160787), ('pathetic', -3.769506362292975), ('lame', -3.7961110393621946), ('horrendous', -3.8040842281765364), ('laughable', -4.001209449364611), ('ridiculous', -4.117619247044518)]]]6)]]1907)][('terrible', -0.0), ('horrible', -2.911432471751861), ('dreadful', -3.1629894537686227), ('fantastic', -3.5388019083545452), ('lousy', -3.6894634746740054), ('pathetic', -3.8112367548603445), ('horrendous', -3.831434468030712), ('lame', -3.8588036872828066), ('laughable', -4.117263531810926), ('brilliant', -4.148704707106869)]


In [13]:
print(similar('beautiful'))

[('beautiful', -0.0), ('gorgeous', -3.5970900580233685), ('charming', -3.795812117164108), ('brave', -3.8063765666782885), ('lively', -3.8711378796818425), ('lovely', -3.933317596516982), ('courageous', -3.9400974521465657), ('lovable', -3.9484302618013523), ('seductive', -3.9813009841460403), ('powerful', -3.981643660175518)]


In [14]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [15]:
analogy(['terrible', 'good'], ['bad'])

[('superb', -262.43094205338565),
 ('fine', -262.47434201116715),
 ('good', -262.54619821506054),
 ('nice', -262.6855968021468),
 ('terrific', -262.7253012989536),
 ('terrible', -262.78904062616385),
 ('fantastic', -262.81306254530006),
 ('great', -262.87349007871916),
 ('horrible', -262.9607896948791)]

In [18]:
analogy(['king', 'woman'], ['man'])

[('fact', -211.2211832325717),
 ('woman', -211.2846870762818),
 ('leader', -211.28943223150745),
 ('Aside', -211.3793761037257),
 ('guy', -211.4136170471712),
 ('tale', -211.5977222820525),
 ('essence', -211.6302082032072),
 ('realm', -211.65907507067826),
 ('notion', -211.66776224649695)]