In [1]:
import sys

# Load the raw corpus: one review per line in reviews.txt and, in parallel,
# one label per line in labels.txt. Context managers guarantee the handles
# are closed even if reading fails.
with open('reviews.txt') as reviews_file:
    raw_reviews = reviews_file.readlines()

with open('labels.txt') as labels_file:
    raw_labels = labels_file.readlines()

# Each review is reduced to the *set* of its space-separated tokens, i.e. a
# bag of unique words (word order and counts are discarded).
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

# The vocabulary is every non-empty token seen in any review.
vocab = set()
for sentence in tokens:
    for word in sentence:
        if len(word) > 0:
            vocab.add(word)
# Sort before indexing: iteration order of a set of strings changes between
# interpreter runs (string hash randomisation), which would otherwise give
# every run a different word -> index mapping.
vocab = sorted(vocab)

# Map each vocabulary word to its position in `vocab`.
word_to_index = {word: i for i, word in enumerate(vocab)}

# Encode every review as the sorted list of vocabulary indices it contains.
# Only the empty token is absent from the vocabulary, so it is skipped
# explicitly instead of hiding lookups behind a bare `except`.
input_dataset = list()
for sentence in tokens:
    sentence_indices = sorted(
        word_to_index[word] for word in sentence if word in word_to_index
    )
    input_dataset.append(sentence_indices)

# Binary targets: 1 for a 'positive' label, 0 for anything else. Stripping
# whitespace keeps a final line without a trailing newline (or CRLF line
# endings) from being silently labelled 0.
target_dataset = [1 if label.strip() == 'positive' else 0 for label in raw_labels]

In [2]:
import numpy as np
# Seed NumPy's global random generator so the random draws made later in this cell repeat across runs.
np.random.seed(1)

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise via NumPy."""
    decay = np.exp(-x)
    return 1 / (decay + 1)

# Hyperparameters.
alpha = 0.01        # learning rate
iterations = 2      # number of passes over the training split
hidden_size = 100   # width of the hidden (embedding) layer
test_size = 1000    # trailing reviews held out for evaluation

# Everything before the last `test_size` reviews is used for training.
# Clamped at 0 so a dataset shorter than `test_size` cannot produce negative
# indices in the evaluation loop.
train_size = max(len(input_dataset) - test_size, 0)

# Weights drawn uniformly from [-0.1, 0.1).
weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

# Training: one weight update per review. `correct` / `total` are not reset
# between iterations, so the reported training accuracy is cumulative.
correct, total = (0, 0)
for iteration in range(iterations):
    for review in range(train_size):
        x, y = (input_dataset[review], target_dataset[review])
        # Summing the rows selected by the word indices is the embedding
        # lookup; the sigmoid is the hidden activation.
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))  # embed + activation
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))  # linear + sigmoid

        layer_2_delta = layer_2 - y  # prediction error against the label
        # NOTE(review): the hidden layer's sigmoid derivative is not applied
        # here; left unchanged so training results are not altered - confirm
        # whether that simplification is intended.
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)  # backpropagation

        # Only the embedding rows of the words present in this review move.
        weights_0_1[x] -= layer_1_delta * alpha
        # np.outer(layer_1, layer_2_delta) has shape (hidden_size, 1): each
        # hidden activation multiplied by the output error.
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        # A prediction counts as correct when it is within 0.5 of the label.
        if (np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if review % 10 == 9:
            # Format the percentage numerically rather than slicing
            # str(float), which misprints short or scientific-notation
            # values, and measure progress against the training split.
            progress = 100.0 * (review + 1) / train_size
            sys.stdout.write('\rIteration: ' + str(iteration)
                             + ' Progress: ' + '{:.2f}'.format(progress)
                             + '% Training Accuracy: ' + str(correct / float(total))
                             + '                    ')
    print()

# Evaluation on the held-out tail of the dataset (forward pass only).
correct, total = (0, 0)
for review in range(train_size, len(input_dataset)):
    x = input_dataset[review]
    y = target_dataset[review]

    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

    if (np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy: " + str(correct / float(total)))


Iteration: 0 Progress: 95.99 % Training Accuracy: 0.82975%                                
Iteration: 1 Progress: 95.99 % Training Accuracy: 0.8651041666666667%                    
Test Accuracy: 0.855


In [3]:
# Returns the words most similar to a target word according to our model,
# ranked by Euclidean distance between their embedding rows.
from collections import Counter
import math

def similar(target='beautiful', top_n=10):
    """Return the words whose embedding rows lie closest to `target`'s.

    Closeness is the Euclidean distance between rows of the module-level
    `weights_0_1` matrix, looked up through the module-level `word_to_index`.

    Args:
        target: Word to compare against; must be a key of `word_to_index`.
        top_n: Number of entries to return (default 10). `None` returns
            every word.

    Returns:
        A list of `(word, score)` tuples ordered from most to least similar,
        where `score` is the negated distance (so 0 is an exact match and
        more negative means further away). The target itself is included.

    Raises:
        KeyError: If `target` is not in `word_to_index`.
    """
    target_index = word_to_index[target]
    # One vectorised pass over the whole embedding matrix instead of a
    # Python-level loop and builtin sum() per word.
    distances = np.linalg.norm(weights_0_1 - weights_0_1[target_index], axis=1)
    # Negate so Counter.most_common ranks the smallest distances first;
    # tolist() yields plain Python floats for the returned scores.
    negated = (-distances).tolist()
    scores = Counter({word: negated[index] for word, index in word_to_index.items()})
    return scores.most_common(top_n)

In [12]:
# Show the nearest neighbours of 'boring' in the learned embedding space.
print(similar(target='boring'))

[('boring', -0.0), ('terrible', -0.7671911636829402), ('disappointment', -0.7977708386989999), ('annoying', -0.8036612381532257), ('lacks', -0.8057097372208817), ('badly', -0.8115734081147161), ('dull', -0.8148593342230555), ('worse', -0.8179886765104978), ('mess', -0.8215656327971523), ('horrible', -0.826356208046731)]
