In [130]:
import sys
import time
import numpy as np
from collections import Counter

In [131]:
# Read the reviews and their sentiment labels (one entry per line, aligned by
# index). The files are opened with `with` so the handles are closed as soon as
# the data is read, and the results are materialised as lists so they can be
# indexed, sliced and passed to len() on Python 3 too, where map() would return
# a one-shot iterator.
with open('reviews.txt', 'r') as reviews_file:
    # Strip only the line terminator: slicing with [:-1] would eat a real
    # character if the last line of the file has no trailing newline.
    reviews = [line.rstrip('\n') for line in reviews_file]

with open('labels.txt', 'r') as labels_file:
    labels = [line.rstrip('\n') for line in labels_file]

print('This is the first review: {}\n'.format(reviews[0]))
print('This is the associated label: {}'.format(labels[0]))

This is the first review: bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   

This is the associated label: positive


In [132]:
# Frequency tallies: every word overall, every label, and every word split by
# the sentiment of the review it occurred in.
total_words = Counter()
total_labels = Counter()
positive_words = Counter()
negative_words = Counter()

for idx, review in enumerate(reviews):
    sentiment = labels[idx]
    total_labels[sentiment] += 1
    # Any label other than 'positive' is tallied on the negative side.
    per_sentiment = positive_words if sentiment == 'positive' else negative_words
    for token in review.split(" "):
        total_words[token] += 1
        per_sentiment[token] += 1


In [133]:
vocab = set(total_words)
label_vocab = set(total_labels)
vocab_size, label_vocab_size = len(vocab), len(label_vocab)

pn_ratios = Counter()

# Raw positive-to-negative usage ratio, kept only for words seen at least 100
# times; rarer words are skipped. The +1 in the denominator keeps the division
# defined for words that never occur in a negative review.
for word, count in total_words.most_common():
    if count >= 100:
        pn_ratios[word] = positive_words[word] / float(negative_words[word] + 1)

# Move the ratios onto a log scale. Ratios above 1 are logged directly; ratios
# of 1 or below get a 0.01 offset first, which keeps the logarithm finite when
# the ratio is exactly 0 (a word never seen in a positive review).
for word, ratio in pn_ratios.most_common():
    pn_ratios[word] = np.log(ratio) if ratio > 1 else np.log(ratio + 0.01)

In [134]:
# Positive words have a higher ratio and negative words a lower one, so the
# 30 highest-scoring entries come first.
pn_ratios.most_common(30)

[('edie', 4.6913478822291435),
 ('paulie', 4.0775374439057197),
 ('felix', 3.1527360223636558),
 ('polanski', 2.8233610476132043),
 ('matthau', 2.8067217286092401),
 ('victoria', 2.6810215287142909),
 ('mildred', 2.6026896854443837),
 ('gandhi', 2.5389738710582761),
 ('flawless', 2.451005098112319),
 ('superbly', 2.2600254785752498),
 ('perfection', 2.1594842493533721),
 ('astaire', 2.1400661634962708),
 ('captures', 2.0386195471595809),
 ('voight', 2.0301704926730531),
 ('wonderfully', 2.0218960560332353),
 ('powell', 1.9783454248084671),
 ('brosnan', 1.9547990964725592),
 ('lily', 1.9203768470501485),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('refreshing', 1.8551812956655511),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('lemmon', 1.8458266904983307),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('beautifully', 1.7626953362841438),
 ('socc

In [135]:
# The 30 lowest-scoring entries, lowest first (a reverse slice from the end of the ranking).
pn_ratios.most_common()[:-31:-1]

[('boll', -4.0778152602708904),
 ('uwe', -3.9218753018711574),
 ('seagal', -3.3202501058581921),
 ('unwatchable', -3.0269848170580955),
 ('stinker', -2.9876839403711624),
 ('mst', -2.7753833211707972),
 ('incoherent', -2.7641396677532537),
 ('unfunny', -2.5545257844967644),
 ('waste', -2.4907515123361046),
 ('blah', -2.4475792789485),
 ('horrid', -2.3715779644809971),
 ('pointless', -2.3451073877136341),
 ('atrocious', -2.3187369339642556),
 ('redeeming', -2.2667790015910296),
 ('prom', -2.2601040980178784),
 ('drivel', -2.2476029585766928),
 ('lousy', -2.2118080125207054),
 ('worst', -2.1930856334332267),
 ('laughable', -2.172468615469592),
 ('awful', -2.1385076866397488),
 ('poorly', -2.1326133844207011),
 ('wasting', -2.1178155545614512),
 ('remotely', -2.111046881095167),
 ('existent', -2.0024805005437076),
 ('boredom', -1.9241486572738005),
 ('miserably', -1.9216610938019989),
 ('sucks', -1.9166645809588516),
 ('uninspired', -1.9131499212248517),
 ('lame', -1.9117232884159072),
 (

In [136]:
# One row with one column per vocabulary word, all initially zero.
input_layer = np.zeros(shape=(1, len(vocab)))

In [137]:
# Give every vocabulary word and every label an integer index. The indices
# follow the iteration order of the `vocab` and `label_vocab` sets.
word2index = {word: position for position, word in enumerate(vocab)}
label2index = {label: position for position, label in enumerate(label_vocab)}

In [138]:
def update_input_layer(review):
    """
    Reset the notebook-level ``input_layer`` and fill it with the word counts of ``review``.

    The input layer is reused for every review, so it is first zeroed in place;
    then the column of each space-separated word is incremented once per
    occurrence. Words that are missing from ``word2index`` are skipped.

    :param review: review text whose words are separated by single spaces
    :return: None; ``input_layer`` is modified in place
    """
    global input_layer

    # In-place multiply clears the previous review without reallocating the array.
    input_layer *= 0
    for word in review.split(" "):
        index = word2index.get(word)
        if index is not None:
            input_layer[0][index] += 1

In [139]:
def get_number_for_label(label):
    """Map a sentiment label to its numeric target: 1 for 'positive', 0 for anything else."""
    if label == 'positive':
        return 1
    return 0

In [146]:
class SentimentAnalysisNetwork:
    """
    Small feed-forward network for review sentiment: a linear hidden layer
    followed by a sigmoid output layer.

    The class depends on two notebook-level globals that must be defined
    before it is used: ``vocab_size`` (number of input nodes) and
    ``word2index`` (word -> input column).
    """

    def __init__(self, reviews, labels, output_nodes=1, hidden_nodes = 10, learning_rate = 0.1):
        """
        Initialize the network by specifying the number of nodes and LR.

        :param reviews: accepted for interface compatibility; not read here
        :param labels: accepted for interface compatibility; not read here
        :param output_nodes: number of output nodes
        :param hidden_nodes: number of hidden nodes
        :param learning_rate: step size applied to both weight updates
        """
        self.input_nodes = vocab_size
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        self.input_layer = np.zeros((1, self.input_nodes))
        # Input-to-hidden weights start at zero; hidden-to-output weights are
        # drawn from a normal distribution.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(
            0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes)
        )

    def update_input_layer(self, review):
        """
        Clear the input layer, then mark every known word of ``review`` with 1.

        Presence is binary: a word that occurs several times still yields 1.
        Words that are not in ``word2index`` are ignored.
        """
        self.input_layer *= 0
        for word in review.split(" "):
            # Direct dict lookup; scanning word2index.keys() is a linear
            # search over the whole vocabulary on Python 2.
            index = word2index.get(word)
            if index is not None:
                self.input_layer[0][index] = 1

    def get_target_for_label(self, label):
        """Return the training target: 1 for 'positive', 0 for any other label."""
        return 1 if label == 'positive' else 0

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x)."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Derivative of the sigmoid expressed in terms of its own output."""
        return output * (1 - output)

    def _forward(self):
        """Run the forward pass on the current input layer; return (hidden, output)."""
        hidden_layer = self.input_layer.dot(self.weights_0_1)
        output_layer = self.sigmoid(hidden_layer.dot(self.weights_1_2))
        return hidden_layer, output_layer

    @staticmethod
    def _reviews_per_second(processed, start):
        """Throughput since ``start``; 0.0 when no measurable time has elapsed."""
        elapsed = time.time() - start
        return processed / float(elapsed) if elapsed > 0 else 0.0

    def train(self, training_reviews, training_labels):
        """
        Train on the reviews one at a time, in order, with gradient descent.

        Progress and running training accuracy are written to stdout.

        :param training_reviews: sequence of review strings
        :param training_labels: sequence of labels, same length as the reviews
        :raises AssertionError: if the two sequences differ in length
        """
        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0

        start = time.time()

        for i in range(len(training_reviews)):

            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###
            self.update_input_layer(review)
            hidden_layer, output_layer = self._forward()

            ### Backward pass ###

            # Output error: difference between the actual output and the desired target.
            output_layer_error = output_layer - self.get_target_for_label(label)
            output_layer_delta = output_layer_error * self.sigmoid_output_2_derivative(output_layer)

            # Backpropagated error. The hidden layer has no nonlinearity, so
            # its gradient is the propagated error itself.
            hidden_layer_error = output_layer_delta.dot(self.weights_1_2.T)
            hidden_layer_delta = hidden_layer_error

            # Gradient descent step on both weight matrices.
            self.weights_1_2 -= hidden_layer.T.dot(output_layer_delta) * self.learning_rate
            self.weights_0_1 -= self.input_layer.T.dot(hidden_layer_delta) * self.learning_rate

            # An error below 0.5 means the output fell on the target's side of 0.5.
            if(np.abs(output_layer_error) < 0.5):
                correct_so_far += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            # Start a new line every 2500 reviews so earlier progress stays visible.
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Predict every review and report the running accuracy on stdout.

        :param testing_reviews: sequence of review strings
        :param testing_labels: sequence of expected labels
        """
        correct = 0

        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """
        Classify a single review.

        The review is lower-cased before lookup. Only the first output node
        is consulted.

        :return: 'positive' if the output exceeds 0.5, otherwise 'negative'
        """
        self.update_input_layer(review.lower())
        _, output_layer = self._forward()
        # Index down to a scalar so the comparison yields a plain boolean.
        return 'positive' if output_layer[0][0] > 0.5 else 'negative'

In [147]:
# Build the network. The slices passed in exclude the last 1000 reviews and labels.
san = SentimentAnalysisNetwork(reviews[:-1000],labels[:-1000])

In [148]:
#san.test(reviews[-1000:],labels[-1000:])

In [149]:
# Train on every review except the last 1000, which are kept out of training.
san.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):0.853 #Correct:1945 #Trained:2501 Training Accuracy:77.7%
Progress:20.8% Speed(reviews/sec):0.207 #Correct:3994 #Trained:5001 Training Accuracy:79.8%
Progress:31.2% Speed(reviews/sec):0.274 #Correct:6116 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):0.329 #Correct:8272 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):0.334 #Correct:10419 #Trained:12501 Training Accuracy:83.3%
Progress:62.5% Speed(reviews/sec):0.228 #Correct:12561 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):0.251 #Correct:14681 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):0.271 #Correct:16862 #Trained:20001 Training Accuracy:84.3%
Progress:93.7% Speed(reviews/sec):0.292 #Correct:19043 #Trained:22501 Training Accuracy:84.6%
Progress:99.9% Speed(reviews/sec):0.304 #Correct:20365 #Trained:24000 Training 

In [150]:
# Potential pitfalls of the network - go through the nanodegree videos again!

# 1. Most vocab words do not appear in any given review. For the review 'this movie is bad', for example,
# the word 'good' has a count of 0, so when it is multiplied by its weights it contributes nothing.
# We can therefore skip every word whose count is 0, which reduces the number of calculations significantly.

# 2. You could also use NLP to find the words of potential interest in a review, such as verbs
# and adjectives, and ignore names, pronouns and prepositions.

# 3. Don't feed in the number of times a word occurs in a review; just record whether the word appears
# at all, so the input value for any word present in a review is always 1.

In [151]:
# Evaluate on the final 200 reviews, which lie inside the 1000 excluded from the training slices.
san.test(reviews[-200:],labels[-200:])

Progress:99.5% Speed(reviews/sec):0.805% #Correct:173 #Tested:200 Testing Accuracy:86.5%