# Sentiment Analysis

#### Import Libraries

In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import time
import sys
np.random.seed(1)

In [None]:
# Load one review / one label per line.
# `with` guarantees the files are closed even if reading fails, and
# rstrip('\n') removes only the line terminator (slicing with [:-1] would
# chop a real character off a final line that has no trailing newline).
with open('reviews.txt', 'r') as file:
    reviews = [line.rstrip('\n') for line in file]

# NOTE(review): later cells compare labels against the upper-case string
# 'POSITIVE' -- confirm that labels.txt stores them in that exact form.
with open('labels.txt', 'r') as file:
    labels = [line.rstrip('\n') for line in file]

In [None]:
# Pair each review with its label in a two-column DataFrame ('review', 'label').
df = pd.DataFrame({'review': reviews, 'label': labels})

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Class balance. The column created above is named 'label'
# (df['labels'] raised a KeyError).
label_counts = df['label'].value_counts()
label_counts.plot(kind='barh', figsize=(15,2));
label_counts

#### Count of words in each review

In [None]:
# Three independent tallies: words seen in POSITIVE reviews, words seen in
# all other reviews, and words seen anywhere.
positive_words_count, negative_words_count, total_words_count = (
    Counter(),
    Counter(),
    Counter(),
)

In [None]:
# Tally every space-separated token per sentiment class and overall.
# Reviews and labels are walked in lockstep; the previous `df['label'][num-1]`
# lookup paired each review with the label of the row before it (and raised
# a KeyError for the first row).
for review, label in zip(df['review'], df['label']):
    tokens = review.split(" ")
    if label == 'POSITIVE':
        positive_words_count.update(tokens)
    else:
        negative_words_count.update(tokens)
    total_words_count.update(tokens)

In [None]:
# A Counter cannot be sliced; most_common(10) returns the 10 most frequent
# (word, count) pairs.
positive_words_count.most_common(10)

In [None]:
# A Counter cannot be sliced; most_common(10) returns the 10 most frequent
# (word, count) pairs.
negative_words_count.most_common(10)

In [None]:
# A Counter cannot be sliced; most_common(10) returns the 10 most frequent
# (word, count) pairs.
total_words_count.most_common(10)

#### Calculating Positive Negative Ratio for the most Repeating Words

In [None]:
# Positive/negative usage ratio for words that occur more than `margin` times.
positive_negative_ratios = Counter()
margin = 100

for word, count in total_words_count.most_common():
    if count > margin:
        # The +1 must be part of the denominator to avoid dividing by zero;
        # without parentheses it was added to the quotient instead.
        ratio = positive_words_count[word] / float(negative_words_count[word] + 1)
        positive_negative_ratios[word] = ratio

#### Getting the Log of the Ratios

In [None]:
# Replace each ratio with a log-scaled value so that positive-leaning and
# negative-leaning words end up on opposite sides of zero.
# Iterating the Counter directly yields keys only, so the (word, ratio)
# pairs come from most_common(); that call returns a list copy, which makes
# overwriting the values during the loop safe.
# NOTE: running this cell twice applies the transform twice.
for word, ratio in positive_negative_ratios.most_common():
    if ratio > 1:
        positive_negative_ratios[word] = np.log(ratio)
    else:
        positive_negative_ratios[word] = -np.log(1/(ratio+0.001)) # Avoiding Zero Division

#### Most Common Positive Words

In [None]:
# Most Common Positive Words
# most_common() lists (word, value) pairs from the highest stored value down.
positive_negative_ratios.most_common()

#### Most Common Negative Words

In [None]:
# Most Common Negative Words
# reversed() alone returns a lazy iterator that the notebook shows as an
# opaque object; materialize it to display the pairs, lowest value first.
list(reversed(positive_negative_ratios.most_common()))

### Transform Text into Numbers

In [None]:
# Vocabulary: the keys of the overall word tally (a view onto the Counter).
words = total_words_count.keys()
# Number of distinct tokens, later used as the width of the input layer.
size = len(words)
size

In [3]:
# Map every vocabulary word to its position in the input vector.
words_index = {token: position for position, token in enumerate(words)}

In [5]:
# One row with one slot per vocabulary word, all zeros to start with.
input_layer = np.zeros((1,size))
input_layer

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

#### Building the Input Layer

In [None]:
def update_input(review):
    """Fill the global ``input_layer`` with the word counts of ``review``.

    The layer is zeroed in place, then the slot of every space-separated
    token (looked up in the global ``words_index``) is incremented once per
    occurrence. Returns the same global array.
    """
    global input_layer

    # Reset whatever the previous review left behind.
    input_layer *= 0

    counts_row = input_layer[0]
    for token in review.split(" "):
        counts_row[words_index[token]] += 1
    return input_layer

In [None]:
# Encode the first review into the global input layer and display it.
update_input(df['review'][0])

### Building the Sentiment Analysis Neural Network

In [None]:
class SentimentNetwork:
    """Bag-of-words sentiment classifier with one linear hidden layer.

    The input layer has one node per vocabulary word and is filled with the
    word counts of a review. The hidden layer is linear and the single
    output node uses a sigmoid. The label ``"POSITIVE"`` is trained towards
    1, any other label towards 0.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.01):
        """Build the vocabulary from ``reviews``/``labels`` and allocate weights."""
        self.pre_process(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups."""
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(" "))
        self.review_vocab = list(review_vocab)

        # Collect the labels themselves (the old loop added a stale `word`).
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # The lookups live on the instance, not in notebook globals.
        self.words_index = {word: index for index, word in enumerate(self.review_vocab)}
        self.labels_index = {label: index for index, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Allocate the weight matrices and the reusable input layer."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))
        self.learning_rate = learning_rate

        self.input_layer = np.zeros((1, input_nodes))

    def update_input(self, review):
        """Write the word counts of ``review`` into ``self.input_layer``.

        Words that are not in the vocabulary are ignored, so reviews that
        were not part of the training data can be encoded as well.
        """
        # Clear previous inputs
        self.input_layer *= 0

        for word in review.split(" "):
            index = self.words_index.get(word)
            if index is not None:
                self.input_layer[0][index] += 1
        return self.input_layer

    # Alias for the name train()/run() originally called.
    update_input_layer = update_input

    def get_target_for_label(self, label):
        """Return the training target: 1 for ``"POSITIVE"``, otherwise 0."""
        return 1 if label == "POSITIVE" else 0

    def sigmoid(self, x):
        """Logistic function."""
        return 1 / (1+np.exp(-x))

    def sigmoid_output_2_dervitive(self, ouput):
        """Derivative of the sigmoid, expressed through the sigmoid's output."""
        return ouput * (1 - ouput)

    def _report_progress(self, done, total, correct, start_time):
        """Overwrite the current console line with progress, speed and accuracy."""
        elapsed = time.time() - start_time
        # The clock may not have advanced yet on the first samples.
        speed = done / elapsed if elapsed > 0 else 0.0
        sys.stdout.write(
            "\rProgress: {:.1f}% Speed(review/sec): {:.1f} Correct: {} Accuracy: {:.1f}%".format(
                100.0 * done / total, speed, correct, 100.0 * correct / done))

    def train(self, training_reviews, training_labels):
        """Run one pass of per-sample gradient descent over the given data."""
        assert(len(training_reviews)==len(training_labels))

        correct = 0
        total = len(training_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(training_reviews, training_labels)):
            # Forward pass
            self.update_input(review)
            layer_1 = self.input_layer.dot(self.weights_0_1)
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            # Backward pass
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_dervitive(layer_2)

            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error  # hidden layer is linear: gradient passes through

            # Step against the gradient; plain assignment would throw away
            # everything learned from earlier samples.
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate
            self.weights_0_1 -= self.input_layer.T.dot(layer_1_delta) * self.learning_rate

            if np.abs(layer_2_error[0][0]) < 0.5:
                correct += 1

            self._report_progress(num + 1, total, correct, start_time)

    def test(self, testing_reviews, testing_labels):
        """Predict every review, print progress and return the accuracy (0..1)."""
        correct = 0
        total = len(testing_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(testing_reviews, testing_labels)):
            if self.run(review) == label:
                correct += 1
            self._report_progress(num + 1, total, correct, start_time)

        return correct / total if total else 0.0

    def run(self, review):
        """Return ``"POSITIVE"`` or ``"NEGATIVE"`` for a single review."""
        self.update_input(review.lower())
        layer_1 = self.input_layer.dot(self.weights_0_1)
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

        # Targets are 1 for POSITIVE and 0 otherwise, so threshold at 0.5.
        if layer_2[0][0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [None]:
# Build a network from all but the last 1000 reviews, with learning rate 0.1.
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], learning_rate=0.1)

In [None]:
# Evaluate on the last 1000 reviews; this cell comes before any training cell.
classifier.test(reviews[-1000:], labels[-1000:])

In [None]:
# Train on all but the last 1000 reviews.
classifier.train(reviews[:-1000], labels[:-1000])

In [None]:
# Start over with a fresh network and a smaller learning rate (0.01).
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], learning_rate=0.01)

In [None]:
# Train on all but the last 1000 reviews.
classifier.train(reviews[:-1000], labels[:-1000])

In [None]:
# Start over once more with learning rate 0.001.
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], learning_rate=0.001)

In [None]:
# Train on all but the last 1000 reviews.
classifier.train(reviews[:-1000], labels[:-1000])

#### Removing Noise
Noise here means whitespace and other uninformative tokens that show up in the raw word counts.

In [None]:
# Tally for the tokens of a single review.
review_counter = Counter()

In [None]:
# Count every space-separated token of the first review
# (`spllit` was a typo for `split` and raised AttributeError).
review_counter.update(reviews[0].split(" "))

In [None]:
# Show the tokens of the first review, most frequent first.
review_counter.most_common()

#### Reducing Noise & Editing Classifier

In [None]:
class SentimentNetwork:
    """Bag-of-words sentiment classifier with binary word-presence inputs.

    Same architecture as the count-based network (linear hidden layer,
    sigmoid output, ``"POSITIVE"`` trained towards 1 and any other label
    towards 0), but ``update_input`` only records whether a word occurs.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.01):
        """Build the vocabulary from ``reviews``/``labels`` and allocate weights."""
        self.pre_process(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups."""
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(" "))
        self.review_vocab = list(review_vocab)

        # Collect the labels themselves (the old loop added a stale `word`).
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # The lookups live on the instance, not in notebook globals.
        self.words_index = {word: index for index, word in enumerate(self.review_vocab)}
        self.labels_index = {label: index for index, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Allocate the weight matrices and the reusable input layer."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))
        self.learning_rate = learning_rate

        self.input_layer = np.zeros((1, input_nodes))

    def update_input(self, review):
        """Mark in ``self.input_layer`` which vocabulary words ``review`` contains.

        This is the change in this version: a word's slot is set to 1 when
        the word is present, instead of accumulating its count, so tokens
        that merely repeat a lot (spaces, filler characters) no longer
        dominate the input. Words outside the vocabulary are ignored.
        """
        # Clear previous inputs
        self.input_layer *= 0

        for word in review.split(" "):
            index = self.words_index.get(word)
            if index is not None:
                self.input_layer[0][index] = 1  # presence flag, not a count
        return self.input_layer

    # Alias for the name train()/run() originally called.
    update_input_layer = update_input

    def get_target_for_label(self, label):
        """Return the training target: 1 for ``"POSITIVE"``, otherwise 0."""
        return 1 if label == "POSITIVE" else 0

    def sigmoid(self, x):
        """Logistic function."""
        return 1 / (1+np.exp(-x))

    def sigmoid_output_2_dervitive(self, ouput):
        """Derivative of the sigmoid, expressed through the sigmoid's output."""
        return ouput * (1 - ouput)

    def _report_progress(self, done, total, correct, start_time):
        """Overwrite the current console line with progress, speed and accuracy."""
        elapsed = time.time() - start_time
        # The clock may not have advanced yet on the first samples.
        speed = done / elapsed if elapsed > 0 else 0.0
        sys.stdout.write(
            "\rProgress: {:.1f}% Speed(review/sec): {:.1f} Correct: {} Accuracy: {:.1f}%".format(
                100.0 * done / total, speed, correct, 100.0 * correct / done))

    def train(self, training_reviews, training_labels):
        """Run one pass of per-sample gradient descent over the given data."""
        assert(len(training_reviews)==len(training_labels))

        correct = 0
        total = len(training_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(training_reviews, training_labels)):
            # Forward pass
            self.update_input(review)
            layer_1 = self.input_layer.dot(self.weights_0_1)
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            # Backward pass
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_dervitive(layer_2)

            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error  # hidden layer is linear: gradient passes through

            # Step against the gradient; plain assignment would throw away
            # everything learned from earlier samples.
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate
            self.weights_0_1 -= self.input_layer.T.dot(layer_1_delta) * self.learning_rate

            if np.abs(layer_2_error[0][0]) < 0.5:
                correct += 1

            self._report_progress(num + 1, total, correct, start_time)

    def test(self, testing_reviews, testing_labels):
        """Predict every review, print progress and return the accuracy (0..1)."""
        correct = 0
        total = len(testing_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(testing_reviews, testing_labels)):
            if self.run(review) == label:
                correct += 1
            self._report_progress(num + 1, total, correct, start_time)

        return correct / total if total else 0.0

    def run(self, review):
        """Return ``"POSITIVE"`` or ``"NEGATIVE"`` for a single review."""
        self.update_input(review.lower())
        layer_1 = self.input_layer.dot(self.weights_0_1)
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

        # Targets are 1 for POSITIVE and 0 otherwise, so threshold at 0.5.
        if layer_2[0][0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [None]:
# Build a network from all but the last 1000 reviews, with learning rate 0.1.
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], learning_rate=0.1)

In [None]:
# Train on all but the last 1000 reviews.
classifier.train(reviews[:-1000], labels[:-1000])

In [None]:
# Evaluate on the 1000 reviews that were held out of training.
classifier.test(reviews[-1000:], labels[-1000:])

### Analyzing Inefficiencies in the Network

In [None]:
class SentimentNetwork:
    """Bag-of-words sentiment classifier that skips the zero inputs.

    Same model as the binary-input network (linear hidden layer, sigmoid
    output, ``"POSITIVE"`` trained towards 1 and any other label towards 0).
    The change in this version: instead of multiplying a mostly-zero input
    vector by the full weight matrix, ``train`` and ``run`` only touch the
    weight rows of the words that actually occur in a review.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.01):
        """Build the vocabulary from ``reviews``/``labels`` and allocate weights."""
        self.pre_process(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups."""
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(" "))
        self.review_vocab = list(review_vocab)

        # Collect the labels themselves (the old loop added a stale `word`).
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # The lookups live on the instance, not in notebook globals.
        self.words_index = {word: index for index, word in enumerate(self.review_vocab)}
        self.labels_index = {label: index for index, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Allocate the weight matrices and the reusable layers."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))
        self.learning_rate = learning_rate

        self.input_layer = np.zeros((1, input_nodes))
        # Hidden layer buffer that train()/run() refill for every review.
        self.layer_1 = np.zeros((1, hidden_nodes))

    def update_input(self, review):
        """Mark in ``self.input_layer`` which vocabulary words ``review`` contains.

        Words outside the vocabulary are ignored.
        """
        # Clear previous inputs
        self.input_layer *= 0

        for word in review.split(" "):
            index = self.words_index.get(word)
            if index is not None:
                self.input_layer[0][index] = 1  # presence flag, not a count
        return self.input_layer

    # Alias for the name run() originally called.
    update_input_layer = update_input

    def get_target_for_label(self, label):
        """Return the training target: 1 for ``"POSITIVE"``, otherwise 0."""
        return 1 if label == "POSITIVE" else 0

    def sigmoid(self, x):
        """Logistic function."""
        return 1 / (1+np.exp(-x))

    def sigmoid_output_2_dervitive(self, ouput):
        """Derivative of the sigmoid, expressed through the sigmoid's output."""
        return ouput * (1 - ouput)

    def _review_indices(self, review):
        """Return the distinct vocabulary indices of the words in ``review``."""
        indices = set()
        for word in review.split(" "):
            index = self.words_index.get(word)
            if index is not None:
                indices.add(index)
        return list(indices)

    def _forward(self, indices):
        """Fill ``self.layer_1`` from the given word indices; return the output layer."""
        self.layer_1 *= 0
        for index in indices:
            # Adding a weight row equals multiplying by an input of 1.
            self.layer_1 += self.weights_0_1[index]
        return self.sigmoid(self.layer_1.dot(self.weights_1_2))

    def _report_progress(self, done, total, correct, start_time):
        """Overwrite the current console line with progress, speed and accuracy."""
        elapsed = time.time() - start_time
        # The clock may not have advanced yet on the first samples.
        speed = done / elapsed if elapsed > 0 else 0.0
        sys.stdout.write(
            "\rProgress: {:.1f}% Speed(review/sec): {:.1f} Correct: {} Accuracy: {:.1f}%".format(
                100.0 * done / total, speed, correct, 100.0 * correct / done))

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-sample gradient descent over the given data."""
        # One index list per review (the append used to sit inside the word
        # loop, which produced one entry per word instead).
        training_reviews = [self._review_indices(review) for review in training_reviews_raw]

        assert(len(training_reviews)==len(training_labels))

        correct = 0
        total = len(training_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(training_reviews, training_labels)):
            # Forward pass over the present words only
            layer_2 = self._forward(review)

            # Backward pass
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_dervitive(layer_2)

            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error  # hidden layer is linear: gradient passes through

            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate

            # Only the rows of words that were present receive a gradient.
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate

            if np.abs(layer_2_error[0][0]) < 0.5:
                correct += 1

            self._report_progress(num + 1, total, correct, start_time)

    def test(self, testing_reviews, testing_labels):
        """Predict every review, print progress and return the accuracy (0..1)."""
        correct = 0
        total = len(testing_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(testing_reviews, testing_labels)):
            if self.run(review) == label:
                correct += 1
            self._report_progress(num + 1, total, correct, start_time)

        return correct / total if total else 0.0

    def run(self, review):
        """Return ``"POSITIVE"`` or ``"NEGATIVE"`` for a single review."""
        layer_2 = self._forward(self._review_indices(review.lower()))

        # Targets are 1 for POSITIVE and 0 otherwise, so threshold at 0.5.
        if layer_2[0][0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [None]:
# Build a network from all but the last 1000 reviews, with learning rate 0.1.
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], learning_rate=0.1)

In [None]:
# Repeating the lists with `* 2` makes train() go over the training data twice.
classifier.train(reviews[:-1000] * 2, labels[:-1000] * 2) # Multiple iterations training

In [None]:
# Evaluate on the 1000 reviews that were held out of training.
classifier.test(reviews[-1000:], labels[-1000:])

### Further Noise Reduction

In [None]:
# bokeh.models has no `ColumnDataStructure`; the class used by the plotting
# cell further down is `ColumnDataSource`.
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

# Render bokeh plots inline in the notebook.
output_notebook()

In [None]:
# Histogram of the stored per-word ratio values, normalized to a density.
# `normed` is a removed legacy alias of `density` and must not be passed
# alongside it.
ratio_values = [ratio for _, ratio in positive_negative_ratios.most_common()]
hist, edges = np.histogram(ratio_values, density=True, bins=100)

p = figure(tools='pan,wheel_zoom,reset,save',
           toolbar_location='above',
           title='Word Positive/Negative Affinity Distribution')

# `bottom` (previously misspelled `bootom`) anchors the bars at zero.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color='#555555')
show(p)

In [None]:
# frequency[c] = number of distinct words that occur exactly c times.
# (`most_commn` was a typo; counting the tally's values directly gives the
# same result without the intermediate sorted list.)
frequency = Counter(total_words_count.values())

In [None]:
# Histogram over the values stored in `frequency`, normalized to a density.
# `normed` is a removed legacy alias of `density` and must not be passed
# alongside it.
frequency_values = [value for _, value in frequency.most_common()]
hist, edges = np.histogram(frequency_values, density=True, bins=100)

p = figure(tools='pan,wheel_zoom,reset,save',
           toolbar_location='above',
           title='The Frequency Distribution of hte Words in our Corpus')

# `bottom` (previously misspelled `bootom`) anchors the bars at zero.
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color='#555555')
show(p)

In [None]:
class SentimentNetwork:
    """Sparse bag-of-words sentiment classifier with a filtered vocabulary.

    Same model as the sparse network (linear hidden layer, sigmoid output,
    ``"POSITIVE"`` trained towards 1 and any other label towards 0). The
    change in this version: ``pre_process`` drops words that are too rare
    (``min_count``) or whose log positive/negative ratio is too close to
    zero (``polarity_cutoff``).
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.01,
                 min_count=10, polarity_cutoff=0.1):
        """Build the filtered vocabulary and allocate the weights.

        ``min_count`` and ``polarity_cutoff`` are forwarded to
        ``pre_process``; they are appended after the existing parameters so
        positional callers keep working.
        """
        self.pre_process(reviews, labels, min_count, polarity_cutoff)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process(self, reviews, labels, min_count, polarity_cutoff):
        """Build the word/label vocabularies, filtering uninformative words.

        A word is kept when it occurs more than ``min_count`` times and,
        if a ratio was computed for it (total count >= 50), the absolute
        value of its log ratio is at least ``polarity_cutoff``.
        """
        positive_words_count = Counter()
        negative_words_count = Counter()
        total_words_count = Counter()

        for review, label in zip(reviews, labels):
            tokens = review.split(" ")
            if label == 'POSITIVE':
                positive_words_count.update(tokens)
            else:
                negative_words_count.update(tokens)
            total_words_count.update(tokens)

        positive_negative_ratios = Counter()

        for term, count in total_words_count.most_common():
            if count >= 50:
                # +1 belongs in the denominator (avoids division by zero),
                # and both lookups must use the loop variable `term`.
                ratio = positive_words_count[term] / float(negative_words_count[term] + 1)
                positive_negative_ratios[term] = ratio

        for word, ratio in positive_negative_ratios.most_common():
            if ratio > 1:
                positive_negative_ratios[word] = np.log(ratio)
            else:
                positive_negative_ratios[word] = -np.log(1/(ratio+0.001)) # Avoiding Zero Division

        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                if total_words_count[word] > min_count:
                    if word in positive_negative_ratios:
                        if abs(positive_negative_ratios[word]) >= polarity_cutoff:
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)

        self.review_vocab = list(review_vocab)

        # Collect the labels themselves (the old loop added a stale `word`).
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # The lookups live on the instance, not in notebook globals.
        self.words_index = {word: index for index, word in enumerate(self.review_vocab)}
        self.labels_index = {label: index for index, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Allocate the weight matrices and the reusable layers."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes))
        self.learning_rate = learning_rate

        self.input_layer = np.zeros((1, input_nodes))
        # Hidden layer buffer that train()/run() refill for every review.
        self.layer_1 = np.zeros((1, hidden_nodes))

    def update_input(self, review):
        """Mark in ``self.input_layer`` which vocabulary words ``review`` contains.

        Words outside the (filtered) vocabulary are ignored.
        """
        # Clear previous inputs
        self.input_layer *= 0

        for word in review.split(" "):
            index = self.words_index.get(word)
            if index is not None:
                self.input_layer[0][index] = 1  # presence flag, not a count
        return self.input_layer

    # Alias for the name run() originally called.
    update_input_layer = update_input

    def get_target_for_label(self, label):
        """Return the training target: 1 for ``"POSITIVE"``, otherwise 0."""
        return 1 if label == "POSITIVE" else 0

    def sigmoid(self, x):
        """Logistic function."""
        return 1 / (1+np.exp(-x))

    def sigmoid_output_2_dervitive(self, ouput):
        """Derivative of the sigmoid, expressed through the sigmoid's output."""
        return ouput * (1 - ouput)

    def _review_indices(self, review):
        """Return the distinct vocabulary indices of the words in ``review``."""
        indices = set()
        for word in review.split(" "):
            index = self.words_index.get(word)
            if index is not None:
                indices.add(index)
        return list(indices)

    def _forward(self, indices):
        """Fill ``self.layer_1`` from the given word indices; return the output layer."""
        self.layer_1 *= 0
        for index in indices:
            # Adding a weight row equals multiplying by an input of 1.
            self.layer_1 += self.weights_0_1[index]
        return self.sigmoid(self.layer_1.dot(self.weights_1_2))

    def _report_progress(self, done, total, correct, start_time):
        """Overwrite the current console line with progress, speed and accuracy."""
        elapsed = time.time() - start_time
        # The clock may not have advanced yet on the first samples.
        speed = done / elapsed if elapsed > 0 else 0.0
        sys.stdout.write(
            "\rProgress: {:.1f}% Speed(review/sec): {:.1f} Correct: {} Accuracy: {:.1f}%".format(
                100.0 * done / total, speed, correct, 100.0 * correct / done))

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-sample gradient descent over the given data."""
        # One index list per review (the append used to sit inside the word
        # loop, which produced one entry per word instead).
        training_reviews = [self._review_indices(review) for review in training_reviews_raw]

        assert(len(training_reviews)==len(training_labels))

        correct = 0
        total = len(training_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(training_reviews, training_labels)):
            # Forward pass over the present words only
            layer_2 = self._forward(review)

            # Backward pass
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_dervitive(layer_2)

            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error  # hidden layer is linear: gradient passes through

            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate

            # Only the rows of words that were present receive a gradient.
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate

            if np.abs(layer_2_error[0][0]) < 0.5:
                correct += 1

            self._report_progress(num + 1, total, correct, start_time)

    def test(self, testing_reviews, testing_labels):
        """Predict every review, print progress and return the accuracy (0..1)."""
        correct = 0
        total = len(testing_reviews)
        start_time = time.time()

        for num, (review, label) in enumerate(zip(testing_reviews, testing_labels)):
            if self.run(review) == label:
                correct += 1
            self._report_progress(num + 1, total, correct, start_time)

        return correct / total if total else 0.0

    def run(self, review):
        """Return ``"POSITIVE"`` or ``"NEGATIVE"`` for a single review."""
        layer_2 = self._forward(self._review_indices(review.lower()))

        # Targets are 1 for POSITIVE and 0 otherwise, so threshold at 0.5.
        if layer_2[0][0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [None]:
# Build a network from all but the last 1000 reviews, passing min_count=20 and polarity_cutoff=0.05.
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], min_count=20, polarity_cutoff=0.05, learning_rate=0.01)

In [None]:
# Repeating the lists with `* 2` makes train() go over the training data twice.
classifier.train(reviews[:-1000] * 2, labels[:-1000] * 2) # Multiple iterations training

In [None]:
# Evaluate on the 1000 reviews that were held out of training.
classifier.test(reviews[-1000:], labels[-1000:])

In [None]:
# Same setup with a much larger polarity_cutoff (0.8).
classifier = SentimentNetwork(reviews[:-1000], labels[:-1000], min_count=20, polarity_cutoff=0.8, learning_rate=0.01)

In [None]:
# Repeating the lists with `* 2` makes train() go over the training data twice.
classifier.train(reviews[:-1000] * 2, labels[:-1000] * 2) # Multiple iterations training

In [None]:
# Evaluate on the 1000 reviews that were held out of training.
classifier.test(reviews[-1000:], labels[-1000:])

In [None]:
def get_similar_words(focus='excellent', network=None):
    """Rank vocabulary words by how similar their weights are to ``focus``.

    Similarity is the dot product between the ``weights_0_1`` row of
    ``focus`` and the row of each vocabulary word.

    Args:
        focus: The word to compare every other word against.
        network: Object exposing ``words_index`` (word -> row index) and
            ``weights_0_1``. Defaults to the notebook-level ``mlp_full``.
            NOTE(review): ``mlp_full`` is not defined in the visible part of
            this notebook -- presumably a trained SentimentNetwork; confirm.

    Returns:
        A list of ``(word, similarity)`` pairs, most similar first.

    Raises:
        KeyError: If ``focus`` is not in the network's vocabulary.
    """
    if network is None:
        network = mlp_full

    focus_vector = network.weights_0_1[network.words_index[focus]]

    most_similar = Counter()
    for word, index in network.words_index.items():
        most_similar[word] = np.dot(network.weights_0_1[index], focus_vector)

    # Return only after every word has been scored (the return used to sit
    # inside the loop and ended it after the first word).
    return most_similar.most_common()

In [None]:
# Rank words against 'excellent'.
get_similar_words('excellent')

In [None]:
# Rank words against 'horrible'.
get_similar_words('horrible')

In [None]:
# `matplotlib.colors` has no member called `colors`; import the module itself.
import matplotlib.colors as colors

# How many words to take from each end of the polarity ranking.
TOP_WORDS_PER_SIDE = 500

# NOTE(review): `mlp_full` is not defined in the visible part of this
# notebook -- presumably a trained SentimentNetwork, whose vocabulary lookup
# is named `words_index`; confirm.
ranked_words = [word for word, ratio in positive_negative_ratios.most_common()
                if word in mlp_full.words_index]

# Most positive words first, then the most negative ones. Walking the full
# ranking in both directions (as before) appended every word twice, so each
# side is capped and duplicates are skipped.
words_to_visualize = list(ranked_words[:TOP_WORDS_PER_SIDE])
already_selected = set(words_to_visualize)

for word in list(reversed(ranked_words))[:TOP_WORDS_PER_SIDE]:
    if word not in already_selected:
        words_to_visualize.append(word)
        already_selected.add(word)

In [None]:
# Collect one weight vector and one color per selected word:
# '#00ff00' when the stored ratio value is above zero, '#000000' otherwise.
pos = 0
neg = 0

colors_list = list()
vectors_list = list()

for word in words_to_visualize:
    if word in positive_negative_ratios:
        # `mlp.full` was a typo for `mlp_full`.
        # NOTE(review): assumes mlp_full is a SentimentNetwork, whose
        # vocabulary lookup is named `words_index` -- confirm.
        vectors_list.append(mlp_full.weights_0_1[mlp_full.words_index[word]])
        if positive_negative_ratios[word] > 0:
            pos += 1
            colors_list.append('#00ff00')
        else:
            neg += 1
            colors_list.append('#000000')
            

In [None]:
from sklearn.manifold import TSNE

# Project the word vectors to 2 dimensions. The model was bound to `tesne`
# but used as `tsne`, which raised a NameError.
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(np.array(vectors_list))

In [None]:
# Scatter plot of the 2-D projection, one labelled point per word.
p = figure(tools='pan,wheel_zoom,reset,save',
           toolbar_location='above',
           title='Vector TSNE for most Polarized Words')

# The colors travel with the data source so every glyph property is
# resolved from the same table.
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    colors=colors_list))

# Glyph coordinates are referenced by column NAME; the bare identifiers
# x1/x2 were undefined variables.
p.scatter(x='x1', y='x2', size=8, source=source, color='colors')

word_labels = LabelSet(x='x1', y='x2', text='names', y_offset=6,
          text_font_size='8pt', text_color='#555555',
         source=source, text_align='center')

p.add_layout(word_labels)
show(p)