In [1]:
import pandas as pd

## Sampling the data

In [2]:
tweets=pd.read_csv('data/train.csv')
tweets.sample(5)

Unnamed: 0,id,keyword,location,text,target
1844,2651,crashed,Buenos Aires,MH370: Intact part lifts odds plane glided not...,1
706,1020,blazing,"Saint Marys, GA",The Blazing Elwoods @BlazingElwoods - Don't Bo...,0
2311,3319,demolished,,Why is CHURCH media and #Media420 silent when ...,1
899,1301,bloody,"Level 3 Garrison, Sector G",Bloody hell what a day. I haven't even really ...,0
3200,4593,emergency%20plan,"204, 555 11 Ave. S.W.",The Municipal Emergency Plan is now in effect....,1


In [73]:
sub = pd.read_csv('data/sample_submission.csv')
sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [75]:
test = pd.read_csv('data/test.csv')
test.sample(5)

Unnamed: 0,id,keyword,location,text
420,1359,blown%20up,Los Angeles,Lock up your last remaining shred of manhood a...
2097,7035,mayhem,"Lynchburg, VA",Oh Shelly your waffle wasn't 'a little blonde....
3196,10619,wounded,Singapore,Suspected Salvadoran gang members killed four ...
1952,6587,inundated,"Bellingham, WA",ÛÏWe live in an area thatÛªs inundated with ...
2152,7216,natural%20disaster,bishop amat,natural disaster on you half ass rappers


In [3]:
tweets.drop(labels=['id', 'keyword', 'location'], axis=1, inplace=True)
tweets.sample(5)

Unnamed: 0,text,target
2353,@MentalHealthGov like AHHhhh fix the ALEC made...,0
3845,@kelworldpeace @TAXSTONE yoga flames!,0
952,New Ladies Shoulder Tote Handbag Women Cross B...,0
2911,I keep it out down drown their insults out wit...,0
6436,13 security personnel killed in Iraq suicide b...,1


## Part 1: Building The Functions to process data

In [4]:
from collections import Counter
import numpy as np

In [5]:
# Create Counter objects to store counts for words in disaster/neutral tweets 
disaster_counts = Counter()
neutral_counts = Counter()
total_counts = Counter()

In [6]:
# Count the words in the reviews
for row in tweets.itertuples():
    if row.target == 1:
        for word in row.text.split(' '):
            disaster_counts[word] +=1
            total_counts[word] +=1
    else:
        for word in row.text.split(' '):
            neutral_counts[word]+=1
            total_counts[word] +=1

In [7]:
#disaster_counts.most_common()

In [8]:
#neutral_counts.most_common()

In [9]:
#total_counts.most_common()

Introduction of disaster/neutral ration to diferentiate between words that are only in disaster tweets

In [10]:
dis_neu_ratios = Counter()

for word, cnt in list(total_counts.most_common()):
    if (cnt > 20):
        dis_neu_ratio = disaster_counts[word] / float(neutral_counts[word] + 1)
        dis_neu_ratios[word] = dis_neu_ratio
        
## Calculate the ratios of disater and neutral uses of the most common words (used at least 100 times)
for word, ratio in dis_neu_ratios.most_common():
    if ratio >1:
        dis_neu_ratios[word] = np.log(ratio)
    else:
        dis_neu_ratios[word] = -np.log(1 / (ratio + 0.01))

In [11]:
# Examine the ratios
print(dis_neu_ratios['the'])
print(dis_neu_ratios['fire'])

-0.3565530115922747
0.9592567677575984


In [12]:
#dis_neu_ratios.most_common()

## Transforming text into numbers

In [13]:
# TODO: Create set named "vocab" containing all of the words from all of the tweets
vocab = set(total_counts.keys())
vocab_size=len(vocab)
print(f'Lengths of vocabulary: {vocab_size}')

Lengths of vocabulary: 32017


In [14]:
# Create a dictionary of words in the vocabulary mapped to index positions
word2index={}
for i, word in enumerate(vocab):
    word2index[word] = i
    
# display the map of words indices
#word2index

## Neutral Network

Encapsulated neural network in the calss

In [15]:
import time
import sys
import numpy as np

In [16]:
class SentimentNetwork:
    def __init__(self, tweets, min_count=10, polarity_cutoff=0.1, hidden_nodes=10, learning_rate=0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of tweets used for training
            labels(list) - List of DISASTER/NEUTRAL labels associated with the given tweets
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)
        
        self.pre_process_data(tweets, polarity_cutoff, min_count)
        
        self.init_network(len(self.tweet_vocab), hidden_nodes, 1, learning_rate)
        
    def pre_process_data(self, tweets, polarity_cutoff, min_count):
        """ Calculate disaster-to-neutral ratios before buidling vocabulary
        
        """
        disaster_counts = Counter()
        neutral_counts = Counter()
        total_counts = Counter()
        
        for row in tweets.itertuples():
            if row.target == 1:
                for word in row.text.split(' '):
                    disaster_counts[word] +=1
                    total_counts[word] +=1
            else:
                for word in row.text.split(' '):
                    neutral_counts[word]+=1
                    total_counts[word] +=1
        
        ## Calculate ratios
        dis_neu_ratios = Counter()

        for word, cnt in list(total_counts.most_common()):
            if (cnt > 20):
                dis_neu_ratio = disaster_counts[word] / float(neutral_counts[word] + 1)
                dis_neu_ratios[word] = dis_neu_ratio
        
        ## Calculate the ratios of disater and neutral uses of the most common words (used at least 100 times)
        for word, ratio in dis_neu_ratios.most_common():
            if ratio >1:
                dis_neu_ratios[word] = np.log(ratio)
            else:
                dis_neu_ratios[word] = -np.log(1 / (ratio + 0.01))  
            
        # populate tweets_vocab with all of the words in given tweet
        tweet_vocab = set()
        for tweet in tweets.itertuples():
            for word in tweet.text.split(' '):
                # only include words that occure more than min_count &
                # words that have the ratio that meets polarity_cutoff
                if (total_counts[word] > min_count):
                    if (word in dis_neu_ratios.keys()):
                        if ((dis_neu_ratios[word] >= polarity_cutoff) or (dis_neu_ratios[word] <= -polarity_cutoff)):
                            tweet_vocab.add(word)
                    else:
                        tweet_vocab.add(word)
                        
        # Convert the vocabulary set to a list so we can access words via indices
        self.tweet_vocab = list(tweet_vocab)
        
        # build labels into the list so we can access them via indices
        #label_vocab = list()
        #for label in tweets.itertuples():
        #    label_vocab.add(label.target)
        #self.label_vocab = label_vocab
        
        # Store the sizes of the review and label vocabularies.
        self.tweet_vocab_size = len(self.tweet_vocab)
        #self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        # TODO: populate self.word2index with indices for all the words in self.tweets_vocab
        #       like you saw earlier in the notebook
        for i,word in enumerate(self.tweet_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        #self.label2index = {}
        # TODO: do the same thing you did for self.word2index and self.review_vocab, 
        #       but for self.label2index and self.label_vocab instead
        #for i,label in enumerate(self.label_vocab):
        #    self.label2index[label] = i 
            
            
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Store the number of nodes in input, hidden, and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights
        
        # TODO: initialize self.weights_0_1 as a matrix of zeros. These are the weights between
        #       the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        
        # TODO: initialize self.weights_1_2 as a matrix of random values. 
        #       These are the weights between the hidden layer and the output layer.
        #self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,(self.hidden_nodes, self.output_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.hidden_nodes**-0.5,(self.hidden_nodes, self.output_nodes))
        
        # New layer_1 that replaces layer_0
        self.layer_1 = np.zeros((1, hidden_nodes))
        
    def sigmoid(self,x):
        # TODO: Return the result of calculating the sigmoid activation function
        #       shown in the lectures
        return 1/(1+np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        # TODO: Return the derivative of the sigmoid activation function, 
        #       where "output" is the original output from the sigmoid function 
        return output * (1 - output)
    
    
    def train(self, training_tweets_raw):
        
        
        # Create vector that contains only positive indices
        training_tweets = list()
        training_labels = list()
        for tweet in training_tweets_raw.itertuples():
            indices = set()
            for word in tweet.text.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_tweets.append(list(indices)) 
            training_labels.append(tweet.target)
         
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0
        
        # Remember when we started for printing time statistics
        start = time.time()
        
        #
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        #for i in range(len(training_tweets)):
        for i,item in enumerate(training_tweets):
            tweet = item
            label = training_labels[i]
            
            # Hidden layer
            ## New for Project 5: Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in tweet:
                self.layer_1 += self.weights_0_1[index]
                       
            layer_2 = self.sigmoid(np.dot(self.layer_1, self.weights_1_2))
            
            # TODO: Implement the back propagation pass here. 
            #       That means calculate the error for the forward pass's prediction
            #       and update the weights in the network according to their
            #       contributions toward the error, as calculated via the
            #       gradient descent and back propagation algorithms you 
            #       learned in class.
            layer_2_error = layer_2 - int(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)
            
            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error
            
            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            for index in tweet:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

                                
            # TODO: Keep track of correct predictions. To determine if the prediction was
            #       correct, check that the absolute value of the output error 
            #       is less than 0.5. If so, add one to the correct_so_far count.
            if(layer_2 >= 0.5 and label == 1):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 0):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            if (i % 1000 == 0):
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_tweets)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
                print("")
                
                
    def test(self, testing_tweets):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        i = 0
        for row in testing_tweets.itertuples():
            pred = self.run(row.text)
            if(pred == row.target):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            if (i % 1 == 0):
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_tweets)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
            i+=1
    
    def run(self, tweets):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        
        self.layer_1 *= 0
        unique_indices = set()
        for word in tweets.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        ## New for Project 5: changed to use self.layer_1 instead of local layer_1
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
                                
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return 1 #DISASTER
        else:
            return 0 #NEUTRAL

    

## Train the network - split training data which has labels into training and test data

In [55]:
mlp = SentimentNetwork(tweets[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(tweets[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:15.1% Speed(reviews/sec):52614 #Correct:736 #Trained:1001 Training Accuracy:73.5%
Progress:30.2% Speed(reviews/sec):51425 #Correct:1370 #Trained:2001 Training Accuracy:68.4%
Progress:45.3% Speed(reviews/sec):50746 #Correct:2002 #Trained:3001 Training Accuracy:66.7%
Progress:60.4% Speed(reviews/sec):51184 #Correct:2653 #Trained:4001 Training Accuracy:66.3%
Progress:75.6% Speed(reviews/sec):51310 #Correct:3284 #Trained:5001 Training Accuracy:65.6%
Progress:90.7% Speed(reviews/sec):50944 #Correct:3999 #Trained:6001 Training Accuracy:66.6%


In [56]:
mlp.test(tweets[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Tested:1 Testing Accuracy:0.0%Progress:0.1% Speed(reviews/sec):658.7 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1257. #Correct:1 #Tested:3 Testing Accuracy:33.3%Progress:0.3% Speed(reviews/sec):1814. #Correct:1 #Tested:4 Testing Accuracy:25.0%Progress:0.4% Speed(reviews/sec):2354. #Correct:1 #Tested:5 Testing Accuracy:20.0%Progress:0.5% Speed(reviews/sec):2880. #Correct:2 #Tested:6 Testing Accuracy:33.3%Progress:0.6% Speed(reviews/sec):3329. #Correct:2 #Tested:7 Testing Accuracy:28.5%Progress:0.7% Speed(reviews/sec):3808. #Correct:3 #Tested:8 Testing Accuracy:37.5%Progress:0.8% Speed(reviews/sec):4131. #Correct:3 #Tested:9 Testing Accuracy:33.3%Progress:0.9% Speed(reviews/sec):4506. #Correct:3 #Tested:10 Testing Accuracy:30.0%Progress:1.0% Speed(reviews/sec):4920. #Correct:3 #Tested:11 Testing Accuracy:27.2%Progress:1.1% Speed(reviews/sec):5290. #Correct:3 #Tested:12 Testing Accuracy:25.0%Pro

In [68]:
mlp = SentimentNetwork(tweets[:-1000],min_count=20,polarity_cutoff=0.5,learning_rate=0.3)
mlp.train(tweets[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:15.1% Speed(reviews/sec):66966 #Correct:717 #Trained:1001 Training Accuracy:71.6%
Progress:30.2% Speed(reviews/sec):68201 #Correct:1393 #Trained:2001 Training Accuracy:69.6%
Progress:45.3% Speed(reviews/sec):69029 #Correct:2113 #Trained:3001 Training Accuracy:70.4%
Progress:60.4% Speed(reviews/sec):70033 #Correct:2805 #Trained:4001 Training Accuracy:70.1%
Progress:75.6% Speed(reviews/sec):70379 #Correct:3498 #Trained:5001 Training Accuracy:69.9%
Progress:90.7% Speed(reviews/sec):69649 #Correct:4231 #Trained:6001 Training Accuracy:70.5%


In [69]:
mlp.test(tweets[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Tested:1 Testing Accuracy:0.0%Progress:0.1% Speed(reviews/sec):514.3 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):978.9 #Correct:1 #Tested:3 Testing Accuracy:33.3%Progress:0.3% Speed(reviews/sec):1405. #Correct:1 #Tested:4 Testing Accuracy:25.0%Progress:0.4% Speed(reviews/sec):1806. #Correct:1 #Tested:5 Testing Accuracy:20.0%Progress:0.5% Speed(reviews/sec):2205. #Correct:2 #Tested:6 Testing Accuracy:33.3%Progress:0.6% Speed(reviews/sec):2560. #Correct:3 #Tested:7 Testing Accuracy:42.8%Progress:0.7% Speed(reviews/sec):2930. #Correct:4 #Tested:8 Testing Accuracy:50.0%Progress:0.8% Speed(reviews/sec):3221. #Correct:4 #Tested:9 Testing Accuracy:44.4%Progress:0.9% Speed(reviews/sec):3541. #Correct:5 #Tested:10 Testing Accuracy:50.0%Progress:1.0% Speed(reviews/sec):3867. #Correct:5 #Tested:11 Testing Accuracy:45.4%Progress:1.1% Speed(reviews/sec):4177. #Correct:6 #Tested:12 Testing Accuracy:50.0%Pro

## Preparing the submission file

In [85]:
test_tweets = pd.read_csv('data/test.csv')
test_tweets.drop(labels=['keyword', 'location'], axis=1, inplace=True)
test_tweets.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [94]:
# pass test tweets through trained layers and save the labels with ids to file


with open('data/submission.csv', 'w') as f:
    f.write('id,target\n')
    
    for row in test_tweets.itertuples():
        label = mlp.run(row.text)
        f.write(f'{row.id},{label}\n')
        #print(f'{row.id},{label}')

## Sandbox: annalyzing the data

In [46]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [47]:
hist, edges = np.histogram(list(map(lambda x:x[1],dis_neu_ratios.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Disaster/Neutral Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

  hist, edges = np.histogram(list(map(lambda x:x[1],dis_neu_ratios.most_common())), density=True, bins=100, normed=True)


In [33]:
frequency_frequency = Counter()

for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1

In [34]:
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

  hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)


## What's going on with the weights

In [36]:
mlp_full = SentimentNetwork(tweets[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)

In [37]:
mlp_full.train(tweets[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:15.1% Speed(reviews/sec):26521 #Correct:750 #Trained:1001 Training Accuracy:74.9%
Progress:30.2% Speed(reviews/sec):26686 #Correct:1413 #Trained:2001 Training Accuracy:70.6%
Progress:45.3% Speed(reviews/sec):27317 #Correct:2085 #Trained:3001 Training Accuracy:69.4%
Progress:60.4% Speed(reviews/sec):27782 #Correct:2771 #Trained:4001 Training Accuracy:69.2%
Progress:75.6% Speed(reviews/sec):27987 #Correct:3436 #Trained:5001 Training Accuracy:68.7%
Progress:90.7% Speed(reviews/sec):28086 #Correct:4180 #Trained:6001 Training Accuracy:69.6%


In [38]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [39]:
get_most_similar_words("fire")

[('in', 0.03268983461416591),
 ('of', 0.012387451435891756),
 ('after', 0.010723274432701567),
 ('fire', 0.009563245255325741),
 ('at', 0.00943844981786447),
 ('California', 0.009304743610198509),
 ('killed', 0.00914330534980527),
 ('over', 0.008976292117716033),
 ('from', 0.008864430357716079),
 ('bombing', 0.008211422282295957),
 ('suicide', 0.008104983356396117),
 ('Hiroshima', 0.007798100837870504),
 ('More', 0.007667191061387685),
 ('fatal', 0.006846256559909082),
 ('...', 0.006766200632788249),
 ('were', 0.006431896455731712),
 ('than', 0.0060996718679604735),
 ('Northern', 0.006061041595487962),
 ('crash', 0.005868197248212044),
 ('bomber', 0.005665644251436754),
 ('by', 0.005470280970583068),
 ('70', 0.0054451128605550025),
 ('near', 0.005258391847809156),
 ('train', 0.005163834035614486),
 ('debris', 0.005053629161068459),
 ('Japan', 0.004951391833494319),
 ('mass', 0.004772138464394244),
 ('In', 0.004735647106360973),
 ('-', 0.004664524566959294),
 ('disaster', 0.004563324971

In [40]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in dis_neu_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(dis_neu_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [41]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in dis_neu_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(dis_neu_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [42]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(np.array(vectors_list))

In [45]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
#p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words