In [None]:
# Import relevant modules
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.models.phrases import Phrases, Phraser
import scipy
import mibian
import nltk
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
from nrclex import NRCLex
from pyemd import emd

## Word2Vec Word Embeddings

To re-run the analysis, load your data into a variable named `gafa_data`.
The input should be a `.txt` file in which each line contains the pre-processed text of one Reddit comment.

In [None]:
# Load data
# 'working_directory' is a placeholder: replace it with the path to the input .txt file.
# header=None tells pandas the file has no header row, so columns are labelled 0, 1, ...;
# the cells below read the comment text from column 0.
# NOTE(review): read_csv splits on commas by default, so a line containing a comma would be
# spread over several columns — confirm the pre-processed text is comma-free.
gafa_data = pd.read_csv('working_directory',header=None)

In [None]:
# Drop every row whose comment text (column 0) is missing
gafa_data = gafa_data.dropna(subset=[0])

In [None]:
# Create sentences for W2V model; keep bigrams
# as seen in https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
# Tokenise each comment (column 0) on whitespace: one list of tokens per comment.
sent = [row.split() for row in gafa_data[0]]
# Fit gensim's phrase detector on the tokenised comments; min_count=10 is the frequency
# threshold passed to it and progress_per=10000 controls how often progress is logged.
phrases = Phrases(sent, min_count=10, progress_per=10000)
# Wrap the fitted detector in a Phraser.
# NOTE(review): presumably a lighter, apply-only version of `phrases` — confirm against gensim docs.
bigram = Phraser(phrases)
# Apply the bigram model to the whole corpus. Later cells query the token
# 'cambridge_analytica', so detected pairs are expected to be joined with "_".
sentences = bigram[sent]

In [None]:
# Train model
# Hyper-parameters are gathered in one mapping so they can be read (and tuned) at a glance.
w2v_params = dict(
    min_count=10,  # minimum token frequency passed to Word2Vec
    window=10,     # context window size
    size=300,      # embedding dimensionality; NOTE(review): gensim >= 4.0 calls this `vector_size` — confirm installed version
    sg=1,          # 1 selects the skip-gram architecture
    workers=3,     # number of training threads
)
w2v_model = Word2Vec(sentences, **w2v_params)

In [None]:
# Obtain word vectors
# `wv` holds the trained embeddings; the similarity queries in the cells below go through it.
word_vectors = w2v_model.wv

In [None]:
# Cosine similarity between every pair of the four company names,
# printed in the order the pairs are listed.
company_pairs = [
    ('google', 'amazon'),
    ('google', 'facebook'),
    ('google', 'apple'),
    ('amazon', 'facebook'),
    ('amazon', 'apple'),
    ('facebook', 'apple'),
]
for first_word, second_word in company_pairs:
    print(word_vectors.similarity(first_word, second_word))

In [None]:
# Which of the four company names is the odd one out, i.e. the word
# furthest away from the mean of the listed word vectors?
gafa_companies = ['google', 'amazon', 'facebook', 'apple']
word_vectors.doesnt_match(gafa_companies)

In [None]:
# The most semantically similar words
# most_similar() is called with its default `topn`; each entry pairs a neighbouring
# word with a score. The bare name on the last line displays the result.
google_words = word_vectors.most_similar('google')
google_words

In [None]:
# The most semantically similar words to 'amazon'
amazon_words = word_vectors.most_similar('amazon')
amazon_words

In [None]:
# The most semantically similar words to 'facebook'
facebook_words = word_vectors.most_similar('facebook')
facebook_words

In [None]:
# The most semantically similar words to 'apple'
apple_words = word_vectors.most_similar('apple')
apple_words

In [None]:
# The most semantically similar words to the bigram token 'cambridge_analytica'
ca_words = word_vectors.most_similar('cambridge_analytica')
ca_words

In [None]:
# Cosine similarity of each company name to 'cambridge_analytica',
# printed in the order google, amazon, facebook, apple.
for company in ['google', 'amazon', 'facebook', 'apple']:
    print(word_vectors.similarity(company, 'cambridge_analytica'))

In [None]:
# Cosine distances (i.e. 1 - similarity()) from 'cambridge_analytica'
# to each of the four company names, computed in a single call.
company_names = ['google','amazon','facebook','apple']
ca_distances = word_vectors.distances('cambridge_analytica', company_names)
print(ca_distances)

### t-SNE Plot

For each word in the `keys` list, plot its 15 most similar words in a 2-dimensional space, following the approach described in https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d.


In [None]:
keys = ['google','amazon','facebook','apple','cambridge_analytica']

# For every key word, collect its 15 nearest neighbours and their vectors.
# Both lists are aligned with `keys`: entry i belongs to keys[i].
embedding_clusters, word_clusters = [], []
for word in keys:
    neighbours = w2v_model.wv.most_similar(word, topn=15)
    # most_similar yields (word, score) pairs; only the word is needed here
    words = [similar_word for similar_word, _score in neighbours]
    embeddings = [w2v_model.wv[similar_word] for similar_word in words]
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
# Stack the per-key clusters into one 3-D array and read off its dimensions.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

tsne_model_en_2d = TSNE(
    perplexity=15,
    n_components=2,
    init='pca',
    n_iter=3500,
    random_state=32,
)

# Flatten to (n * m, k) for t-SNE, then fold the 2-D result back into
# one (m, 2) block per key.
flat_embeddings = embedding_clusters.reshape(n * m, k)
flat_embeddings_2d = np.array(tsne_model_en_2d.fit_transform(flat_embeddings))
embeddings_en_2d = flat_embeddings_2d.reshape(n, m, 2)


In [None]:
# Start from matplotlib's 'classic' style, then override a few of its settings.
plt.style.use('classic')
plt.rcParams.update({
    'grid.color': 'white',
    'legend.fancybox': True,
    'axes.axisbelow': False,
    'figure.facecolor': 'white',
})

In [None]:
def tsne_plot_similar_words(title, labels, embedding_clusters,
                            word_clusters, a, filename=None,
                            xlim=(-4, 16), ylim=(-20.5, -2.5),
                            xticks=(-2.5, 0, 2.5, 5, 7.5, 10, 12.5, 15),
                            yticks=(-20, -17.5, -15, -12.5, -10, -7.5, -5, -2.5)):
    """Scatter-plot 2-D word embeddings, one colour per cluster, with word labels.

    Parameters
    ----------
    title : str
        Figure title.
    labels : sequence of str
        One legend label per cluster; its length also sets the number of colours.
    embedding_clusters : sequence of 2-D arrays
        One array per cluster; column 0 is used as x and column 1 as y.
    word_clusters : sequence of sequences of str
        Words annotating the points, aligned with ``embedding_clusters``.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        If given (and non-empty), the figure is also saved there as a PNG.
    xlim, ylim : (low, high) or None, optional
        Axis limits. The defaults reproduce the window this notebook was
        originally tuned for; pass ``None`` to let matplotlib autoscale, which
        is what you want for any other data or t-SNE run.
    xticks, yticks : sequence of float or None, optional
        Tick positions. ``None`` keeps matplotlib's automatic ticks.

    Returns
    -------
    None
        The figure is shown (and optionally saved) as a side effect.
    """
    fig, ax = plt.subplots(figsize=(16, 9))

    # Fixed limits only make sense for the embedding they were chosen for;
    # None leaves the axis on autoscale so no point can fall outside the view.
    if xlim is not None:
        ax.set_xlim(*xlim)
    if ylim is not None:
        ax.set_ylim(*ylim)

    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # reshape to (1, 4) so the RGBA row is read as one colour, not four scalar values
        ax.scatter(x, y, c=color.reshape(1, -1), alpha=a, label=label, edgecolors='face')
        for i, word in enumerate(words):
            ax.annotate(word, alpha=0.8, xy=(x[i], y[i]), xytext=(5, 2),
                        textcoords='offset points', ha='right', va='bottom', size=12)
    ax.legend(loc=4, fontsize=12)
    ax.set_title(title)
    ax.grid(True)
    if xticks is not None:
        ax.set_xticks(list(xticks))
    if yticks is not None:
        ax.set_yticks(list(yticks))

    if filename:
        fig.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


In [None]:
# Draw the clusters with fully opaque markers and save the figure as a PNG.
tsne_plot_similar_words(
    title="",
    labels=keys,
    embedding_clusters=embeddings_en_2d,
    word_clusters=word_clusters,
    a=1,
    filename='classic_plot.png',
)