## Word Embedding Comparisons [dependency2vec vs word2vec]

This is a staging notebook for testing the models produced by the two word embedding approaches under consideration: word2vec and dependency2vec.

In [9]:
import pandas as pd
import numpy as np
import spacy
from gensim import models, similarities
from tqdm import tqdm
from sklearn.manifold import TSNE
import plotly as py
import cufflinks as cf
import plotly.graph_objs as go
from modules.utils.CustomTwokenizer import CustomTwokenizer

### Initialize requirements

In [10]:
# Name of the spaCy model package loaded below.
spacy_en_model = "en_core_web_md"
# NOTE(review): not referenced by any cell visible in this part of the notebook.
spacy_glove_model = "en_vectors_glove_md"
# Presumably pickled CrowdFlower feature frames (raw and processed) — neither path
# is read in the visible cells; confirm they are still needed here.
crowdflower_persistence_raw = 'data/persistence/df/crowdflower_features_raw.pkl.compressed'
crowdflower_persistence = 'data/persistence/df/crowdflower_features.pkl.compressed'
# Load the spaCy pipeline, handing the project's CustomTwokenizer to the
# `create_make_doc` hook (presumably to tokenize tweets — confirm).
nlp = spacy.load(spacy_en_model, create_make_doc=CustomTwokenizer)
# Switch plotly to offline notebook mode; needed before `py.offline.iplot` is called.
py.offline.init_notebook_mode(connected=True)
# cf.go_offline()

In [3]:
def most_similar(word, n):
    """Return the ``n`` entries of ``word.vocab`` most similar to ``word``.

    A vocabulary entry ``w`` is kept as a candidate only if it:

    * is not out-of-vocabulary, punctuation, number-like, a stop word or
      the token ``"rt"``;
    * has a vector;
    * differs from ``word`` in its lower-cased form;
    * has the same ``is_lower`` flag as ``word``;
    * has ``prob >= -15``.

    Parameters
    ----------
    word
        Lexeme-like object exposing ``vocab`` (an iterable of candidates),
        ``lower_``, ``is_lower`` and ``similarity(other)``.
    n : int
        Number of neighbours to keep; applied as a slice (``[:n]``).

    Returns
    -------
    tuple of (list, list)
        The selected candidates, most similar first, and their similarity
        scores in the same order.
    """
    def is_noise(w):
        # The filter must inspect the *candidate*; testing the query word here
        # would either keep every noisy candidate or drop all of them.
        return w.is_oov or w.is_punct or w.like_num or w.is_stop or w.lower_ == "rt"

    candidates = [
        w for w in word.vocab
        if not is_noise(w)
        and w.has_vector
        and w.lower_ != word.lower_
        and w.is_lower == word.is_lower
        and w.prob >= -15
    ]

    # Score each candidate once and reuse the value for both sorting and output.
    scored = [(w, word.similarity(w)) for w in candidates]
    # list.sort is stable, so ties keep their vocabulary order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top = scored[:n]
    return [w for w, _ in top], [score for _, score in top]

### Load and test dependency2vec model

In [14]:
# Word-vector files for the two dependency2vec runs; these are loaded in the next cell.
hatespeech_dep_word = "data/persistence/df/dim200vecs_hatespeech_dep"
hs_candidates_exp6_word = "data/persistence/df/dim200vecs_hs_candidates_exp6"
# Matching context-vector files — NOTE(review): not loaded by any cell visible here.
hatespeech_dep_context = "data/persistence/df/dim200context-vecs_hatespeech_dep"
hs_candidates_exp6_context = "data/persistence/df/dim200context-vecs_hs_candidates_exp6"

In [15]:
# Read both word-vector files as text-format (binary=False) word2vec files
# into gensim KeyedVectors so they can be queried for similarity below.
hatespeech_model = models.KeyedVectors.load_word2vec_format(hatespeech_dep_word, binary=False)
hs_candidates_exp6_model = models.KeyedVectors.load_word2vec_format(hs_candidates_exp6_word, binary=False)

In [37]:
# Top-20 neighbours of the probe word in the annotated hate speech model.
# NOTE(review): every recorded score is above 0.995, including for unrelated words,
# so the vectors look nearly collinear — confirm the training run before relying on these.
hatespeech_model.similar_by_word("savages", topn=20, restrict_vocab=None)

[('wetbacks', 0.9985026717185974),
 ('something', 0.9980785846710205),
 ('nothing', 0.9980483055114746),
 ('chinks', 0.9975727796554565),
 ('dumb', 0.9975383281707764),
 ('stupid', 0.9971219301223755),
 ('nice', 0.9970505833625793),
 ('mouth', 0.9969522953033447),
 ('birthday', 0.9969327449798584),
 ("they're", 0.9969197511672974),
 ('ok', 0.9969000816345215),
 ('wallet', 0.9968050122261047),
 ("what's", 0.996655285358429),
 ('whites', 0.9965375661849976),
 ('gonna', 0.9962338209152222),
 ('ppl', 0.9962177276611328),
 ('cuz', 0.9960685968399048),
 ('wrong', 0.9960306882858276),
 ('also', 0.9955644607543945),
 ('beautiful', 0.9955433011054993)]

In [51]:
# hs_candidates_exp6_model.most_similar(positive=['nigger'], negative=['smart'], topn=15)
# Pairwise similarity between two probe words in the exp6 candidate model.
hs_candidates_exp6_model.similarity('savages', 'muslim')
# hs_candidates_exp6_model.similar_by_word("savage", topn=20, restrict_vocab=None)

0.68546251287656113

In [27]:
# Vocabulary of the exp6 candidate model.
vocab = list(hs_candidates_exp6_model.vocab.keys())
# Only the last expression of a cell is echoed, so a bare `len(vocab)` in the
# middle of the cell is silently discarded; print it to actually show the size.
print(len(vocab))
# Sanity check on a pair of related words (echoed as the cell's output).
hs_candidates_exp6_model.similarity('woman', 'man')

0.74220567500300816

### Load and test word2vec model

In [5]:
# Look up the probe lexeme in the spaCy vocabulary and print its 20 nearest
# neighbours as (lower-cased text, similarity score) pairs.
word = nlp.vocab[u'bitches']
gloVe_result = most_similar(word, 20)
neighbours, scores = gloVe_result
for lexeme, score in zip(neighbours, scores):
    print((lexeme.lower_, score))

### Visualize model
https://github.com/jeffThompson/Word2VecAndTsne

https://www.quora.com/How-do-I-visualise-word2vec-word-vectors

In [88]:
# Number of words projected and plotted; kept small so the labelled scatter stays readable.
TSNE_SAMPLE_SIZE = 50
# Fixed seed: t-SNE is stochastic, so without it the layout changes on every run.
TSNE_SEED = 42

# Embedding matrix for the whole vocabulary; rows follow the iteration order of
# `hatespeech_model.vocab`, the same order used for the `vocab` labels below.
X = hatespeech_model[hatespeech_model.vocab]
vocab = list(hatespeech_model.vocab.keys())
vocab = vocab[:TSNE_SAMPLE_SIZE]
# Project the first TSNE_SAMPLE_SIZE vectors down to 2-D for plotting.
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
X_tsne = tsne.fit_transform(X[:TSNE_SAMPLE_SIZE, :])

In [89]:
# Sanity check: number of rows produced by the t-SNE projection.
len(X_tsne)

50

In [90]:
# Reference: http://nlp.yvespeirsman.be/blog/visualizing-word-embeddings-with-tsne/ evaluation
# Marker appearance for the scatter points.
marker_style = {
    "size": 14,
    "line": {"width": 0.5},
    "opacity": 0.3,
    "color": 'rgba(217, 217, 217, 0.14)',
}

# One labelled point per projected word: t-SNE column 0 on x, column 1 on y.
trace = go.Scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    mode='markers+text',
    text=vocab,
    marker=marker_style,
)
data = [trace]

# Render the figure inline through plotly's offline mode.
figure_spec = {
    "data": data,
    "layout": go.Layout(title="Annotated hate speech dep2vec"),
}
py.offline.iplot(figure_spec)