## Word Embedding Comparisons [dependency2vec vs word2vec]

This is a staging notebook for testing the models produced by the two word embedding approaches being considered: word2vec and dependency2vec.

In [1]:
import pandas as pd
import numpy as np
import spacy
from gensim import models, similarities
from tqdm import tqdm
import joblib
from sklearn.manifold import TSNE
from sklearn.decomposition import IncrementalPCA
import plotly as py
import plotly.graph_objs as go
from modules.utils.CustomTwokenizer import CustomTwokenizer

### Initialize requirements

In [2]:
# Names of the spaCy model packages. Only spacy_glove_model is passed to spacy.load below;
# spacy_en_model is not referenced anywhere else in the visible notebook.
spacy_en_model = "en_core_web_md"
spacy_glove_model = "en_vectors_glove_md"
# Load the GloVe-vector pipeline. NOTE(review): create_make_doc presumably installs the
# project's CustomTwokenizer as the tokenizer factory (spaCy 1.x keyword) - confirm.
nlp = spacy.load(spacy_glove_model, create_make_doc=CustomTwokenizer)
# Switch plotly to offline notebook mode so the iplot() calls further down render inline.
py.offline.init_notebook_mode(connected=True)

In [3]:
def most_similar(word, n):
    """Return the ``n`` vocabulary entries most similar to ``word``.

    Candidates are drawn from ``word.vocab``. A candidate is kept only if it
    is not noise (out-of-vocabulary, punctuation, number-like, a stop word or
    the Twitter marker "rt"), has a vector, differs from the query by
    lowercase form, matches the query's ``is_lower`` flag and has a log
    probability of at least -15.

    Args:
        word: Query lexeme; must expose ``vocab``, ``lower_``, ``is_lower``
            and ``similarity(other)``.
        n: Maximum number of neighbours to return.

    Returns:
        A ``(lexemes, scores)`` tuple of two parallel lists ordered by
        decreasing similarity, each at most ``n`` long.
    """
    def is_candidate(w):
        # The noise checks must inspect the candidate itself; testing the
        # query word here would let stop words, "rt" etc. through unfiltered.
        if w.is_oov or w.is_punct or w.like_num or w.is_stop or w.lower_ == "rt":
            return False
        return (
            w.has_vector
            and w.lower_ != word.lower_
            and w.is_lower == word.is_lower
            and w.prob >= -15
        )

    # Score every surviving candidate exactly once, then rank on that score.
    scored = [(w, word.similarity(w)) for w in word.vocab if is_candidate(w)]
    # list.sort is stable, so ties keep their vocabulary order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:n]
    return [w for w, _ in top], [score for _, score in top]

### Test dependency2vec model against word2vec model (top _n_ results)

In [4]:
# Locations of the trained 200-dimensional embedding files (per the "dim200" file names).
# The two "*_word" paths are the ones loaded into KeyedVectors in the next cell.
hatespeech_dep_word = "data/persistence/word_embeddings/dim200vecs_hatespeech_dep"
hs_candidates_exp6_word = "data/persistence/word_embeddings/dim200vecs_hs_candidates_exp6"
# NOTE(review): presumably the matching context vectors; these two paths are not
# used anywhere else in the visible notebook.
hatespeech_dep_context = "data/persistence/word_embeddings/dim200context-vecs_hatespeech_dep"
hs_candidates_exp6_context = "data/persistence/word_embeddings/dim200context-vecs_hs_candidates_exp6"

#### Load trained dependency2vec models

In [5]:
# Read both embeddings as gensim KeyedVectors. binary=False selects the plain-text
# word2vec format rather than the binary one.
hatespeech_model = models.KeyedVectors.load_word2vec_format(hatespeech_dep_word, binary=False)
hs_candidates_exp6_model = models.KeyedVectors.load_word2vec_format(hs_candidates_exp6_word, binary=False)

#### Top _n_ output for annotated hatespeech model

In [6]:
# Ten nearest neighbours of the probe term "savages" in the annotated hatespeech embedding.
hatespeech_model.similar_by_word("savages", topn=10, restrict_vocab=None)

[('wetbacks', 0.9985026717185974),
 ('something', 0.9980785846710205),
 ('nothing', 0.9980483055114746),
 ('chinks', 0.9975727796554565),
 ('dumb', 0.9975383281707764),
 ('stupid', 0.9971219301223755),
 ('nice', 0.9970505833625793),
 ('mouth', 0.9969522953033447),
 ('birthday', 0.9969327449798584),
 ("they're", 0.9969197511672974)]

#### Top _n_ output for hs_candidate model

In [7]:
# Ten nearest neighbours of the probe term "savages" in the hs_candidates_exp6 embedding.
hs_candidates_exp6_model.similar_by_word("savages", topn=10, restrict_vocab=None)
# Other probes tried during exploration (not run here): most_similar() with a
# positive and a negative term (topn=15), and a pairwise similarity() between two terms.

[('dumbasses', 0.9721543788909912),
 ('fags', 0.9646444320678711),
 ('thots', 0.9623584151268005),
 ('bullies', 0.9599213600158691),
 ('crybabies', 0.9590451717376709),
 ('twats', 0.9585676193237305),
 ('lesbians', 0.9585357904434204),
 ('hypocrites', 0.9576173424720764),
 ('anarchists', 0.95649254322052),
 ('creatures', 0.9542888402938843)]

#### Top _n_ output for GloVe Common Crawl model (word2vec)

In [8]:
# Fetch the lexeme for the probe term from the GloVe-backed spaCy vocabulary.
word = nlp.vocab[u'savages']
gloVe_result = most_similar(word, 10)
# most_similar returns (lexemes, scores); walk both lists in lockstep.
for lexeme, score in zip(*gloVe_result):
    print((lexeme.lower_, score))

('civilized', 0.61444840037822668)
('bastards', 0.51958825149203092)
('bigots', 0.50898474635928193)
('racists', 0.50549636015559973)
('ignorant', 0.50096436094533614)
('muslims', 0.49439498127830261)
('uneducated', 0.48886437758990831)
('rednecks', 0.48630034151617468)
('morons', 0.47793725834430528)
('primitive', 0.47348814316940263)


### Visualize the embedding space

Here we reduce the dimensionality of the word vectors in each model using [t-SNE](http://lvdmaaten.github.io/tsne/) and plot the results in a 2D space in order to evaluate the quality of the models.

- http://nlp.yvespeirsman.be/blog/visualizing-word-embeddings-with-tsne/
- https://www.jeffreythompson.org/blog/2017/02/13/using-word2vec-and-tsne/

#### Setup PCA reduce function

It is highly recommended to use another dimensionality reduction method to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high. This will suppress some noise and speed up the computation of pairwise distances between samples. This makes our vector space more manageable before running tsne.

In [9]:
# Cache files for the PCA-reduced vectors of each model (written and read with joblib).
gloVe_model_pca_reduction = 'data/persistence/word_embeddings/gloVe_model_pca_reduction.pkl.compressed'
hatespeech_model_pca_reduction = 'data/persistence/word_embeddings/hatespeech_model_pca_reduction.pkl.compressed'
hs_candidates_exp6_model_pca_reduction = 'data/persistence/word_embeddings/hs_candidates_exp6_model_pca_reduction.pkl.compressed'

# Cache files for the 2D t-SNE projections computed from those PCA reductions.
gloVe_model_tsne_reduction = 'data/persistence/word_embeddings/gloVe_model_tsne_reduction.pkl.compressed'
hatespeech_model_tsne_reduction = 'data/persistence/word_embeddings/hatespeech_model_tsne_reduction.pkl.compressed'
hs_candidates_exp6_model_tsne_reduction = 'data/persistence/word_embeddings/hs_candidates_exp6_model_tsne_reduction.pkl.compressed'

def pca_reduction(vectors, num_dimensions, model_name):
    """Project ``vectors`` onto ``num_dimensions`` components and cache the result.

    Fits an ``IncrementalPCA`` on ``vectors``, writes the transformed data to
    the path ``model_name`` with ``joblib.dump`` (compression enabled) and
    returns that transformed data. Progress is reported on stdout.
    """
    print('Reducing to {}D using IncrementalPCA...'.format(num_dimensions))
    reducer = IncrementalPCA(n_components=num_dimensions)
    reduced = reducer.fit_transform(vectors)
    # Persist the reduction so later runs can load it instead of refitting.
    joblib.dump(reduced, model_name, compress=True)
    print('Done')
    return reduced

#### Setup plot function

In [10]:
def plot_word_embedding(X_tsne, vocab, chart_title, show_labels):
    """Draw a 2D scatter of projected embeddings with plotly's offline ``iplot``.

    Args:
        X_tsne: 2D array-like indexed as ``X_tsne[:, 0]`` (x) and
            ``X_tsne[:, 1]`` (y).
        vocab: Text attached to the points; only used when ``show_labels``
            is truthy.
        chart_title: Title placed on the figure layout.
        show_labels: When truthy, draw markers with text; otherwise markers only.
    """
    # Text is attached to the trace only when labels were requested.
    display_mode, display_text = ('markers+text', vocab) if show_labels else ('markers', None)

    marker_style = {
        'size': 14,
        'line': {'width': 0.5},
        'opacity': 0.3,
        'color': 'rgba(217, 217, 217, 0.14)',
    }
    scatter = go.Scatter(
        x=X_tsne[:, 0],
        y=X_tsne[:, 1],
        mode=display_mode,
        text=display_text,
        marker=marker_style,
    )
    figure = {"data": [scatter], "layout": go.Layout(title=chart_title)}
    py.offline.iplot(figure)

#### 1.1 Load gloVe embedding

In [11]:
# Gather the vocabulary entries that have a vector and an all-lowercase surface form.
# NOTE(review): the entries pass through a set, so which 10000 end up in the subset
# depends on set ordering; the saved output for this cell reports 1000, not 10000.
gloVe_vectors = list(set(w for w in nlp.vocab if w.has_vector and w.orth_.islower()))
gloVe_vector_subset = gloVe_vectors[:10000]
# Labels and vectors are built from the same subset so they stay aligned by index.
gloVe_vocab = [w.lower_ for w in gloVe_vector_subset]
print("GloVe vocab length: {}".format(len(gloVe_vocab)))
gloVe_X = [w.vector for w in gloVe_vector_subset]

GloVe vocab length: 1000


#### 1.2 Run PCA reduction

In [12]:
try:
    # Reuse the saved PCA reduction when the cache file can be read.
    gloVe_reduced_vectors = joblib.load(gloVe_model_pca_reduction)
except IOError:
    # Cache miss: reduce the GloVe vectors to 20 dimensions; pca_reduction also writes the cache file.
    # NOTE(review): an existing cache is reused even if gloVe_X has changed since it was written.
    gloVe_reduced_vectors = pca_reduction(gloVe_X, 20, gloVe_model_pca_reduction)

Reducing to 20D using IncrementalPCA...
Done


#### 1.3 Reduce with TSNE

In [13]:
try:
    # Reuse the saved 2D projection when the cache file can be read.
    gloVe_X_tsne = joblib.load(gloVe_model_tsne_reduction)
except IOError:
    # t-SNE is stochastic: pin the seed so a cache miss reproduces the same
    # layout on every run instead of a different one each time.
    gloVe_tsne = TSNE(n_components=2, random_state=42)
    gloVe_X_tsne = gloVe_tsne.fit_transform(gloVe_reduced_vectors)
    # Save the projection so later runs skip the expensive fit.
    joblib.dump(gloVe_X_tsne, gloVe_model_tsne_reduction, compress=True)

#### 1.4 Visualize model in 2D space

In [14]:
# Markers only (show_labels=False): gloVe_vocab is passed but not drawn as text.
plot_word_embedding(gloVe_X_tsne, gloVe_vocab, "GloVe Common Crawl Corpus [word2vec]", False)

#### 2.1 Load the annotated hatespeech model embedding

In [15]:
# Index the model with its own vocab mapping to pull the vectors for its words.
# NOTE(review): presumably the rows follow the iteration order of `vocab`, which is
# what keeps them aligned with hatespeech_model_vocab below - confirm for this gensim version.
hatespeech_model_vectors = hatespeech_model[hatespeech_model.vocab]
hatespeech_model_X = hatespeech_model_vectors
hatespeech_model_vocab = list(hatespeech_model.vocab)
print("Hatespeech model vocab length: {}".format(len(hatespeech_model_vocab)))

Hatespeech model vocab length: 476


#### 2.2 Run PCA reduction

In [16]:
try:
    # Reuse the saved PCA reduction when the cache file can be read.
    hatespeech_reduced_vectors = joblib.load(hatespeech_model_pca_reduction)
except IOError:
    # Cache miss: reduce the hatespeech vectors to 20 dimensions; pca_reduction also writes the cache file.
    # NOTE(review): an existing cache is reused even if hatespeech_model_X has changed since it was written.
    hatespeech_reduced_vectors = pca_reduction(hatespeech_model_X, 20, hatespeech_model_pca_reduction)

Reducing to 20D using IncrementalPCA...
Done


#### 2.3 Reduce with TSNE

In [17]:
try:
    # Reuse the saved 2D projection when the cache file can be read.
    hatespeech_model_X_tsne = joblib.load(hatespeech_model_tsne_reduction)
except IOError:
    # t-SNE is stochastic: pin the seed so a cache miss reproduces the same
    # layout on every run instead of a different one each time.
    hatespeech_model_tsne = TSNE(n_components=2, random_state=42)
    hatespeech_model_X_tsne = hatespeech_model_tsne.fit_transform(hatespeech_reduced_vectors)
    # Save the projection so later runs skip the expensive fit.
    joblib.dump(hatespeech_model_X_tsne, hatespeech_model_tsne_reduction, compress=True)

#### 2.4 Visualize model in 2D space

In [18]:
# show_labels=True: each point is drawn with its vocabulary word as text.
plot_word_embedding(hatespeech_model_X_tsne, hatespeech_model_vocab, "Annotated Hatespeech Model [dependency2vec]", True)

#### 3.1 Load the hatespeech candidate model embedding

In [23]:
# Number of leading entries kept for the visualisation; the vectors and the labels
# are cut with the same value so they stay aligned by index.
hs_candidates_exp6_sample_size = 10812
hs_candidates_exp6_model_vectors = hs_candidates_exp6_model[hs_candidates_exp6_model.vocab]
hs_candidates_exp6_model_X = hs_candidates_exp6_model_vectors[:hs_candidates_exp6_sample_size]
hs_candidates_exp6_model_vocab = list(hs_candidates_exp6_model.vocab)[:hs_candidates_exp6_sample_size]
print("Hatespeech candidates model vocab length: {}".format(len(hs_candidates_exp6_model_vocab)))

Hatespeech candidates model vocab length: 10812


#### 3.2 Run PCA reduction

In [20]:
try:
    # Reuse the saved PCA reduction when the cache file can be read.
    hs_candidates_exp6_reduced_vectors = joblib.load(hs_candidates_exp6_model_pca_reduction)
except IOError:
    # Cache miss: reduce the candidate vectors to 20 dimensions; pca_reduction also writes the cache file.
    # NOTE(review): an existing cache is reused even if hs_candidates_exp6_model_X has changed since it was written.
    hs_candidates_exp6_reduced_vectors = pca_reduction(hs_candidates_exp6_model_X, 20, hs_candidates_exp6_model_pca_reduction)

Reducing to 20D using IncrementalPCA...
Done


#### 3.3 Reduce with TSNE

In [21]:
try:
    # Reuse the saved 2D projection when the cache file can be read.
    hs_candidates_exp6_model_X_tsne = joblib.load(hs_candidates_exp6_model_tsne_reduction)
except IOError:
    # t-SNE is stochastic: pin the seed so a cache miss reproduces the same
    # layout on every run instead of a different one each time.
    hs_candidates_exp6_model_tsne = TSNE(n_components=2, random_state=42)
    hs_candidates_exp6_model_X_tsne = hs_candidates_exp6_model_tsne.fit_transform(hs_candidates_exp6_reduced_vectors)
    # Save the projection so later runs skip the expensive fit.
    joblib.dump(hs_candidates_exp6_model_X_tsne, hs_candidates_exp6_model_tsne_reduction, compress=True)

#### 3.4 Visualize the model in 2D space

In [22]:
# Markers only (show_labels=False): the vocabulary is passed but not drawn as text.
plot_word_embedding(hs_candidates_exp6_model_X_tsne, hs_candidates_exp6_model_vocab, "Hatespeech Candidate Model [dependency2vec]", False)