# Dependency2vec code word identification

We experiment with methods for evaluating code words based on average cosine similarity and traditional word embeddings

- See issue [#85](https://github.com/JherezTaylor/thesis-preprocessing/issues/85) and [#91](https://github.com/JherezTaylor/thesis-preprocessing/issues/91)

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before each
# cell executes, so edits to the project's `modules` package take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob, os
from pprint import pprint
import joblib
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from modules.db import elasticsearch_base
from modules.preprocessing import neural_embeddings
from modules.utils import file_ops, model_helpers, settings

## Initialize params and objects

Here we define common functions for loading our embeddings and extracting the vocabulary and vocabulary counts. `ft_word_embeddings` and `w2v_word_embeddings` each store a list of references to embedding models that exist on disk. In this notebook the dependency2vec models are loaded into `embeddings` (see *Load models* below).

#### Method definitions

In [3]:
def get_raw_vocab(df):
    """Build a word-level vocabulary from the "text" column of a dataframe.

    A CountVectorizer (word analyzer, English stop words removed) is fitted on
    df["text"].

    Args:
        df: object indexable by "text" that yields an iterable of raw documents.

    Returns:
        tuple: (number of vocabulary entries, the fitted vectorizer's
        ``vocabulary_`` mapping).
    """
    # fit() returns the vectorizer itself, so the call can be chained.
    vectorizer = CountVectorizer(analyzer='word', stop_words='english').fit(df["text"])
    term_index = vectorizer.vocabulary_
    return len(term_index), term_index

#### Load models

In [4]:
# Request the "kv" embedding models with ids 5, 0, 7, 3 and 4 and load them.
# Per the id -> name listing this cell prints, those ids are the twitter, dstormer,
# ustream, manch and melvynhs dim200 models.
# NOTE(review): the positional assignments below assume get_embeddings returns the
# models in the same order as model_ids — confirm against neural_embeddings.
embeddings = neural_embeddings.get_embeddings("kv", model_ids=[5,0,7,3,4], load=True)
# If nothing was returned, the dep2vec_* names stay undefined; cells that use them
# without this same guard will then fail with NameError.
if embeddings:
    dep2vec_twitter = embeddings[0] 
    dep2vec_dstormer = embeddings[1]
    dep2vec_ustream = embeddings[2]
    dep2vec_manchester = embeddings[3]
    dep2vec_melvyn_hs = embeddings[4]

0 dim200vecs_dstormer_conll
1 dim200vecs_hs_candidates_exp6
2 dim200vecs_inaug_conll
3 dim200vecs_manch_conll
4 dim200vecs_melvynhs_conll
5 dim200vecs_twitter_conll
6 dim200vecs_uselec_conll
7 dim200vecs_ustream_conll


#### Load dataframes and other objects

In [5]:
# Connection handle passed to every elasticsearch_base.aggregate call in this notebook.
_es = elasticsearch_base.connect(settings.ES_URL)
# The joblib dataset loads below are currently disabled.
# df_naacl = joblib.load(settings.NAACL_2016_DATA)
# df_nlp = joblib.load(settings.NLP_2016_DATA)
# df_crwdflr = joblib.load(settings.CRWDFLR_DATA)

# Combine the entries read for "hate_1", "hate_2" and "hate_3" into one set, which
# removes duplicates and is later used for membership tests.
# NOTE(review): `+` assumes read_csv_file returns a concatenable sequence (e.g. list) — confirm.
hs_keywords = set(file_ops.read_csv_file("hate_1", settings.TWITTER_SEARCH_PATH) +
              file_ops.read_csv_file("hate_2", settings.TWITTER_SEARCH_PATH) +
              file_ops.read_csv_file("hate_3", settings.TWITTER_SEARCH_PATH))

## Let's get to work

### Get word frequencies from corpora

#### Manchester event

In [6]:
# Aggregate on the "tokens.keyword" field of "manchester_event" (size=100000, min_doc_count=5).
# NOTE(review): result[0] is presumably the per-token aggregation and result[1] the
# total tweet count (it is printed as such) — confirm against elasticsearch_base.aggregate.
result = elasticsearch_base.aggregate(_es, "manchester_event", "tokens.keyword", False, size=100000, min_doc_count=5)
print("Manchester tweet count: {0}".format(result[1]))
# Unpacks two values; manchester_vocab is what the candidate-selection cells consume.
manchester_hs, manchester_vocab = model_helpers.get_els_word_probabilities(result[0], result[1])

Manchester tweet count: 617698


In [7]:
# Select candidate code words from the Manchester model, the Manchester vocabulary and
# the hs_keywords set. NOTE(review): the selection criteria live in
# model_helpers.select_candidate_codewords and are not visible in this notebook.
candidate_codewords = model_helpers.select_candidate_codewords(dep2vec_manchester, manchester_vocab, hs_keywords)
pprint(candidate_codewords)

{'22.05.17': 6e-05,
 'abbot': 5e-05,
 'absurdly': 4e-05,
 'activist': 8e-05,
 'addresses': 0.00046,
 'amendment': 5e-05,
 'anxious': 0.00013,
 'asleep': 8e-05,
 'asshole': 0.00026,
 'axe': 6e-05,
 'background': 0.00031,
 'banksy': 2e-05,
 'barack’s': 9e-05,
 'barbershop': 0.00014,
 'bastard': 0.00029,
 'bearing': 2e-05,
 'billionaire': 0.00019,
 'bitch': 0.00066,
 'bleeds': 6e-05,
 'blocks': 6e-05,
 'boy': 0.00134,
 'brunch': 4e-05,
 'bryan': 0.00014,
 'bull': 0.0001,
 'bullies': 3e-05,
 'cake': 0.00016,
 'catastrophic': 3e-05,
 'chick': 0.0001,
 'choice': 0.00051,
 'clips': 6e-05,
 'coalition': 0.0001,
 'com': 0.00119,
 'comfortable': 8e-05,
 'comp': 4e-05,
 'congratulations': 0.00066,
 'consolation': 2e-05,
 'convenient': 8e-05,
 'corbyns': 6e-05,
 'couldnt': 9e-05,
 'creature': 4e-05,
 'cuck': 8e-05,
 'culpable': 4e-05,
 'cunt': 0.00066,
 'dave': 0.00013,
 'dealings': 6e-05,
 'dean': 0.0002,
 'delays': 0.00015,
 'dementia': 8e-05,
 'democrat': 0.00011,
 'deranged': 5e-05,
 'dj': 0.0

In [8]:
# Same selection, but with the Melvyn HS model while still using the Manchester
# vocabulary. This rebinds candidate_codewords, so the earlier result is no longer
# available under that name.
candidate_codewords = model_helpers.select_candidate_codewords(dep2vec_melvyn_hs, manchester_vocab, hs_keywords)
pprint(candidate_codewords)

{'abou': 7e-05,
 'academic': 5e-05,
 'accident': 0.00019,
 'agai': 2e-05,
 'alpha': 1e-05,
 'alternate': 3e-05,
 'animal': 0.0001,
 'ape': 2e-05,
 'aryan': 1e-05,
 'asshat': 3e-05,
 'asshole': 0.00026,
 'atheist': 4e-05,
 'author': 0.00013,
 'bait': 5e-05,
 'bastard': 0.00029,
 'battery': 0.00039,
 'bigot': 8e-05,
 'bird': 7e-05,
 'bitch': 0.00066,
 'blonde': 6e-05,
 'bonus': 0.00011,
 'boot': 9e-05,
 'boring': 0.0002,
 'bounty': 1e-05,
 'brush': 3e-05,
 'buffoon': 6e-05,
 'bull': 0.0001,
 'bureau': 1e-05,
 'burger': 4e-05,
 'busy': 0.00067,
 'celeb': 0.00011,
 'celebrity': 0.00029,
 'changer': 2e-05,
 'chicken': 9e-05,
 'classroom': 1e-05,
 'clock': 0.00017,
 'coincidence': 0.00019,
 'collective': 0.00011,
 'comedy': 0.00022,
 'commie': 3e-05,
 'complex': 0.00014,
 'cooler': 2e-05,
 'cow': 0.00018,
 'crazy': 0.00087,
 'creed': 2e-05,
 'cuck': 8e-05,
 'cunt': 0.00066,
 'dan': 0.00044,
 'dance': 0.00039,
 'dawn': 0.00031,
 'dea': 5e-05,
 'delay': 0.00011,
 'dictionary': 1e-05,
 'disease

#### Dailystormer

In [9]:
# Aggregate on the "tokens.keyword" field of "dailystormer" (size=10000, min_doc_count=5).
# `result` is rebound by every aggregation cell, so it only ever holds the latest one.
result = elasticsearch_base.aggregate(_es, "dailystormer", "tokens.keyword", False, size=10000, min_doc_count=5)
print("Dailystormer document count: {0}".format(result[1]))
dailystormer_hs, dailystormer_vocab = model_helpers.get_els_word_probabilities(result[0], result[1])
# Show the first returned mapping (word -> value, as the printed output shows).
pprint(dailystormer_hs)

Dailystormer document count: 26015
{'ape': 0.00023,
 'apple': 0.00027,
 'bitch': 0.00062,
 'faggot': 0.00023,
 'guinea': 0.00019,
 'idiot': 0.00023,
 'kike': 0.0005,
 'monkey': 0.00031,
 'negro': 0.00104,
 'nigger': 0.00027,
 'property': 0.00092,
 'pussy': 0.00019,
 'queen': 0.00023,
 'retarded': 0.00027,
 'whitey': 0.00023}


#### melvyn_hs_users

In [10]:
# Aggregate on the "tokens.keyword" field of "melvyn_hs" (size=15000, min_doc_count=5).
result = elasticsearch_base.aggregate(_es, "melvyn_hs", "tokens.keyword", False, size=15000, min_doc_count=5)
print("Melvyn HS tweet count: {0}".format(result[1]))
melvyn_users_hs, melvyn_hs_vocab = model_helpers.get_els_word_probabilities(result[0], result[1])
# Spot-check one keyword's value in the vocabulary mapping.
# NOTE(review): presumably its probability in this corpus — confirm against
# model_helpers.get_els_word_probabilities.
pprint(melvyn_hs_vocab['faggot'])

Melvyn HS tweet count: 328627
0.00119


#### unfiltered_stream

In [11]:
# Aggregate on the "tokens.keyword" field of "unfiltered_stream" (size=15000).
# Note min_doc_count=10 here, whereas the other aggregation cells use 5.
result = elasticsearch_base.aggregate(_es, "unfiltered_stream", "tokens.keyword", False, size=15000, min_doc_count=10)
print("Unfiltered stream tweet count: {0}".format(result[1]))
unfiltered_stream_hs, unfiltered_stream_vocab = model_helpers.get_els_word_probabilities(result[0], result[1])
# Spot-check the same keyword as in the Melvyn HS cell, for comparison across corpora.
pprint(unfiltered_stream_vocab['faggot'])

Unfiltered stream tweet count: 3241381
6e-05


#### core_tweets

In [12]:
# Aggregate on the "tokens.keyword" field of "core_tweets" (size=20000, min_doc_count=5).
result = elasticsearch_base.aggregate(_es, "core_tweets", "tokens.keyword", False, size=20000, min_doc_count=5)
print("core_tweets tweet count: {0}".format(result[1]))
core_tweets_hs, core_tweets_vocab = model_helpers.get_els_word_probabilities(result[0], result[1])
# Show the first returned mapping (word -> value, as the printed output shows).
pprint(core_tweets_hs)

core_tweets tweet count: 6843555
{'abo': 0.00019,
 'af': 0.02696,
 'albino': 0.00032,
 'ape': 0.00105,
 'apple': 0.02184,
 'azn': 3e-05,
 'banana': 0.00292,
 'beaner': 8e-05,
 'bint': 5e-05,
 'bird': 0.00616,
 'bitch': 0.05996,
 'blockhead': 3e-05,
 'bogan': 4e-05,
 'bong': 0.00061,
 'boo': 0.01058,
 'boon': 0.00026,
 'boong': 5e-05,
 'brownie': 0.00069,
 'bubble': 0.004,
 'buck': 0.00191,
 'buffie': 4e-05,
 'bumblebee': 0.00011,
 'bung': 4e-05,
 'bunga': 4e-05,
 'celestial': 0.00025,
 'chav': 0.00015,
 'chink': 0.0001,
 'chug': 0.00025,
 'chunky': 0.00118,
 'clam': 0.00025,
 'cocoa': 0.00083,
 'coconut': 0.00183,
 'colored': 0.00141,
 'coloured': 0.0005,
 'coolie': 2e-05,
 'coon': 0.00047,
 'cracker': 0.00096,
 'cripple': 0.00018,
 'crow': 0.00118,
 'cunt': 0.00559,
 'dago': 2e-05,
 'dink': 8e-05,
 'div': 0.00056,
 'divvy': 3e-05,
 'domes': 4e-05,
 'dyke': 0.00042,
 'egg': 0.0046,
 'eggplant': 0.0002,
 'fag': 0.00074,
 'faggot': 0.0015,
 'fez': 0.00011,
 'frog': 0.00163,
 'fruit': 0.0

In [13]:
# Accumulator for cosine-similarity results; starts out empty.
cosine_similarities = list()

In [14]:
# get_word_count(hs_candidates_exp6_model, "fuck")
# hs_candidates_exp6_model_vocab = list(hs_candidates_exp6_model.vocab.keys())
# print(hs_candidates_exp6_model_vocab)
# hs_candidates_exp6_model.similar_by_word("savages", topn=10, restrict_vocab=None)

In [15]:
if embeddings:
    # One (heading, model, query word) entry per corpus-specific embedding to probe;
    # the order fixes the order of the printed sections.
    neighbour_probes = (
        ("\nMain Twitter set", dep2vec_twitter, "bomber"),
        ("\nDaily Stormer", dep2vec_dstormer, "savages"),
        ("\nMelvyn HS users", dep2vec_melvyn_hs, "savages"),
        ("\nUnfiltered stream", dep2vec_ustream, "savages"),
        ("\nManchester", dep2vec_manchester, "bomber"),
    )
    for probe_heading, probe_model, probe_word in neighbour_probes:
        print(probe_heading)
        # Ten nearest neighbours of the query word, searched over the full vocabulary.
        pprint(probe_model.similar_by_word(probe_word, topn=10, restrict_vocab=None))


Main Twitter set
[('blazer', 0.9450125694274902),
 ('windbreaker', 0.937454879283905),
 ('linen', 0.930109977722168),
 ('#shirt', 0.9292298555374146),
 ('hoody', 0.9283322095870972),
 ('zipper', 0.9258227348327637),
 ('trench', 0.9247759580612183),
 ('zip-up', 0.9242134094238281),
 ('shearling', 0.9210963249206543),
 ('vest', 0.9190042018890381)]

Daily Stormer
[('monkeys', 0.9646297693252563),
 ('monsters', 0.9601025581359863),
 ('thugs', 0.9567341208457947),
 ('parasites', 0.9543370604515076),
 ('illegals', 0.9494255185127258),
 ('bastards', 0.9487513303756714),
 ('barbarians', 0.947960615158081),
 ('apes', 0.9478895664215088),
 ('actors', 0.9475318193435669),
 ('gentiles', 0.9470034241676331)]

Melvyn HS users
[('degenerates', 0.8997732400894165),
 ('rings', 0.8950327634811401),
 ('assholes', 0.8936536908149719),
 ('invaders', 0.8933420181274414),
 ('barbarians', 0.8886228203773499),
 ('brands', 0.8879308700561523),
 ('cunts', 0.8854016661643982),
 ('animals', 0.8852415084838867),


In [40]:
# Five nearest neighbours of the query word in the main Twitter model; each entry is a
# (word, similarity) tuple, as the displayed output of the filtered list shows.
temp = dep2vec_twitter.similar_by_word("nigger", topn=5, restrict_vocab=None)

In [42]:
# Keep only the neighbours whose word also appears in hs_keywords.
temp2 = list(filter(lambda neighbour: neighbour[0] in hs_keywords, temp))

In [43]:
# Display the filtered neighbour list.
temp2

[('spic', 0.7711798548698425),
 ('faggot', 0.7599295377731323),
 ('wetback', 0.7536832690238953),
 ('beaner', 0.747124433517456),
 ('kike', 0.7392979860305786)]

In [41]:
def compute_avg_cosine(similarity_result):
    """Return the mean similarity score of a collection of (word, score) entries.

    Args:
        similarity_result: iterable of sequences whose second element is a
            numeric similarity score, e.g. the (word, similarity) tuples
            returned by a model's ``similar_by_word`` call.

    Returns:
        float: arithmetic mean of the scores, or 0.0 when there are no
        entries (for example when every neighbour was filtered out).
    """
    cosine_vals = [entry[1] for entry in similarity_result]
    if not cosine_vals:
        # A filtered neighbour list can be empty; without this guard the
        # division below raises ZeroDivisionError.
        return 0.0
    return sum(cosine_vals) / len(cosine_vals)

0.754243016242981
[('spic', 0.7711798548698425),
 ('faggot', 0.7599295377731323),
 ('wetback', 0.7536832690238953),
 ('beaner', 0.747124433517456),
 ('kike', 0.7392979860305786)]
