In [1]:
import numpy 
import pickle
import operator
# import smart_open
# import gensim
# from gensim.models import Word2Vec
# from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import csv

In [6]:
# loading the vocabulary (list)

### code taken from https://stackoverflow.com/questions/24906126/how-to-unpack-pkl-file
 
with open('all-english/1960-vocab.pkl', 'rb') as f:
    vocab_1960 = pickle.load(f)
with open('all-english/1990-vocab_ae.pkl', 'rb') as f:
    vocab_1990_ae = pickle.load(f)
with open('sgns/1990-vocab.pkl', 'rb') as f:
    vocab_1990 = pickle.load(f)
with open('sgns/2000-vocab.pkl', 'rb') as f:
    vocab_2000 = pickle.load(f)
    
# loading the models (numpy array)

model_1960 = numpy.load('all-english/1960-w.npy')
model_1990_ae = numpy.load('all-english/1990-w_ae.npy')
model_1990 = numpy.load('sgns/1990-w.npy')
model_2000 = numpy.load('sgns/2000-w.npy')

In [8]:
# getting individual vectors
model_vectors_1960 = numpy.split(model_1960, 100000)
model_vectors_1990_ae = numpy.split(model_1990_ae, 100000)
model_vectors_1990 = numpy.split(model_1990, 50000)
model_vectors_2000 = numpy.split(model_2000, 50000)
# creating dicts for easy access

full_model_1960 = dict()

for vector, word in zip(model_vectors_1960, vocab_1960):
    full_model_1960[word] = vector
full_model_1990_ae = dict()
for vector, word in zip(model_vectors_1990_ae, vocab_1990_ae):
    full_model_1990_ae[word] = vector
full_model_1990 = dict()
for vector, word in zip(model_vectors_1990, vocab_1990):
    full_model_1990[word] = vector
full_model_2000 = dict()
for vector, word in zip(model_vectors_2000, vocab_2000):
    full_model_2000[word] = vector

In [9]:

with open("D:/2000_labels.tsv", "w", encoding = "utf-8") as f:
    for word in vocab_2000:
        f.write(word+"\n")

import pandas as pd 
pd.DataFrame(model_2000).to_csv("D:/2000_vectors.tsv", sep = "\t", header = None, index = None)

Comments:
- data propocessing: compounds were taken out and all terms were made lowercase
- final_results is a list of two dictionaries; the first one has the 1960s results, the second one the 1990s
- in the dictionaries the keys are the words and the values are the cosine similarities (the terms are NOT separated by target/control_rel/control_unrel; but this data can be easily extracted as they are in that order)
- unknown_words is a list of terms that were not present in the models (NOT separated by model; but they are in the order of first 1960s and then 1990s)


In [10]:
#print twenty most similar words to genocide per decade

models = [full_model_1960, full_model_1990_ae,
          full_model_1990, full_model_2000]
decades = ["1960", "1990_ae", "1990", "2000"]
wordlists=list()
target_terms = ["holocaust","nsdap", "deportation", "crime", "communism", "colonialism", "nuremberg", "nazi", "jew", "hitler", "ss", "eichmann", "gestapo", "nazism", "fascism", "pogrom", 
                "serbia", 'yugoslavia', "balkan", "srebrenica", "bosnia", "muslim", "rwanda", "armenia", "cambodia", "tutsi", "hutu", 'icty', "ictr",
               "reconciliation", "restitution", "meat", "industry", "veganism", "vegetarianism", "apartheid", "europe", "refugee", "past", "present", "shoah", "porajmos", "culture", "stalin", "soviet", "ussr", "ukraine",
               "holodomor", "injustice", "myth", "narrative", "climate", "emissions", "plant", "animal", "ecosystem"]
control_rel_terms = ['war', "murder", 'massacre', 'extermination', "camp", 'uniform', 'violence', 'tribunal', 'prosecution', 'trial', 'atrocity', 'starvation', 'disease', 'epidemic', 'death', "power", "system", "grief", "disaster"]
control_unrel_terms = ['dog', 'table', 'bread', 'dress', 'tree', 'river', 'lamp', 'pen', 'sea', 'sky', 'moon', 'wood', 'milk', 'socks', 'house', 'blue', 'green', 'window', 'glasses', 'football', 'mug', 'bottle', "spoon"]

wordlists.append(target_terms)
wordlists.append(control_rel_terms)
wordlists.append(control_unrel_terms)
unknown_words = list()
final_results = list()
i = 0

for model in models:
    results=dict()
    genocide = numpy.array(model['genocide']).reshape(1,-1)
    for wordlist in wordlists:
        for word in wordlist:
            if word in model.keys():
                word_vector = numpy.array(model[word]).reshape(1,-1)
                result = cosine_similarity(genocide, word_vector)
                results[f'{word}']=result
                sorted_d = sorted(results.items(), key=operator.itemgetter(1), reverse = True)[:20]
                ##@Aga I sorted the dict by highest values to see which terms were most similar
            else:
                unknown_words.append(word)
    final_results.append(decades[i])
    final_results.append(sorted_d)
    final_results.append( "-"*150)
    i+=1
    
for k in final_results:
    print(k)

1960
[('crime', array([[0.29177401]])), ('murder', array([[0.27885793]])), ('atrocity', array([[0.26889012]])), ('injustice', array([[0.22599845]])), ('nuremberg', array([[0.22277304]])), ('gestapo', array([[0.21403559]])), ('restitution', array([[0.21149945]])), ('eichmann', array([[0.19583105]])), ('stalin', array([[0.18079849]])), ('tribunal', array([[0.17988185]])), ('extermination', array([[0.16833657]])), ('violence', array([[0.16784827]])), ('deportation', array([[0.16416615]])), ('hitler', array([[0.15321625]])), ('apartheid', array([[0.148776]])), ('ss', array([[0.14656818]])), ('soviet', array([[0.1392148]])), ('refugee', array([[0.13912779]])), ('war', array([[0.13565476]])), ('trial', array([[0.13546953]]))]
------------------------------------------------------------------------------------------------------------------------------------------------------
1990_ae
[('holocaust', array([[0.43887838]])), ('extermination', array([[0.40110794]])), ('rwanda', array([[0.37295615]

In [23]:
# cosine similarity of all target terms to "genocide"
target_terms = ["holocaust","nsdap", "deportation", "crimes", "communism", "colonialism", "nuremberg", "nazi", "jew", "hitler", "ss", "eichmann", "gestapo", "nazism", "fascism", "pogrom", 
                "cleansing", "serbia", 'yugoslavia', "balkan", "srebrenica", "bosnia", "muslim", "rwanda", "armenia", "cambodia", "tutsi", "hutu", 'icty', "ictr",
               "reconciliation", "restitution", "meat", "industry", "veganism", "vegetarianism", "apartheid", "europe", "refugee", "past", "present", "shoah", "porajmos", "culture", "stalin", "soviet", "ussr", "ukraine",
               "holodomor", "injustice", "myth", "narrative", "climate", "emissions", "plant", "animal", "ecosystem"]

# decades = ["1960", "1990_ae", "1990", "2000"]

unknown_words = list()
final_results = list()
i = 0



results_1960 = []
results_1990_ae = []
results_1990 = []
results_2000 = []

for word in target_terms:
    if word in full_model_1960.keys():
        genocide = numpy.array(full_model_1960['genocide']).reshape(1,-1)
        word_vector = numpy.array(full_model_1960[word]).reshape(1,-1)
        result = cosine_similarity(genocide, word_vector)
        results_1960.append(float(result))
        
    if word in full_model_1990_ae.keys():
        genocide = numpy.array(full_model_1990_ae['genocide']).reshape(1,-1)
        word_vector = numpy.array(full_model_1990_ae[word]).reshape(1,-1)
        result = cosine_similarity(genocide, word_vector)
        results_1990_ae.append(float(result))
        
    if word in full_model_1990.keys():
        genocide = numpy.array(full_model_1990['genocide']).reshape(1,-1)
        word_vector = numpy.array(full_model_1990[word]).reshape(1,-1)
        result = cosine_similarity(genocide, word_vector)
        results_1990.append(float(result))
        
    if word in full_model_2000.keys():
        genocide = numpy.array(full_model_2000['genocide']).reshape(1,-1)
        word_vector = numpy.array(full_model_2000[word]).reshape(1,-1)
        result = cosine_similarity(genocide, word_vector)
        results_2000.append(float(result))  
        
df = pd.DataFrame(list(zip(target_terms, results_1960, results_1990_ae, results_1990, results_2000)),
               columns =['term', '1960', "1990_ae", "1990", "2000"])
print(df)
        



              term      1960   1990_ae      1990      2000
0        holocaust  0.112165  0.438878  0.683783  0.581248
1            nsdap  0.129859  0.132378  0.000000  0.000000
2      deportation  0.164166  0.359414  0.675163  0.417379
3           crimes  0.418743  0.423502  0.695356  0.539210
4        communism  0.027169  0.194285  0.000000  0.000000
5      colonialism  0.086685  0.311652  0.000000  0.000000
6        nuremberg  0.222773  0.261253  0.659730  0.573779
7             nazi  0.128535  0.332928  0.534331  0.407785
8              jew  0.114908  0.150941  0.656605  0.576203
9           hitler  0.153216  0.265693  0.000000  0.389353
10              ss  0.146568  0.068803  0.000000  0.000000
11        eichmann  0.195831  0.072406  0.000000  0.000000
12         gestapo  0.214036  0.199444  0.000000  0.000000
13          nazism  0.050930  0.267095  0.000000  0.000000
14         fascism  0.094527  0.281655  0.703063  0.528234
15          pogrom  0.000000  0.239271  0.657841  0.0000

In [15]:
#code used for statistical tests, 
from scipy import stats

df[["1990", "2000"]].describe()

# stats.shapiro(df['2000'])
# stats.shapiro(df['1990'])
stats.ttest_rel(df['1990'], df['2000'])

Ttest_relResult(statistic=1.8110802419370575, pvalue=0.0838060718131758)