In [None]:
import pandas as pd
import spacy
import numpy as np
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
import string
from nltk.corpus import stopwords
import operator

In [None]:
from utils import explore_preds

In [None]:
# Target term for this run. It is interpolated into the input pickle name
# ("predictions/<corpus>_<query>_synparsed_pred_bert.pkl") and into the names
# of the TSV/JSON files written at the end of the notebook.
query = "slave"

In [None]:
def cosine_similarity(x, target):
    """Return the cosine similarity between two 1-D vectors.

    scipy's ``cosine`` is a *distance* (1 - similarity), so it is inverted
    here: 1.0 means same direction, 0.0 orthogonal, -1.0 opposite direction.

    Parameters
    ----------
    x, target : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(x, target)``.
    """
    return 1 - cosine(x, target)

# Load a previously trained gensim Word2Vec model from disk (the path is
# relative to the notebook's working directory). The model is passed to
# explore_preds.w2v_avg_embedding and queried via w2v_model.wv.get_vector below.
# NOTE(review): the directory name suggests a model trained on 1760-1900 text — confirm.
w2v_model = Word2Vec.load("../../workspace/models/word2vec/w2v_1760_1900/w2v_words.model")

In [None]:
# Word clusters: cluster name -> list of surface forms that belong to it.
# A prediction whose token appears in a cluster's list counts fully towards
# that cluster's score in the scoring loop further down.
dWordClusters = {
    "machine": ["machine", "machines", "engine", "engines"],
    "boy": ["boy", "boys", "lad", "lads"],
    "girl": ["girl", "girls"],
    "slave": ["slave", "slaves", "slavery"],
    "artisan": ["artisan", "artisans"],
    "woman": ["woman", "women"],
}

In [None]:
# One vector per cluster, obtained by handing the cluster's word list and the
# word2vec model to explore_preds.w2v_avg_embedding.
# NOTE(review): presumably the mean of the members' embeddings — confirm in utils.explore_preds.
dAvgVectors = {
    cluster_name: explore_preds.w2v_avg_embedding(cluster_words, w2v_model)
    for cluster_name, cluster_words in dWordClusters.items()
}

In [None]:
# Score every row's BERT predictions against each word cluster, per corpus and epoch.
#
# For each corpus, the pickled prediction frame for `query` is loaded. For each
# epoch, one column per cluster named "<cluster>_<epoch>" is appended, and the
# enriched frame is written out as TSV and JSON.
#
# A row's score for a cluster is the sum over its predictions of
#   (score + smoothing)                        if the token is a cluster member
#   similarity(token, cluster) * (score + smoothing)   otherwise
# rounded to 3 decimals. Tokens missing from the word2vec vocabulary are skipped.
# NOTE(review): each prediction is presumably a (token, probability) pair — confirm.

epochs = ["1760_1900", "contemporary"]
smoothing = 0.0001  # added to every prediction score to smooth rounded zero values

for corpus in ["JSA", "RSC", "HMD", "BLB"]:

    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle("predictions/" + corpus + "_" + query + "_synparsed_pred_bert.pkl")
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # Cache of word -> {cluster: similarity}, so each word/cluster similarity
    # is computed only once per corpus instead of once per occurrence.
    dWordEmb = dict()

    for epoch in epochs:

        print(corpus, epoch)

        score_columns = [cl + "_" + epoch for cl in dAvgVectors]
        rows_list = []

        for preds_test in df["pred_bert_" + epoch]:
            dClScores = dict()
            for cl in dAvgVectors:
                aggr_values = []
                for pred in preds_test:
                    word = pred[0]
                    score = pred[1] + smoothing

                    if "#" in word and cl == "punctuation":
                        # A BERT subword is treated as punctuation.
                        aggr_values.append(score)
                    elif word in dWordClusters[cl]:
                        # Exact cluster member: counts with full weight.
                        aggr_values.append(score)
                    else:
                        # Compare against None so that a cached similarity of
                        # exactly 0.0 is still recognised as a cache hit.
                        clusterSim = dWordEmb.get(word, {}).get(cl)
                        if clusterSim is None:
                            try:
                                tmp_emb = w2v_model.wv.get_vector(word)
                            except KeyError:
                                # Word is not in the word2vec vocabulary: it
                                # contributes nothing to this cluster.
                                continue
                            clusterSim = cosine_similarity(tmp_emb, dAvgVectors[cl])
                            dWordEmb.setdefault(word, {})[cl] = clusterSim
                        aggr_values.append(clusterSim * score)

                dClScores[cl + "_" + epoch] = round(sum(aggr_values), 3)
            rows_list.append(dClScores)

        # Build the score frame on df's own index and merge index-on-index.
        # Merging with left_index=True / right_on=... would take the result
        # index from the right frame and silently discard df's original index;
        # it would also fail on an empty df (no key column in the right frame).
        sim_df = pd.DataFrame(rows_list, index=df.index, columns=score_columns)
        df = df.merge(sim_df, left_index=True, right_index=True)

    df.to_csv("experiments/tsv_format/" + corpus.lower() + "_" + query + "_clusters.tsv", sep="\t")
    df.to_json("experiments/json_format/" + corpus.lower() + "_" + query + "_clusters.json")
    print()