In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim import downloader
from scipy.spatial.distance import cosine
import string
import operator
from pathlib import Path

In [None]:
from utils import explore_preds

In [None]:
def cosine_similarity(x, target):
    """Return the cosine similarity between two 1-D vectors.

    scipy's ``cosine`` returns the cosine *distance*, so the similarity is
    one minus that value.

    Parameters
    ----------
    x, target : array-like
        The two vectors to compare; passed unchanged to
        ``scipy.spatial.distance.cosine``.

    Returns
    -------
    float
        ``1 - cosine(x, target)``.
    """
    return 1 - cosine(x, target)

# Load the pre-trained Word2Vec model stored under the w2v_1760_1900 directory.
w2v_past_model_path = "models/word2vec/w2v_1760_1900/w2v_words.model"
w2v_model_past = Word2Vec.load(w2v_past_model_path)

In [None]:
# Word clusters: each key names a cluster and maps to the word forms that
# belong to it. The key is later combined with the epoch to name the output
# score column (e.g. "slave_1760_1900"), so it must describe its own words.
#
# NOTE: the boy/girl entries used to be keyed "girl" -> ["boy", "boys"] and
# "girls" -> ["girl", "girls"], which mislabelled the boy-cluster scores as
# "girl_<epoch>". They are now keyed "boy" and "girl", following the
# singular-keyword convention of the other clusters. Downstream code that
# read the old "girl_<epoch>" / "girls_<epoch>" columns must be updated.
dWordClusters = {
    "woman": ["woman", "women"],
    "boy": ["boy", "boys"],
    "girl": ["girl", "girls"],
    "slave": ["slave", "slaves"],
    "machine": ["machine", "machines"],
    "artisan": ["artisan", "artisans"],
}

In [None]:
# Run configuration. `epoch` selects the "pred_bert_<epoch>" prediction column
# and is part of the output file names; `keyword` is part of both the input
# and the output file names.
epoch = "1760_1900"
keyword = "slave"

# Choose the Word2Vec model used for the cluster similarities. Every epoch
# other than "contemporary" uses the model loaded as `w2v_model_past`.
if epoch == "contemporary":
    try:
        w2v_model = w2v_model_cont
    except NameError as err:
        # No cell in this notebook loads a contemporary model, so fail with
        # an actionable message instead of a bare "name is not defined".
        raise NameError(
            "epoch is 'contemporary' but w2v_model_cont has not been loaded; "
            "load the contemporary Word2Vec model before running this cell"
        ) from err
else:
    w2v_model = w2v_model_past

# Make sure the directory that receives the result files exists.
Path("experiments/").mkdir(parents=True, exist_ok=True)

In [None]:
# One vector per cluster, as returned by explore_preds.w2v_avg_embedding for
# the cluster's word list and the selected Word2Vec model (presumably the mean
# embedding of the cluster's words -- confirm against utils/explore_preds).
dAvgVectors = {
    cluster_name: explore_preds.w2v_avg_embedding(cluster_words, w2v_model)
    for cluster_name, cluster_words in dWordClusters.items()
}

In [None]:
def _cluster_similarity(token, cl, sim_cache):
    """Return the similarity between a predicted token and a cluster vector.

    The value is ``cosine_similarity`` between the token's Word2Vec vector
    (looked up in ``w2v_model``) and ``dAvgVectors[cl]``. Results are memoised
    in ``sim_cache`` as ``sim_cache[token][cl]``.

    Parameters
    ----------
    token : str
        The predicted token.
    cl : str
        Cluster name (a key of ``dAvgVectors``).
    sim_cache : dict
        Nested dict ``{token: {cluster: similarity}}``; updated in place.

    Returns
    -------
    float or None
        The similarity, or ``None`` when the token is not in the Word2Vec
        vocabulary (``get_vector`` raises ``KeyError``).
    """
    cached = sim_cache.get(token, {})
    # Membership test rather than truthiness, so that a cached similarity of
    # exactly 0.0 counts as a cache hit too.
    if cl in cached:
        return cached[cl]
    try:
        token_emb = w2v_model.wv.get_vector(token)
    except KeyError:
        # Out-of-vocabulary token: deliberately contributes nothing.
        return None
    similarity = cosine_similarity(token_emb, dAvgVectors[cl])
    sim_cache.setdefault(token, {})[cl] = similarity
    return similarity


# Added to every prediction score so that scores rounded to zero still count.
SMOOTHING = 0.0001

pred_column = "pred_bert_" + epoch
score_columns = [cl + "_" + epoch for cl in dAvgVectors]

# Token -> cluster -> similarity cache, so each similarity is computed only
# once. It depends only on the Word2Vec model and the cluster vectors, so it
# is shared by all corpora.
dWordEmb = dict()

for corpus in ["JSA", "RSC", "HMD", "BLB"]:

    print(corpus)

    # NOTE(security): read_pickle can execute arbitrary code contained in the
    # file; only load pickles produced by this project.
    df = pd.read_pickle("data/" + corpus.lower() + "_processed/" + corpus + "_" + keyword + "_synparsed_pred_bert.pkl")

    rows_list = []
    # Each cell of the prediction column is iterated as a sequence of
    # predictions, where pred[0] is the token and pred[1] a numeric score
    # (presumably the BERT probability -- confirm against the upstream step).
    for i, preds_test in df[pred_column].items():
        dClScores = dict()
        for cl in dAvgVectors:
            aggr_values = []
            for pred in preds_test:
                token = pred[0]
                weight = pred[1] + SMOOTHING
                # BERT subwords ("#...") count fully towards a cluster named
                # "punctuation" (only relevant if such a cluster is defined);
                # words that are members of the cluster count fully as well.
                if ("#" in token and cl == "punctuation") or token in dWordClusters[cl]:
                    aggr_values.append(weight)
                    continue
                # Any other token is weighted by its similarity to the cluster.
                similarity = _cluster_similarity(token, cl, dWordEmb)
                if similarity is not None:
                    aggr_values.append(similarity * weight)

            dClScores[cl + "_" + epoch] = round(sum(aggr_values), 3)
        dClScores["df_id"] = i
        rows_list.append(dClScores)

    # Passing the columns explicitly keeps the score columns and the "df_id"
    # merge key present even when the corpus frame has no rows.
    sim_df = pd.DataFrame(rows_list, columns=score_columns + ["df_id"])

    df = df.merge(sim_df, left_index=True, right_on="df_id")

    df = df.drop(columns=["df_id"])
    # Drop the stray index column if the input frame carries one.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # The same table is written in three formats under one file stem.
    out_stem = "experiments/" + keyword + "_" + corpus.lower() + "_" + epoch + "_clusters"
    df.to_csv(out_stem + ".tsv", sep="\t")
    df.to_json(out_stem + ".json")
    df.to_pickle(out_stem + ".pkl")