In [None]:
import pandas as pd
import spacy
import numpy as np
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
import string
from nltk.corpus import stopwords
import operator

In [None]:
from utils import explore_preds

In [None]:
def cosine_similarity(x, target):
    """Return the cosine similarity between two vectors.

    scipy's ``cosine`` is a *distance* (1 - similarity), so it is inverted
    here to obtain the similarity.

    Parameters
    ----------
    x, target : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``1 - cosine(x, target)``.
    """
    return 1 - cosine(x, target)

# Load the previously trained word2vec model from disk. Its word vectors
# (``w2v_model.wv``) are what the rest of the notebook compares against.
# NOTE(review): the path suggests the model covers 1760-1850 -- confirm.
w2v_model = Word2Vec.load(
    "../../workspace/models/word2vec/w2v_1760_1850/w2v_words.model"
)

In [None]:
# Mapping from cluster name to the list of words that define the cluster.
# A word may deliberately appear in more than one cluster (e.g. "child" is in
# both "workforce" and "child").
dWordClusters = dict()

# Generic clusters: NLTK's English stopword list and the individual
# characters of ``string.punctuation``.
dWordClusters["stopwords"] = stopwords.words('english')
dWordClusters["punctuation"] = list(string.punctuation)

# Hand-picked thematic clusters.
dWordClusters["machines"] = ["instrument", "apparatus", "machine", "engine",
                             "engines", "instruments", "tube", "cylinder",
                             "machinery", "machines", "turbine", "turbines",
                             "boiler", "boilers", "dynamo", "dynamos", "motor",
                             "motors", "apparatuses", "accumulator", "accumulators",
                             "compressor", "compressors", "piston", "valve", "pump",
                             "pistons", "valves", "pumps"]
dWordClusters["transport"] = ["car", "cars", "cart", "carts", "boat",
                              "boats", "ship", "ships", "carriage", "carriages", "train",
                              "trains", "wagon", "wagons", "omnibus", "omnibuses", "ambulance",
                              "ambulances", "steamer", "steamers", "locomotive", "locomotives",
                              "vehicle", "vehicles"]
# "shopkeeper"/"shopkeepers" were previously misspelled ("shopkeeker"), so the
# intended words could never match a prediction.
dWordClusters["workforce"] = ["worker", "workers", "manufacturer", "manufacturers",
                              "slave", "slaves", "servant", "servants", "labourer",
                              "labourers", "workman", "workmen", "workwoman", "workwomen",
                              "employee", "employees", "craftsman", "craftsmen", "craftswoman",
                              "craftswomen", "tradesman", "tradesmen", "tradeswoman",
                              "tradeswomen", "shopkeeper", "shopkeepers", "merchant",
                              "merchants", "dealer", "dealers", "trader", "traders",
                              "tradesperson", "tradespersons", "artisan", "technician",
                              "artisans", "technicians", "maker", "makers", "driver",
                              "drivers", "child", "boy", "girl", "children", "boys",
                              "girls", "lad", "man", "men", "woman", "women",
                              "experts", "person", "persons", "people"]
dWordClusters["energy"] = ["energy", "power", "force", "motion", "electricity", "spark",
                           "light", "flame", "powers", "heat", "gas"]
dWordClusters["upperclass"] = ["gentleman", "gentlemen", "king", "kings",
                               "prince", "princes", "lady", "ladies",
                               "princess", "princesses", "queen", "queens",
                               "aristocrat", "aristocrats"]
dWordClusters["child"] = ["child", "boy", "girl", "children", "youth", "boys",
                          "girls", "lad", "infant", "baby"]
dWordClusters["workanimal"] = ["animal", "animals", "dog", "dogs", "ass", "asses",
                               "mule", "mules", "donkey", "donkeys", "beast", "beasts",
                               "cow", "cows", "ox", "oxen", "pony", "ponies", "creature",
                               "creatures", "bird", "birds", "cat", "cats", "horse",
                               "horses", "sheep", "goat", "goats"]
dWordClusters["spiritual"] = ["soul", "souls", "spirit", "spirits", "life", "lives",
                              "idea", "ideas", "art", "arts", "god", "gods", "thought",
                              "thoughts", "mind", "minds", "heart", "hearts", "angel",
                              "angels", "demon", "demons", "virtue", "virtues",
                              "brain", "brains", "instinct", "instincts"]
dWordClusters["body"] = ["body", "bodies", "organism", "tissue", "skeleton",
                         "frame", "skeletons", "organisms", "tissues", "frames",
                         "flesh", "muscle", "muscles"]
dWordClusters["system"] = ["system", "systems", "plan", "plans", "scheme", "schemes",
                           "infrastructure"]
dWordClusters["nature"] = ["air", "water", "flame", "earth", "sun", "planet",
                           "planets", "world", "wind", "storm", "weather"]
dWordClusters["slave"] = ["slave", "slaves", "slavery"]
dWordClusters["female"] = ["woman", "women", "girl", "girls"]

In [None]:
# One averaged word2vec embedding per cluster, keyed by cluster name. The
# averaging itself is delegated to ``explore_preds.w2v_avg_embedding``.
dAvgVectors = {
    cluster_name: explore_preds.w2v_avg_embedding(cluster_words, w2v_model)
    for cluster_name, cluster_words in dWordClusters.items()
}

In [None]:
# Added to every prediction probability so that probabilities rounded down to
# zero still contribute to a cluster score.
SMOOTHING = 0.0001


def _cluster_scores(preds, epoch, cluster_members, sim_cache):
    """Score one row's predictions against every cluster.

    For each cluster, every prediction contributes:

    * its smoothed probability, if the token is a BERT subword (contains "#")
      and the cluster is "punctuation", or if the token is a member of the
      cluster;
    * otherwise its smoothed probability weighted by the cosine similarity
      between the token's word2vec vector and the cluster's average vector;
    * nothing, if the token is not in the word2vec vocabulary.

    Parameters
    ----------
    preds : iterable
        Predictions; ``pred[0]`` is the token and ``pred[1]`` its probability.
    epoch : str
        Epoch label, appended to each cluster name to form the output key.
    cluster_members : dict
        Cluster name -> set of words belonging to the cluster.
    sim_cache : dict
        ``{token: {cluster: similarity}}`` cache, updated in place.

    Returns
    -------
    dict
        ``{"<cluster>_<epoch>": score}`` with scores rounded to 3 decimals.
    """
    scores = dict()
    for cl in dAvgVectors:
        aggr_values = []
        for pred in preds:
            token = pred[0]
            weight = pred[1] + SMOOTHING
            if "#" in token and cl == "punctuation":
                # BERT subwords are treated as punctuation.
                aggr_values.append(weight)
            elif token in cluster_members[cl]:
                aggr_values.append(weight)
            else:
                token_sims = sim_cache.setdefault(token, {})
                # Membership test rather than truthiness, so that a cached
                # similarity of 0.0 is reused instead of being recomputed.
                if cl not in token_sims:
                    try:
                        tmp_emb = w2v_model.wv.get_vector(token)
                    except KeyError:
                        # Token is not in the word2vec vocabulary: it cannot
                        # be compared to the cluster, so it contributes nothing.
                        continue
                    token_sims[cl] = cosine_similarity(tmp_emb, dAvgVectors[cl])
                aggr_values.append(token_sims[cl] * weight)
        scores[cl + "_" + epoch] = round(sum(aggr_values), 3)
    return scores


epochs = ["1760_1850", "1890_1900"]

# Cache of token-to-cluster similarities so they are not recomputed every time.
# The values depend only on the word2vec model and the cluster vectors, so the
# cache is shared across corpora and epochs.
dWordEmb = dict()

# Sets make the per-prediction membership test O(1) instead of scanning lists.
cluster_members = {cl: set(words) for cl, words in dWordClusters.items()}

for corpus in ["JSA", "RSC", "HMD", "BLB"]:
    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    df = pd.read_pickle("data/" + corpus.lower() + "_processed/" + corpus + "_machine_synparsed_pred_bert.pkl")
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    for epoch in epochs:

        print(epoch)

        score_columns = [cl + "_" + epoch for cl in dAvgVectors]
        rows_list = [
            _cluster_scores(row["pred_bert_" + epoch], epoch, cluster_members, dWordEmb)
            for _, row in df.iterrows()
        ]

        # Build the scores on df's own index and concatenate column-wise.
        # The earlier merge(left_index=True, right_on="df_id") replaced df's
        # index with the positional index of the score frame, duplicated rows
        # on a non-unique index and failed on an empty frame.
        sim_df = pd.DataFrame(rows_list, index=df.index, columns=score_columns)
        df = pd.concat([df, sim_df], axis=1)

    df.to_csv("experiments/" + corpus.lower() + "_clusters.tsv", sep="\t")
    df.to_json("experiments/" + corpus.lower() + "_clusters.json")
    df.to_pickle("experiments/" + corpus.lower() + "_clusters.pkl")