In [15]:
import numpy as np
import pickle
import os
import sklearn
from sklearn.linear_model import Ridge
from scipy.spatial.distance import pdist, squareform
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import diptest
from scipy.stats import pearsonr
from scipy.stats import gaussian_kde

# Number of closest indices kept per word when the neighbourhoods are built
# (column cut-off applied to the row-wise sorted distance matrix). It is also
# embedded in the name of the results file.
NEIGHBORHOOD_SIZE = 30
# Pickled regression model that is applied to the embeddings to predict valence.
REGRESSION_MODEL_FILENAME = "../regression_model.pkl"
# Directory and file name of the pickled word-embedding data (joined at load time).
BASEPATH_EMBEDDINGS = "../resources/word_embeddings/"
FULLDATE_FILENAME = "histWord_fullAvail_1800.pickle"
# Tab-separated gold lexicon; only its first two columns (word, valence) are read.
HISTORICAL_GOLDLABELS_PATH = "../resources/historical_gold_lexicon/goldEN.vad"

#generated with ChatGPT
def calculate_derivative(data, dx=1):
    """Approximate the first derivative of a sampled series.

    Uses the central difference ``(data[i+1] - data[i-1]) / (2*dx)`` at every
    interior sample. The two end points have no central difference and are
    skipped, so the result has ``len(data) - 2`` entries (an empty list when
    ``data`` has fewer than three samples).

    Args:
        data: Indexable sequence of numeric samples.
        dx: Spacing between consecutive samples.

    Returns:
        list: Central-difference estimates for the interior samples.
    """
    step = 2 * dx
    return [
        (data[k + 1] - data[k - 1]) / step
        for k in range(1, len(data) - 1)
    ]

Load the word embeddings and regression model and predict valence-scores:

In [16]:
# NOTE: unpickling executes arbitrary code, so both files must come from a
# trusted source.
with open(REGRESSION_MODEL_FILENAME, 'rb') as model_file:
    model = pickle.load(model_file)

embeddings_path = os.path.join(BASEPATH_EMBEDDINGS, FULLDATE_FILENAME)
with open(embeddings_path, 'rb') as embeddings_file:
    data = pickle.load(embeddings_file)

# Every column after the first one is passed to the model as a feature; the
# predictions are stored on the same frame as a new 'pred' column.
for year, year_frame in data.items():
    year_frame['pred'] = model.predict(year_frame.iloc[:, 1:])

Evaluate the predictions on the historical lexicon provided by Buechel et al. (https://github.com/JULIELab/HistEmo). Buechel et al. asked their historical-language experts to label words from the perspective of someone living in the 1830s; their labels are therefore compared against the predictions for the 1830 data.

In [17]:
# Gold lexicon: keep only the word and its expert valence rating.
colnames = ["word", "valence_goldlabel"]
historical_goldlabels = pd.read_csv(
    HISTORICAL_GOLDLABELS_PATH,
    sep="\t",
    header=None,
    usecols=[0, 1],
    names=colnames,
)

# Model predictions for the decade the experts were asked to imagine.
frame_1830 = data["1830"]
predicted_goldlabels = pd.DataFrame(
    {
        "word": frame_1830["word"].tolist(),
        "valence_predicted": frame_1830["pred"],
    }
)

# Default (inner) merge: only words present in both sources survive.
merged_df = historical_goldlabels.merge(predicted_goldlabels, on="word")
merged_df

Unnamed: 0,word,valence_goldlabel,valence_predicted
0,deal,5.5,5.350107
1,study,5.5,6.121803
2,afford,2.5,6.279172
3,service,6.5,5.875949
4,height,5.0,5.066674
...,...,...,...
95,walk,6.0,5.845415
96,difference,5.0,5.176209
97,hang,5.0,4.983013
98,following,4.0,5.809124


In [18]:
# Mean expert rating over the words shared by the gold lexicon and the 1830 data.
merged_df.loc[:, 'valence_goldlabel'].mean()

4.885

In [19]:
# Linear agreement between the expert ratings and the model's predictions.
gold_scores = merged_df['valence_goldlabel']
model_scores = merged_df['valence_predicted']
corr_coeff, p_value = pearsonr(gold_scores, model_scores)
print(f"Pearson's r of model: {corr_coeff}")

Pearson's r of model: 0.5061797783040405


Find k nearest neighbors in the embedding-space for each word in each decade

In [21]:
# Maps each decade to an integer array with one row per word: the row holds
# the positions of the NEIGHBORHOOD_SIZE closest words by cosine distance.
closestNeighborsIdxs = {}

for year in data.keys():
    # Neighbours have to be found in embedding space only. Each frame also
    # carries the 'pred' column that was added when the valence predictions
    # were stored; `iloc[:, 1:]` alone would feed that value into the cosine
    # distance as an extra dimension and pull words with similar predicted
    # valence together, so it is dropped here. errors='ignore' keeps the cell
    # working on frames that have no 'pred' column.
    embeddings = data[year].iloc[:, 1:].drop(columns='pred', errors='ignore')

    distances_between_vectors = pdist(embeddings, metric="cosine")
    square_distance_matrix = squareform(distances_between_vectors)

    # Sort every row by ascending distance and keep the first
    # NEIGHBORHOOD_SIZE positions. The diagonal of the square matrix is zero,
    # so a word is normally the first entry of its own neighbourhood.
    closestNeighborsIdxs[year] = np.argsort(square_distance_matrix, axis=1)[:, :NEIGHBORHOOD_SIZE].copy()

    # Release the large distance matrices before the next decade is processed.
    del distances_between_vectors, square_distance_matrix

Calculate Hartigan's dip statistic D for each word in each decade, then approximate the derivative of D across decades and sum its absolute values

In [22]:
# Per-decade lookups, built once instead of once per (word, decade) pair:
#   word_positions[year]    word -> row position inside that decade's frame
#   predicted_valence[year] predicted valence of every row as a NumPy array
word_positions = {}
predicted_valence = {}
for year in data.keys():
    positions = {}
    for position, word in enumerate(data[year]['word']):
        # setdefault keeps the first occurrence, which is what list.index() returns.
        positions.setdefault(word, position)
    word_positions[year] = positions
    # Select the predictions by column name rather than by a hard-coded
    # column position, so the cell does not depend on the embedding width.
    predicted_valence[year] = data[year]['pred'].to_numpy()

# res[word] = [summed absolute derivative of the dip statistic,
#              {year: (neighbour valences, neighbour positions, dip-test p-value)}]
res = {}

for inspected_word in data['2012']['word']:
    res[inspected_word] = [None, {}]
    dips = []  # dip statistic of the word's neighbourhood, one entry per decade
    for inspected_year in data.keys():
        try:
            inspected_word_idx = word_positions[inspected_year][inspected_word]
        except KeyError:
            # Same exception type list.index() raised, with a clearer message.
            raise ValueError(
                "{!r} is not in the vocabulary of {}".format(inspected_word, inspected_year)
            ) from None
        inspected_word_neighbord_idxs = closestNeighborsIdxs[inspected_year][inspected_word_idx]
        neighbor_valence = predicted_valence[inspected_year][inspected_word_neighbord_idxs]
        dip, pval = diptest.diptest(neighbor_valence)
        res[inspected_word][1][inspected_year] = (
            neighbor_valence,
            np.array(inspected_word_neighbord_idxs),
            pval,
        )
        dips.append(dip)

    # Total absolute change of the dip statistic across decades; the first
    # and last decade are not covered by the central difference.
    derivative = np.abs(calculate_derivative(dips))
    area_under_curve = sum(derivative)
    res[inspected_word][0] = area_under_curve

# Words ordered from the largest to the smallest accumulated change.
sorted_dict = dict(sorted(res.items(), key=lambda x: x[1][0], reverse=True))
top_words = list(sorted_dict.keys())

In [23]:
# Only the words and their predicted valence are required for the result
# object; the word embeddings can be discarded.
data_min = {}
for year in data.keys():
    # Select by column name instead of the hard-coded positions [0, 301], so
    # the slimmed frame does not depend on the width of the embeddings.
    data_min[year] = data[year][['word', 'pred']]

# Save result object for later exploration
with open("../results_k_{}.pkl".format(NEIGHBORHOOD_SIZE), 'wb') as f:
    pickle.dump([closestNeighborsIdxs, top_words, data_min], f)