In [1]:
import numpy as np
import pickle
import pandas as pd
import os
import sklearn
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import csv
import sys

# Root folder for all data files, relative to the notebook's working directory.
# The base paths end in a separator on purpose: later cells build file names by
# plain string concatenation (e.g. BASEPATH_EMBEDDINGS + "aligned_").
BASEPATH_RESOURCES = "../resources/"
BASEPATH_XANEW = os.path.join(BASEPATH_RESOURCES, "XANEW_lexicon/")
BASEPATH_EMBEDDINGS = os.path.join(BASEPATH_RESOURCES, "word_embeddings/")

# Pickle (inside BASEPATH_EMBEDDINGS) with the historical embeddings of the
# fully available words.
FULLY_AVAILABLE_FILENAME = "histWord_fullAvail_1800.pickle"

# Per-year file-name templates; fill the placeholder with `.format(year)`.
VOCAB_FILENAME_TEMPLATE = os.path.join(BASEPATH_EMBEDDINGS, "{}-vocab.pkl")
WORD_EMBEDDINGS_FILENAME_TEMPLATE = os.path.join(BASEPATH_EMBEDDINGS, "{}-w.npy")

In [2]:
# Make modules that live in the parent folder importable from this notebook.
# The membership check keeps sys.path free of duplicate entries when the cell
# is executed more than once in the same kernel.
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

Secondary distribution of the Warriner et al. XANEW lexicon provided by Buechel et al.: https://github.com/JULIELab/XANEW . We are only interested in the valence dimension of the provided VAD model:

In [3]:
# Load the Warriner et al. ratings and keep only the mean valence per word.
# The first CSV column is read as a throw-away index (index_col=0); the word
# itself then becomes the index of the resulting Series, which is named
# 'valence'.
# NOTE(review): with pandas' default NA parsing, literal tokens such as "null"
# or "nan" in the word column would be read as missing values - confirm
# whether the lexicon contains such a word and whether that matters downstream.
xanew_csv_location = os.path.join(BASEPATH_XANEW, 'Ratings_Warriner_et_al.csv')
df_xanew = (
    pd.read_csv(xanew_csv_location, index_col=0)
    .rename(columns={'Word': 'word', 'V.Mean.Sum': 'valence'})
    .set_index('word')['valence']
)
df_xanew

word
aardvark       6.26
abalone        5.30
abandon        2.84
abandonment    2.63
abbey          5.85
               ... 
zone           4.75
zoning         4.65
zoo            7.00
zoom           5.86
zucchini       6.30
Name: valence, Length: 13915, dtype: float64

For training a regression model that predicts valence scores from word embeddings, contemporary word embeddings and the (contemporary) XANEW lexicon are used. The contemporary word embeddings provided by Kozlowski et al. (https://github.com/KnowledgeLab/GeometryofCulture) are trained on the Google Books Ngram Dataset (like the historical word embeddings) and use data from 2000-2012. As the original ANEW lexicon was published in 1999 and XANEW was published in 2013, this seems an appropriate time period.

The contemporary word embeddings come in a 5 GB file, so in order to save RAM they are read chunk-wise, and only those words are kept that are present in the subset of fully available words in the historical embeddings (meaning words that are available from a given start year and throughout all following decades) OR that are present in the XANEW dataset.

In [4]:
embeds2012filename = os.path.join(BASEPATH_EMBEDDINGS, "US_Ngrams_2000_12.csv")
vocab_xanew_set = set(df_xanew.index)

# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# it on a pickle that was produced by a trusted step of this project.
with open(os.path.join(BASEPATH_EMBEDDINGS, FULLY_AVAILABLE_FILENAME), 'rb') as f:
    data_fully_available = pickle.load(f)
vocab_fully_available = set(data_fully_available['1990']["word"]) #doesn't actually matter which year as long as it is present

# A word is kept if it has a XANEW rating OR is fully available historically.
vocab_to_keep = vocab_xanew_set | vocab_fully_available

# Read the large CSV chunk by chunk and keep only the wanted rows. The word is
# in the first column; a vectorised membership test replaces a per-row Python
# callback and also yields a proper boolean mask for an empty chunk.
chunks = []
for chunk in pd.read_csv(embeds2012filename, chunksize=10000):
    chunks.append(chunk[chunk.iloc[:, 0].isin(vocab_to_keep)])

embeds2012df = pd.concat(chunks, ignore_index=True)
embeds2012df = embeds2012df.rename(columns={embeds2012df.columns[0]: 'word'})
embeds2012df

Unnamed: 0,word,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300
0,the,0.246731,0.363338,0.060991,-0.198817,0.310261,0.009408,-0.295411,-0.006240,0.328381,...,-0.056671,0.145127,0.231035,0.068239,-0.191126,0.050652,0.140522,-0.099371,0.229379,-0.112136
1,of,0.093149,0.418176,-0.116714,0.270946,0.204331,0.245896,-0.118507,0.086045,0.247306,...,0.175549,-0.097769,0.478173,0.203049,-0.128923,-0.112181,0.162362,-0.268262,0.136539,-0.264232
2,to,0.471927,0.205140,-0.224204,-0.848070,-0.182479,0.067136,-0.584827,-0.059551,0.181575,...,-0.100134,0.343926,0.225350,0.153648,-0.031483,-0.382292,0.358430,-0.526325,-0.065493,-0.254600
3,and,-0.019393,0.107880,-0.075585,-0.377154,-0.229693,-0.107939,-0.318801,-0.612747,0.340766,...,0.020978,0.322502,0.196349,-0.151370,0.021184,-0.182635,-0.104195,-0.173470,-0.165159,-0.096662
4,a,0.212165,0.260075,0.237165,-0.006635,0.019352,-0.182506,-0.045717,-0.039354,0.214503,...,-0.288301,0.241163,0.217152,-0.055507,-0.274351,0.356617,0.084031,-0.269011,0.041584,0.096775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21188,cockamamie,0.233569,0.091536,-0.056433,-0.228170,-0.163259,0.042050,0.098792,-0.089111,-0.089200,...,-0.049771,0.180473,0.241582,-0.025437,-0.043793,-0.096972,0.117947,-0.174045,-0.061766,-0.068300
21189,amaretto,0.158590,0.048587,-0.067799,-0.153656,-0.186635,-0.043126,-0.172249,-0.137021,-0.056120,...,-0.002677,0.078793,0.086313,0.017446,0.032037,-0.136778,-0.041912,-0.044062,0.092137,0.038940
21190,dogface,0.159761,-0.002160,-0.031744,-0.120493,-0.108183,0.036673,-0.071997,-0.068909,-0.052637,...,0.073368,0.074484,0.051996,0.018723,0.007258,-0.034385,-0.039118,-0.010991,0.042650,0.060248
21191,applejack,0.136554,0.065858,-0.160253,-0.120877,-0.135318,0.040741,-0.027766,-0.119300,-0.066428,...,0.040380,0.116637,0.084052,0.005058,-0.088599,-0.083506,0.017294,-0.067914,-0.052102,0.008748


Saving the 2012 data to disk in the same format as the historical word embeddings in order to be able to use the embedding-space-alignment code provided by Hamilton et al. This will be used to align the 2012 embeddings to the historical embeddings.

In [5]:
# Write the filtered 2012 data as two files: a pickled word list and an .npy
# matrix holding the matching vectors.
vocab_2012_path = VOCAB_FILENAME_TEMPLATE.format("2012")
vectors_2012_path = WORD_EMBEDDINGS_FILENAME_TEMPLATE.format("2012")

words_2012 = list(embeds2012df["word"])
with open(vocab_2012_path, 'wb') as vocab_file:
    pickle.dump(words_2012, vocab_file)

# Every column after the leading 'word' column belongs to the embedding.
vectors_2012 = embeds2012df.iloc[:, 1:]
with open(vectors_2012_path, 'wb') as vectors_file:
    np.save(vectors_file, vectors_2012)

Aligning the 2012-embedding

In [6]:
# Embedding-space alignment adapted from:
# https://github.com/williamleif/histwords/blob/31e4d200310ebd4051776828eccb8b60c2120427/vecanalysis/seq_procrustes.py
#
# These modules are not part of the standard imports in the first cell;
# presumably they are resolved through the parent folder that an earlier cell
# appends to sys.path, so they have to be imported after that cell has run.
# NOTE(review): words_above_count and mkdir are not used anywhere in the
# visible notebook code.

from vecanalysis import alignment
from representations.representation_factory import create_representation
from ioutils import write_pickle, words_above_count, mkdir

# NOTE(review): same value as assigned in the chunked-loading cell above; the
# name is not read again in the visible code below.
embeds2012filename = os.path.join(BASEPATH_EMBEDDINGS, "US_Ngrams_2000_12.csv")

def align_years(years, rep_type, in_dir, out_dir, words, **rep_args):
    """Align per-year embeddings one after another and write each to disk.

    The first year is written unchanged. Each following year is aligned to
    the previously written embedding with
    ``alignment.smart_procrustes_align``.

    Parameters
    ----------
    years : iterable
        Years to process, in order. ``str(year)`` is appended to ``in_dir``
        and ``out_dir`` to form the file prefixes.
    rep_type
        Passed through to ``create_representation``.
    in_dir : str
        Prefix of the input files (plain string concatenation, so it must
        already end with a path separator if it names a folder).
    out_dir : str
        Prefix of the output files; ``<out_dir><year>-w.npy`` and
        ``<out_dir><year>-vocab.pkl`` are written for every year.
    words : mapping
        Maps ``str(year)`` to the word collection handed to ``get_subembed``.
    **rep_args
        Extra keyword arguments forwarded to ``create_representation``.
    """
    previous_aligned = None
    for year in years:
        year_key = str(year)
        print("Loading year:", year)
        year_embed = create_representation(rep_type, in_dir + year_key, **rep_args)
        year_words = words[year_key]
        # NOTE(review): the return value of get_subembed is discarded here. If
        # it returns a new embedding instead of restricting `year_embed` in
        # place, this call has no effect - confirm against the implementation.
        year_embed.get_subembed(year_words)
        print("Aligning year:", year)
        if previous_aligned is None:
            # Nothing to align against yet: the first year is the reference.
            aligned_embed = year_embed
        else:
            aligned_embed = alignment.smart_procrustes_align(previous_aligned, year_embed)
        previous_aligned = aligned_embed
        print("Writing year:", year)
        out_prefix = out_dir + year_key
        np.save(out_prefix + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, out_prefix + "-vocab.pkl")
        
# Align 2012 to 1990: 1990 is processed first and therefore acts as the
# reference; both results are written with the "aligned_" file-name prefix.
alignment_vocab = {
    '1990': vocab_fully_available,
    '2012': list(embeds2012df['word']),
}
align_years(
    [1990, 2012],
    'word2vec',
    BASEPATH_EMBEDDINGS,
    BASEPATH_EMBEDDINGS + "aligned_",
    alignment_vocab,
)


Loading year: 1990
Aligning year: 1990
Writing year: 1990
Loading year: 2012
Aligning year: 2012
Writing year: 2012


Deleting the old non-aligned 2012 embeddings:

In [7]:
# Rebind the names that held the unaligned 2012 data to empty lists
# (presumably so the large objects can be released).
chunks = []
embeds2012df = []

# Remove the unaligned 2012 files written before the alignment step.
for stale_path in (
    WORD_EMBEDDINGS_FILENAME_TEMPLATE.format("2012"),
    VOCAB_FILENAME_TEMPLATE.format("2012"),
):
    os.remove(stale_path)

Now that the 2012 embeddings have been aligned, a regression model that predicts valence-scores from word-embeddings can be trained, using the 2012 embeddings and the XANEW lexicon.

In [8]:
# Load the aligned 2012 vocabulary and vectors. Context managers guarantee
# that both file handles are closed again.
with open(os.path.join(BASEPATH_EMBEDDINGS, "aligned_2012-vocab.pkl"), "rb") as f:
    vocab2012 = pickle.load(f)
with open(os.path.join(BASEPATH_EMBEDDINGS, "aligned_2012-w.npy"), "rb") as f:
    embeds2012 = np.load(f)

# Build the frame directly from the float matrix and add the words as the
# first column. Stacking words and vectors into one array would turn every
# number into a string first and leave object-dtype columns behind.
# Column labels are 'word', 1, 2, ..., n_dims.
embeds2012df = pd.DataFrame(
    embeds2012.astype(float),
    columns=range(1, embeds2012.shape[1] + 1),
)
embeds2012df.insert(0, 'word', list(vocab2012))

# Attach the valence rating to each word; the left join followed by dropna
# removes every row with a missing value, i.e. words without a rating.
data = embeds2012df.merge(df_xanew, left_on='word', right_index=True, how="left").dropna()

#dataframe containing for each word the embeddings as well as the valence-score
data

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,300,valence
10,be,-0.073921,-0.017005,0.041815,-0.0618,-0.027994,0.062716,-0.049718,-0.015574,-0.016903,...,0.049053,0.056376,-0.01235,0.045981,0.049847,-0.025249,-0.057048,-0.072376,-0.020001,6.18
21,have,-0.085668,-0.017344,-0.060283,-0.05498,-0.007186,0.052359,-0.048267,-0.011891,0.014289,...,0.012468,0.139748,0.010565,0.014454,-0.050947,-0.043515,0.040321,0.020061,-0.02495,5.86
30,do,-0.026111,0.006054,-0.061897,-0.042794,-0.120985,-0.021435,0.045168,-0.020929,0.006951,...,-0.070293,0.152879,-0.038581,-0.06477,-0.029448,0.031269,0.026742,-0.019348,0.009928,5.41
31,one,0.017421,-0.107683,0.010804,-0.060565,-0.003685,-0.040135,-0.037717,-0.027218,0.032969,...,-0.00091,0.136521,0.018862,-0.072262,-0.034973,0.013554,-0.004832,-0.067237,0.028584,6.09
40,can,0.010438,-0.004003,0.012444,0.009934,-0.015006,-0.067744,0.025649,-0.029684,-0.010708,...,0.037644,0.06413,-0.007332,-0.006802,0.098844,0.054912,0.026327,-0.040598,0.028588,6.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21188,cockamamie,0.014528,0.040717,0.08493,-0.006598,-0.078333,0.054029,-0.027264,0.074691,-0.001976,...,0.071557,0.136786,0.034515,-0.115798,-0.085939,-0.038124,0.056204,0.005665,-0.072134,4.14
21189,amaretto,0.026179,0.078425,0.041433,-0.016075,-0.012127,-0.015286,-0.069652,0.026618,-0.032376,...,0.029965,0.105333,0.064314,-0.10273,0.008727,-0.133615,0.03053,-0.105485,-0.019015,6.00
21190,dogface,0.061255,0.111247,0.019108,-0.037107,-0.020661,-0.008131,-0.008643,0.033557,-0.002511,...,-0.021859,-0.000553,0.049622,-0.089446,0.028525,-0.132994,0.062472,-0.138655,-0.020023,3.95
21191,applejack,0.03337,0.103647,0.026528,-0.030663,-0.055398,0.038067,-0.024408,0.097765,0.014814,...,0.02476,0.080152,0.042838,-0.034143,-0.048138,-0.093605,0.101309,-0.036631,0.034687,5.79


In [9]:
# Show the XANEW valence Series that supplies the regression targets.
df_xanew

word
aardvark       6.26
abalone        5.30
abandon        2.84
abandonment    2.63
abbey          5.85
               ... 
zone           4.75
zoning         4.65
zoo            7.00
zoom           5.86
zucchini       6.30
Name: valence, Length: 13915, dtype: float64

Performing train-test split to check if the model is working as intended:

In [10]:
# Hold out 20% of the rated words to sanity-check the regression.
# The features are all columns except the target; 'word' is carried through
# the split so the held-out words can be recorded, and is dropped before
# fitting.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='valence'), data['valence'], test_size=0.2, random_state=42
)
testwords = list(X_test['word'])
X_train = X_train.drop('word', axis = 1)
X_test = X_test.drop('word', axis = 1)

# NOTE(review): this check uses Ridge's default regularisation strength, while
# the model trained on the full data in a later cell uses alpha = 100 -
# confirm that evaluating a different setting is intended.
reg_traintest = Ridge()
reg_traintest.fit(X_train, y_train)
preds = reg_traintest.predict(X_test)

# Root-mean-squared error on the hold-out set. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated and later removed from scikit-learn.
float(np.sqrt(sklearn.metrics.mean_squared_error(y_test, preds)))



0.8786606352703527

In [11]:
# Persist the hold-out model and the list of held-out words next to the
# notebook so the evaluation can be repeated elsewhere.
holdout_artifacts = (
    ("regression_model_trainset.pkl", reg_traintest),
    ("testset_words.pkl", testwords),
)
for artifact_name, artifact in holdout_artifacts:
    with open(artifact_name, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

Training model on the full data and saving the result to disk for later use:

In [12]:
# Final model: ridge regression with alpha = 100, fitted on every row of
# `data`. Columns 1..300 hold the embedding values (column 0 is the word);
# the target is the 'valence' column.
embedding_matrix = data.iloc[:, 1:301].to_numpy()
reg = Ridge(alpha=100).fit(embedding_matrix, data['valence'])

with open("../regression_model.pkl", 'wb') as model_file:
    pickle.dump(reg, model_file)

All that's left is to take only those words from the 2012 embeddings that are fully available (from the historical start year throughout the decades) and to append them to the file that currently contains the fully available words and their embeddings.

In [13]:
# Keep only those 2012 words that are also available for the other decades,
# ordered alphabetically.
in_all_decades = embeds2012df['word'].isin(vocab_fully_available)
embeds2012df = embeds2012df[in_all_decades].sort_values(by = "word")

# Conversely, keep only those words of the other decades that are available in
# the 2012 embeddings. Every frame is sorted by word as well (presumably so
# that rows line up across years).
reduced_2012_wordset = set(embeds2012df["word"])

for year in list(data_fully_available):
    year_frame = data_fully_available[year]
    keep_rows = year_frame["word"].isin(reduced_2012_wordset)
    data_fully_available[year] = year_frame.loc[keep_rows].sort_values(by = "word")

data_fully_available['2012'] = embeds2012df

# Word list shared by all years, including 2012.
with open(os.path.join(BASEPATH_RESOURCES, "fullAvalList.pkl"), 'wb') as wordlist_file:
    pickle.dump(reduced_2012_wordset, wordlist_file)

# NOTE: this overwrites the pickle that was loaded as input earlier in the
# notebook, now with the reduced vocabulary and the additional '2012' entry.
with open(os.path.join(BASEPATH_EMBEDDINGS, FULLY_AVAILABLE_FILENAME), 'wb') as embeddings_file:
    pickle.dump(data_fully_available, embeddings_file)

Checking if the alignment was successful by calculating the residual sum of squares (RSS) between the embeddings of consecutive decades up to 2012:

In [14]:
# Residual sum of squares between the embedding matrices of consecutive years
# (in the dictionary's key order). Rows are compared by position, which relies
# on every frame holding the same words in the same order.
years_in_order = list(data_fully_available.keys())

for previous_year, year in zip(years_in_order, years_in_order[1:]):
    # Column 0 is the word; the remaining columns are the embedding values.
    previous_matrix = data_fully_available[previous_year].iloc[:, 1:]
    matrix = data_fully_available[year].iloc[:, 1:]
    rss = np.sum(np.square(matrix.to_numpy() - previous_matrix.to_numpy()))
    print("RSS for {} - {} : {:.2f}".format(previous_year, year, rss))

RSS for 1800 - 1810 : 7133.27
RSS for 1810 - 1820 : 8786.49
RSS for 1820 - 1830 : 9558.01
RSS for 1830 - 1840 : 9123.23
RSS for 1840 - 1850 : 8810.37
RSS for 1850 - 1860 : 8739.76
RSS for 1860 - 1870 : 8727.94
RSS for 1870 - 1880 : 8614.50
RSS for 1880 - 1890 : 8706.18
RSS for 1890 - 1900 : 8713.47
RSS for 1900 - 1910 : 9109.31
RSS for 1910 - 1920 : 9277.74
RSS for 1920 - 1930 : 9470.97
RSS for 1930 - 1940 : 9525.75
RSS for 1940 - 1950 : 9614.07
RSS for 1950 - 1960 : 9695.19
RSS for 1960 - 1970 : 10018.94
RSS for 1970 - 1980 : 10436.41
RSS for 1980 - 1990 : 10656.92
RSS for 1990 - 2012 : 11992.38
