In [1]:
import numpy as np
import pickle
import os
import pandas as pd

# Column labels of the embedding dimensions in the per-decade DataFrames.
# Those frames are built from a NumPy matrix, so the dimensions are labelled
# 0, 1, 2, ...; starting this range at 1 would silently drop the first dimension.
# Assumes 300-dimensional vectors -- TODO confirm against the shape of the .npy files.
EMBED_IDXS = range(0, 300)

# Directory that holds the per-decade embedding matrices and vocabularies.
BASEPATH = "../resources/word_embeddings"
# "{}" is filled in with the decade label, e.g. "1800".
WORD_EMBEDDINGS_FILENAME_TEMPLATE = os.path.join(BASEPATH, "{}-w.npy")
VOCAB_FILENAME_TEMPLATE = os.path.join(BASEPATH, "{}-vocab.pkl")
# First decade to load.
startYear = 1800

Load the embeddings and vocabularies into a dictionary of the form `{'1800': DataFrame, '1810': ...}`, where each DataFrame has a `word` column followed by one column per embedding dimension:

In [2]:
# Maps a decade label (e.g. '1800') to a DataFrame whose first column is "word"
# and whose remaining columns are the embedding dimensions.
hist_embeddings = {}

for year in map(str, range(startYear, 1991, 10)):
    word_embeddings_filename = WORD_EMBEDDINGS_FILENAME_TEMPLATE.format(year)
    vocab_filename = VOCAB_FILENAME_TEMPLATE.format(year)
    df = pd.DataFrame(np.load(word_embeddings_filename))

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager makes sure the file handle is closed again.
    with open(vocab_filename, "rb") as vocab_file:
        vocab = pickle.load(vocab_file)

    # Insert "word" as the first column, in front of the embedding columns.
    df.insert(0, "word", vocab)
    hist_embeddings[year] = df

Validate that the sets of vocabularies are equal for all decades:

In [3]:
# One vocabulary set per decade; each is compared against the first one.
dicts = [set(frame["word"]) for frame in hist_embeddings.values()]
bools = [vocab == dicts[0] for vocab in dicts]

all(bools)

True

Inspecting the embeddings manually for certain decades shows that the embeddings of many words appear to consist entirely of zeros. This is probably because these words were discarded due to being infrequent: *"During model learning we also discarded all words within a year that occurred below a certain threshold (500 for the Google data, ...)"*

Counting the nonzero rows for each decade shows that earlier decades generally have fewer of them, i.e. there are ~71k such rows for 1990 but only ~13k for 1800.

In [4]:
# For every decade, count the rows whose embedding components do not sum to zero.
nonzero_wordcounts = {
    decade: np.count_nonzero(frame[EMBED_IDXS].sum(axis=1) != 0)
    for decade, frame in hist_embeddings.items()
}

nonzero_wordcounts

{'1800': 13045,
 '1810': 15771,
 '1820': 20312,
 '1830': 21691,
 '1840': 23818,
 '1850': 29035,
 '1860': 27191,
 '1870': 29320,
 '1880': 34081,
 '1890': 37729,
 '1900': 41551,
 '1910': 36553,
 '1920': 35643,
 '1930': 34477,
 '1940': 34226,
 '1950': 41807,
 '1960': 54332,
 '1970': 60344,
 '1980': 64934,
 '1990': 71097}

The fact that embeddings for many words are missing in the earlier decades raises the question of how many words have embeddings available for all decades. Fortunately, it seems that most words available in 1800 are also available in all other decades (12748 out of 13045):

In [5]:
# Build, per decade, the set of words whose embedding row is non-zero.
vocabs_decades = {}
for year, decade_df in hist_embeddings.items():
    # Select rows with a boolean mask rather than looking up positions from
    # np.where as index labels: positions and labels only coincide while the
    # frame still has its default 0..n-1 index, which is no longer the case
    # after the filtering below, so this cell stays safe to re-run.
    # NOTE(review): a row counts as non-zero when its components do not sum to 0.
    nonzero_rows = np.sum(decade_df[EMBED_IDXS], axis=1) != 0
    vocabs_decades[year] = set(decade_df.loc[nonzero_rows, "word"])

# Words that have a non-zero embedding in every decade.
vocab_fully_available = set.intersection(*vocabs_decades.values())

# Filter and keep only words in the combined vocab, sorted alphabetically so
# that all decades list the words in the same order.
for year in hist_embeddings:
    decade_df = hist_embeddings[year]
    keep_rows = decade_df["word"].isin(vocab_fully_available)
    hist_embeddings[year] = decade_df.loc[keep_rows].sort_values(by="word")

# Check that every decade now has exactly the same vocabulary.
dicts = [set(p["word"]) for p in hist_embeddings.values()]
bools = [vocab == dicts[0] for vocab in dicts]

print("All decades share same dictionary:" , all(bools))
# Look the first decade up via startYear instead of a hard-coded '1800' key.
print("Size of that shared dictionary:", len(hist_embeddings[str(startYear)]))

All decades share same dictionary: True
Size of that shared dictionary: 12748


Save the dictionary, which now contains only the words that are available in every decade from `startYear` onwards, to disk for later use:

In [6]:
# Write the filtered per-decade frames to a pickle named after the first decade.
out_filename = "histWord_fullAvail_{}.pickle".format(startYear)
outpath = os.path.join(BASEPATH, out_filename)
with open(outpath, "wb") as f:
    pickle.dump(hist_embeddings, f)