In [1]:
import pandas as pd
import gensim
import pickle



In [2]:
def get_clean_data(path):
    """Read a JSON file into a pandas DataFrame and flatten dictionary cells.

    Every cell that holds a dictionary is replaced by the first value of that
    dictionary; every other cell is left untouched.

    Parameters
    ----------
    path : string
        Location of the Wordnik json file

    Returns
    -------
    df : Pandas DataFrame
        Cleaned DataFrame of the table. A cell holding an empty dictionary
        becomes a missing value (None) instead of raising IndexError.
    """
    def _unwrap(cell):
        # isinstance (rather than an exact type comparison) also covers
        # dict subclasses such as OrderedDict.
        if not isinstance(cell, dict):
            return cell
        # Take the first value without materialising a list; the default
        # keeps an empty dict from raising.
        return next(iter(cell.values()), None)

    df = pd.read_json(path)
    for c in df.columns:
        df[c] = df[c].apply(_unwrap)
    return df

def intialize_listwords_listvecs(from_existing=True):
    """Return the word-list and list-vector mappings to build upon.

    Parameters
    ----------
    from_existing : bool, default True
        If True, load both objects from 'wordlists.pickle' and
        'listvecs.pickle' in the current working directory. If False, start
        from two empty dictionaries.

    Returns
    -------
    w : object
        Object unpickled from 'wordlists.pickle', or an empty dict.
    v : object
        Object unpickled from 'listvecs.pickle', or an empty dict.

    Raises
    ------
    FileNotFoundError
        If `from_existing` is True and either pickle file does not exist.
    """
    if not from_existing:
        return {}, {}

    # NOTE: unpickling can execute arbitrary code; only load pickle files
    # that come from a trusted source.
    with open('wordlists.pickle', 'rb') as handle:
        w = pickle.load(handle)
    with open('listvecs.pickle', 'rb') as handle:
        # encoding='latin1' lets 8-bit strings written by Python 2 decode.
        # This is the public equivalent of building a private
        # pickle._Unpickler and setting its `encoding` attribute.
        v = pickle.load(handle, encoding='latin1')
    return w, v

def update_listwords(w):
    """Add the words of every not-yet-known word list to `w`.

    Reads the table at 'new_valid_list_data/valid_listed_words' through
    get_clean_data and, for each distinct `wordListId` that is not already a
    key of `w`, stores the `lcword` values of the rows carrying that id.

    Parameters
    ----------
    w : dict
        Mapping from word list id to the words of that list. It is updated
        in place.

    Returns
    -------
    w : dict
        The same mapping that was passed in, with the new lists added.
    """
    table = get_clean_data('new_valid_list_data/valid_listed_words')
    # Decide which ids are new before touching `w`.
    new_ids = [list_id for list_id in table.wordListId.unique() if list_id not in w]
    for list_id in new_ids:
        in_list = table.wordListId == list_id
        w[list_id] = table[in_list].lcword.values
    return w

def update_listvecs(w, v, model):
    """Add a mean word vector for every list in `w` that `v` does not have yet.

    Parameters
    ----------
    w : dict
        Mapping from word list id to an iterable of words.
    v : dict
        Mapping from word list id to a vector. It is updated in place.
    model : object
        Either an object exposing keyed vectors as `model.wv`, or the keyed
        vectors themselves. The vectors must expose their vocabulary as
        `key_to_index` (gensim >= 4) or `vocab` (older gensim) and return a
        2-D array when indexed with a list of words.

    Returns
    -------
    v : dict
        The same mapping that was passed in. Lists with no word in the
        vocabulary get no entry.
    """
    # A bare KeyedVectors object has no usable `.wv` on newer gensim, so fall
    # back to the object itself.
    vectors = getattr(model, 'wv', model)
    # `vocab` was replaced by `key_to_index` in gensim 4; support both.
    vocab = getattr(vectors, 'key_to_index', None)
    if vocab is None:
        vocab = vectors.vocab

    for list_id in [i for i in w if i not in v]:
        known = [word for word in w[list_id] if word in vocab]
        if known:
            # Average the word vectors row-wise into one vector per list.
            v[list_id] = vectors[known].mean(axis=0)
    return v

def create_pickles(model, from_existing=True):
    """Refresh the word-list and list-vector pickles and write them to disk.

    Loads (or initialises) both mappings, adds any new word lists and their
    vectors, then overwrites 'wordlists.pickle' and 'listvecs.pickle' in the
    current working directory.

    Parameters
    ----------
    model : object
        Word-vector model handed to update_listvecs.
    from_existing : bool, default True
        Passed to intialize_listwords_listvecs. Use False on a first run,
        when the two pickle files do not exist yet; with True a missing file
        raises FileNotFoundError.
    """
    w, v = intialize_listwords_listvecs(from_existing=from_existing)
    w = update_listwords(w)
    v = update_listvecs(w, v, model)
    with open('wordlists.pickle', 'wb') as handle:
        pickle.dump(w, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('listvecs.pickle', 'wb') as handle:
        pickle.dump(v, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# Load the word vectors from the text-format (binary=False) word2vec file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    './model/wiki.en/wiki.en.vec',
    binary=False,
)

In [4]:
# Update the word lists and their vectors with the loaded model, then write
# wordlists.pickle and listvecs.pickle.
create_pickles(model)

