# Word Recommenders Based on Word2Vec and FastText Word Embeddings

In [1]:
import pandas as pd
import numpy as np
import scipy
import gensim
import pickle
import json
from nltk.stem.porter import *
stemmer = PorterStemmer()



## Importing pretrained models

In [3]:
model = gensim.models.KeyedVectors.load_word2vec_format('./model/wiki.en/wiki.en.vec', binary=False)
# model2 = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

In [70]:
def get_clean_data(path):
    df = pd.read_json(path)
    for c in df.columns:
        df[c] = df[c].apply(lambda x: list(x.values())[0] if type(x) == dict else x)
    return df
    
def get_json(path):
    path2 = 'search_counts.json'
    with open(path2, 'rb') as f:
        data = json.load(f)
    return data

def get_pickle(path):
    with open(path, 'rb') as handle:
        pic = pickle.load(handle, encoding='latin1')
    return pic

def check_coverage(all_words, mod):
    unique_words = set(all_words)
    print('Model dictionary size: {}'.format(len(mod.vocab)))
    print('Count of unique listed words: {}'.format(len(unique_words)))
    print('Intersection of the above two: {}'.format(len([w for w in unique_words if w in mod.vocab])))
    print('% of listed words that is in the dictionary: {}'.format(1.0 * len([w for w in unique_words if w in mod.vocab]) / len(unique_words)))
    print('Listings of words (non-unique) that are on our model vocabulary: {}'.format(1.0*len([w for w in all_words if w in mod.vocab])/len(all_words)))
    
def find_words(input_words, mod, n_outputs=1):
    return [w[0] for w in mod.most_similar([w for w in input_words if w in mod.vocab],topn=n_outputs) if w[0].isalpha() and w[0] not in input_words][:n_outputs]

def find_rank(lst, dic):
    lsty = [w for w in lst if w in dic]
    lstn = [w for w in lst if w not in dic]
    df = pd.DataFrame([dic[w] for w in lsty], index = lsty, columns = ['cnt'])
    df = pd.concat([df,pd.DataFrame([0]*len(lstn), index = lstn, columns = ['cnt'])])
    df['rank'] = df.cnt.rank(ascending=False)
    return df

def filter_words(input_, lst):
    return [w for w in lst if stemmer.stem(w) not in [stemmer.stem(i) for i in input_]]

def find_filter_rank_words(input_, dic1, dic2, dic3, mod, n_outputs_ = 5, multiple = 5):
    lst_ = find_words(input_, mod, n_outputs = n_outputs_*multiple)
    lst = filter_words(input_, lst_)
    rank_l = find_rank(lst, dic = dic1)
    rank_s = find_rank(lst, dic = dic2)
    rank_t = find_rank(lst, dic = dic3)
    df = pd.concat([rank_l, rank_s, rank_t], axis=1).reset_index()
    df['rank_combine'] = df['rank'].sum(axis=1)
    df['stem'] = df['index'].apply(lambda x: stemmer.stem(x))
    df.drop_duplicates(['stem'],inplace=True)
    return df.sort_values('rank_combine').iloc[:n_outputs_]['index'].values

def predict_(s, mod):
    try:
        return mod[s]
    except:
        return np.zeros(300)
    
def averaging(arrays):
    arrays_ = pd.Series(arrays).apply(lambda x: np.reshape(x, (-1, 1)))
    return np.mean(np.concatenate(arrays_.values, axis=1), axis=1)

def find_lists(test, list_vecs, listed_words, word_lists, mod, n_outputs=3):
    test_arrays = pd.Series(test).apply(lambda x: predict_(x, mod))
    test_avg = averaging(test_arrays)
    dists = {}
    for wordlist in list_vecs:
        dist = 1 - scipy.spatial.distance.cosine(test_avg, list_vecs[wordlist])
        if dist >= 0 and dist <= 1:
            dists[dist] = wordlist
    ds = sorted(dists.keys(), reverse=True)[:n_outputs]
    lists = [dists[d] for d in ds]
    for bestlist in lists:
        list_name = word_lists[word_lists._id==bestlist].name
        creator = word_lists[word_lists._id==bestlist].createdBy
        listed_words_ = listed_words[listed_words.wordListId==bestlist].lcword.unique()
        print('\n')
        print('List name: {}'.format(list_name.iloc[0]))
        print('Creator: {}'.format(creator.iloc[0]))
        print('Words in list: {}'.format(listed_words_))

In [44]:
def get_pickle(path):
    with open(path, 'rb') as handle:
        pic = pickle.load(handle, encoding='latin1')
    return pic

In [59]:
# listed_words = get_clean_data('new_valid_list_data/valid_listed_words')
# word_lists = get_clean_data('new_valid_list_data/valid_list_metadata')
# list_vecs = get_pickle('listvecs.pickle')
# pop_search = get_json('search_counts.json')
# all_words = listed_words.lcword.values
pop_listed = dict(listed_words.lcword.value_counts())
pop_search = get_json('word_cnts.json')
pop_tag = get_json('word_numtags_map.json')

## EDA: Checking vocubulary coverage

In [49]:
check_coverage(all_words, mod = model)

Model dictionary size: 2519370
Count of unique listed words: 484355
Intersection of the above two: 151439
% of listed words that is in the dictionary: 0.31266116794499904
Listings of words (non-unique) that are on our model vocabulary: 0.698293419856025


## Recommending New Words

In [50]:
test = ['delta','analytics','data','analysis','model','analyze','statistics','database']

In [66]:
find_filter_rank_words(test, pop_listed, pop_search, pop_tag, mod = model, n_outputs_ = 10, multiple = 5)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




array(['quantitative', 'qualitative', 'visualization', 'methodologies',
       'dataset', 'benchmarking', 'predictive', 'analyses',
       'datastructure', 'geostatistics'], dtype=object)

In [67]:
find_filter_rank_words(test, pop_listed, pop_search, pop_tag, mod = model, n_outputs_ = 10, multiple = 10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




array(['quantitative', 'heuristics', 'qualitative', 'metrics',
       'visualization', 'methodologies', 'biometrics', 'multivariate',
       'forecasting', 'computational'], dtype=object)

## Recommending Existing Lists

In [71]:
find_lists(test, list_vecs, listed_words, word_lists, model, n_outputs=3)



List name: Work Words
Creator: mandyshea
Words in list: ['analysis' 'analyst' 'analytics' 'analyze' 'verbiage']


List name: Words of the future
Creator: beohbe
Words in list: ['portfolio' 'data' 'research' 'profile' 'grammar challenge']


List name: SCIE - mathematics
Creator: gulyasrobi
Words in list: ['nonparametric' 'nonparametric statistics' 'multivariate analysis'
 'partial differential equation' 'multivariate' 'topology' 'stochastic'
 'differential equation' 'linear algebra' 'harmonic analysis'
 'applied mathematics' 'combinatorial' 'nonlinear' 'computational'
 'set theory' 'linear programming' 'parametric' 'numerical analysis'
 'group theory' 'statistical method' 'asymptotic' 'mathematical logic'
 'discrete' 'probabilistic' 'boolean' 'estimator' 'differential' 'fractal'
 'symmetric' 'markov' 'simulation' 'mathematics' 'graphical'
 'mathematical' 'extremum' 'algebraic' 'statistical' 'equation'
 'algorithm' 'geometric' 'fermat' 'cosine' 'actuary' 'linear' 'randomness'
 'analysi