# Word Recommenders Based on Word2Vec and FastText Word Embeddings

In [9]:
import pandas as pd
import numpy as np
import scipy
import gensim
import pickle
import json
from nltk.stem.porter import *
stemmer = PorterStemmer()

## Importing pretrained models

In [17]:
def get_clean_data(path):
    """Read a json file into a pandas dataframe and flatten dictionary cells.

    Every cell that holds a dictionary is replaced by the first value of that
    dictionary, so that each cell ends up as a plain number or string. All
    other cells are left untouched.

    Parameters
    ----------
    path : string
        Location of the Wordnik json file

    Returns
    -------
    df : pandas dataframe
        Cleaned dataframe of table. A cell that held an empty dictionary
        becomes a missing value instead of raising ``IndexError``.
    """
    def _unwrap(cell):
        # isinstance (rather than an exact type comparison) also covers dict
        # subclasses; the None default keeps an empty dict from crashing.
        if isinstance(cell, dict):
            return next(iter(cell.values()), None)
        return cell

    df = pd.read_json(path)
    for col in df.columns:
        df[col] = df[col].apply(_unwrap)
    return df
    
def get_json(path):
    """Load the contents of a json file.

    Parameters
    ----------
    path : string
        Location of the json file

    Returns
    -------
    data : dictionary
        Parsed content of the file (a dictionary for the files used here)
    """
    with open(path, 'rb') as stream:
        raw = stream.read()
    return json.loads(raw)

def get_pickle(path):
    """Get table/dictionary from a pickled file.

    Byte strings stored by Python 2 pickles are decoded as latin1, which lets
    pickles written under Python 2 be loaded here.

    NOTE: unpickling can execute arbitrary code. Only call this on files from
    a trusted source.

    Parameters
    ----------
    path : string
        Location of the pickled file

    Returns
    -------
    data : object
        The unpickled object (a dictionary for the files used in this notebook)
    """
    with open(path, 'rb') as handle:
        # Public API equivalent of building pickle._Unpickler by hand and
        # overriding its encoding attribute.
        return pickle.load(handle, encoding='latin1')

def check_coverage(listed_words, mod):
    """Check the coverage of our word embedding over the listed words. Print results.

    Five lines are printed: the vocabulary size, the number of unique listed
    words, how many of those are in the vocabulary, that count as a fraction
    of the unique words, and the fraction of all (non-unique) listings whose
    word is in the vocabulary. Both fractions are reported as 0.0 when there
    are no listed words.

    Parameters
    ----------
    listed_words : pandas dataframe
        dataframe of the "listed word" table from Wordnik; only its
        ``lcword`` column is read

    mod : fasttext word embeddings
        model, or the word embeddings from Fasttext. Its vocabulary is read
        from ``mod.vocab`` when available, otherwise from ``mod.key_to_index``
        (the name used by gensim >= 4).
    """
    all_words = listed_words.lcword.values
    unique_words = set(all_words)

    vocab = getattr(mod, 'vocab', None)
    if vocab is None:
        vocab = mod.key_to_index

    # Count each intersection once and reuse it for both the count and the ratio.
    n_unique_known = sum(1 for w in unique_words if w in vocab)
    n_listings_known = sum(1 for w in all_words if w in vocab)

    # Guard the divisions so an empty table reports 0.0 instead of raising.
    unique_ratio = 1.0 * n_unique_known / len(unique_words) if len(unique_words) else 0.0
    listing_ratio = 1.0 * n_listings_known / len(all_words) if len(all_words) else 0.0

    print('Model dictionary size: {}'.format(len(vocab)))
    print('Count of unique listed words: {}'.format(len(unique_words)))
    print('Intersection of the above two: {}'.format(n_unique_known))
    print('% of listed words that is in the dictionary: {}'.format(unique_ratio))
    print('Listings of words (non-unique) that are on our model vocabulary: {}'.format(listing_ratio))
    
def find_words(input_words, mod, n_outputs=1):
    """Given an input list of words, find recommended words based on similarities.

    Only input words present in the model vocabulary are used for the query.
    Neighbours that are not purely alphabetic, or that are themselves input
    words, are dropped afterwards, so fewer than ``n_outputs`` words may be
    returned.

    Parameters
    ----------
    input_words : list
        list of words that the recommendation will be based on

    mod : fasttext word embedding
        must provide ``most_similar`` and a vocabulary, read from
        ``mod.vocab`` when available, otherwise from ``mod.key_to_index``
        (the name used by gensim >= 4)

    n_outputs: integer
        the number of most similar words requested from the model

    Returns
    -------
    out : list
        list of words recommended; empty when none of the input words is in
        the vocabulary

    """
    vocab = getattr(mod, 'vocab', None)
    if vocab is None:
        vocab = mod.key_to_index

    known = [w for w in input_words if w in vocab]
    if not known:
        # There is nothing to query with; skip the model call instead of
        # letting it fail on an empty input.
        return []

    neighbours = mod.most_similar(known, topn=n_outputs)
    out = [word for word, _ in neighbours if word.isalpha() and word not in input_words]
    return out[:n_outputs]

def find_rank(lst, dic):
    """Attach a popularity count and a popularity rank to each word.

    Words found in ``dic`` come first (in their original order), followed by
    the words that are missing from it, which get a count of 0.

    Parameters
    ----------
    lst : list
        list of words
    dic : dictionary
        the "popularity dictionary": number of times each word is searched/tagged/listed

    Returns
    -------
    df : pandas dataframe
        indexed by word, with columns ``cnt`` (popularity count) and ``rank``
        (rank of ``cnt``, highest count ranked first)
    """
    known, unknown = [], []
    for word in lst:
        bucket = known if word in dic else unknown
        bucket.append(word)

    counted = pd.DataFrame([dic[word] for word in known], index = known, columns = ['cnt'])
    zeros = pd.DataFrame([0] * len(unknown), index = unknown, columns = ['cnt'])

    table = pd.concat([counted, zeros])
    table['rank'] = table['cnt'].rank(ascending=False)
    return table

def filter_words(input_, lst):
    """Filter out words that have the same stem as any input word.

    Parameters
    ----------
    input_ : list
        list of input words

    lst : list
        list of recommended words

    Returns
    -------
    out : list
        list of trimmed down version of lst that does not contain words that share same stems with input_
    """
    # Stem the input words once, rather than once per candidate word.
    input_stems = {stemmer.stem(word) for word in input_}
    out = [w for w in lst if stemmer.stem(w) not in input_stems]
    return out

def find_filter_rank_words(input_, dic1, dic2, dic3, mod, n_outputs_ = 5, multiple = 5):
    """Recommend words similar to the input words, ordered by combined popularity.

    The most similar words are fetched, words sharing a stem with an input
    word are removed, and the rest are ranked against each of the three
    popularity dictionaries. The three ranks are summed, only the first word
    of each stem is kept, and the words with the smallest rank sum are returned.

    Parameters
    ----------
    input_ : list
        input words from user

    dic1, dic2, dic3 : dictionary
        dictionaries of word popularity

    mod : fasttext word embeddings

    n_outputs_ : integer
        the number of words in the final recommendation, after filtering and ranking

    multiple : integer
        scalar multiplied to n_outputs_ to get the number of most similar words requested before filtering

    Returns
    -------
    rec : numpy array
        final recommendation of words
    """
    candidates = filter_words(input_, find_words(input_, mod, n_outputs = n_outputs_*multiple))

    # One rank table per popularity measure, aligned on the word index.
    rank_tables = [find_rank(candidates, dic = popularity) for popularity in (dic1, dic2, dic3)]
    combined = pd.concat(rank_tables, axis=1).reset_index()

    # The three tables all contribute a column named 'rank', so this selects
    # all of them at once and adds them up row-wise.
    combined['rank_combine'] = combined['rank'].sum(axis=1)
    combined['stem'] = combined['index'].apply(stemmer.stem)

    deduped = combined.drop_duplicates(['stem'])
    best = deduped.sort_values('rank_combine').iloc[:n_outputs_]
    return best['index'].values

def predict_(s, mod):
    """Given a word, find its embedding. This function is used in List Recommendation.

    Parameters
    ----------
    s : string
        word to look up

    mod : fasttext word embeddings
        anything indexable by word; its ``vector_size`` attribute, when
        present, gives the length of the fallback vector

    Returns
    -------
    vec : numpy array
        ``mod[s]`` when the lookup succeeds, otherwise a zero vector of length
        ``mod.vector_size`` (300 when the model has no such attribute)
    """
    try:
        return mod[s]
    except (KeyError, TypeError):
        # KeyError: word is not in the model. TypeError: key is not a usable
        # word (e.g. a non-string cell). Anything else, including
        # KeyboardInterrupt, is left to propagate.
        return np.zeros(getattr(mod, 'vector_size', 300))
    
def averaging(arrays):
    """Return the element-wise mean of a collection of word embedding vectors. This function is used in List Recommendation.

    Each vector is turned into a column, the columns are placed side by side,
    and the mean is taken across them.
    """
    columns = [np.reshape(vec, (-1, 1)) for vec in pd.Series(arrays)]
    side_by_side = np.concatenate(columns, axis=1)
    return np.mean(side_by_side, axis=1)

def find_lists(test, list_vecs, listed_words, word_lists, mod, n_outputs=3):
    """Given a list of words, recommend a list from the existing Wordnik lists

    The input words are embedded and averaged, and the average is compared to
    every list vector by cosine similarity. For the ``n_outputs`` most similar
    lists, the name, the creator and the words of the list are printed.
    Nothing is returned.

    Parameters
    ----------
    test : list
        input words

    list_vecs : dictionary
        key : id of the list; value : vector representing the list

    listed_words : pandas dataframe
        table of listed words; columns ``wordListId`` and ``lcword`` are read

    word_lists : pandas dataframe
        table of word lists; columns ``_id``, ``name`` and ``createdBy`` are read

    mod : fasttext word embeddings

    n_outputs : integer
        number of lists to print
    """
    # Imported here so the function does not rely on a bare `import scipy`
    # having loaded the `scipy.spatial` submodule.
    from scipy.spatial.distance import cosine

    test_arrays = pd.Series(test).apply(lambda x: predict_(x, mod))
    test_avg = averaging(test_arrays)

    # Keep (similarity, list id) pairs. Keying a dict by similarity would let
    # two lists with the same similarity overwrite each other.
    scored = []
    for list_id in list_vecs:
        similarity = 1 - cosine(test_avg, list_vecs[list_id])
        # Also drops NaN similarities, for which both comparisons are False.
        if 0 <= similarity <= 1:
            scored.append((similarity, list_id))
    scored.sort(key=lambda pair: pair[0], reverse=True)

    for _, best_id in scored[:n_outputs]:
        meta = word_lists[word_lists._id == best_id]
        listed_words_ = listed_words[listed_words.wordListId == best_id].lcword.unique()
        print('\n')
        print('List name: {}'.format(meta['name'].iloc[0]))
        print('Creator: {}'.format(meta['createdBy'].iloc[0]))
        print('Words in list: {}'.format(listed_words_))

## Loading data and model (word embedding)

In [11]:
model = gensim.models.KeyedVectors.load_word2vec_format('./model/wiki.en/wiki.en.vec', binary=False)
# load the fastText "wiki.en" word embedding from its text-format (.vec) file

In [12]:
listed_words = get_clean_data('new_valid_list_data/valid_listed_words')
# pandas dataframe of the Wordnik table of listed words (one row per listing of a word)
word_lists = get_clean_data('new_valid_list_data/valid_list_metadata')
# pandas dataframe of the Wordnik table of word lists (list metadata)
list_vecs = get_pickle('listvecs.pickle')
# dictionary of vector representations of all the existing word lists (average of the embeddings of all the words in the list)
# key : id of the list; value : embedding vector

In [13]:
pop_listed = dict(listed_words.lcword.value_counts())
pop_search = get_json('word_cnts.json')
pop_tag = get_json('word_numtags_map.json')
# these 3 dictionaries give, for every word, the number of times it is listed (pop_listed),
# searched (pop_search) and tagged (pop_tag)

## EDA: Checking vocabulary coverage

In [14]:
# Print how much of the listed-word vocabulary is covered by the embedding model.
check_coverage(listed_words, mod = model)

Model dictionary size: 2519370
Count of unique listed words: 484355
Intersection of the above two: 151439
% of listed words that is in the dictionary: 0.31266116794499904
Listings of words (non-unique) that are on our model vocabulary: 0.698293419856025


## Recommending New Words

In [15]:
# Example input words used by the recommendation cells below.
test = ['delta','analytics','data','analysis','model','analyze','statistics','database']

In [16]:
# Recommend 10 words, chosen from up to 50 (n_outputs_ * multiple) nearest neighbours.
find_filter_rank_words(test, pop_listed, pop_search, pop_tag, mod = model, n_outputs_ = 10, multiple = 5)

NameError: name 'pit' is not defined

In [None]:
# Same recommendation with a wider pool: up to 100 (n_outputs_ * multiple) nearest neighbours.
find_filter_rank_words(test, pop_listed, pop_search, pop_tag, mod = model, n_outputs_ = 10, multiple = 10)

## Recommending Existing Lists

In [None]:
# Print the 3 existing word lists whose vectors are most similar to the example words.
find_lists(test, list_vecs, listed_words, word_lists, model, n_outputs=3)