# Word Recommenders Based on Word2Vec and FastText Word Embeddings

In [0]:
import pandas as pd
import numpy as np
import scipy
import gensim
import pickle
import json
import os
import zipfile
import gc
from nltk.stem.porter import *
stemmer = PorterStemmer()

### Unzip data and pretrained models from google drive to running directory ###

Skip this step if data is elsewhere

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
delta_dir = '/content/gdrive/My Drive/Delta_Analytics'

In [0]:
with zipfile.ZipFile(os.path.join(delta_dir, 'wiki-news-300d-1M.vec.bin.zip'), 'r') as zip_ref:
    zip_ref.extractall(os.getcwd())

In [0]:
with zipfile.ZipFile(os.path.join(delta_dir, 'train_test_list.zip'), 'r') as zip_ref:
    zip_ref.extractall(os.getcwd())
    
with zipfile.ZipFile(os.path.join(delta_dir, 'new_valid_list_data.zip'), 'r') as zip_ref:
    zip_ref.extractall(os.getcwd())
    
with zipfile.ZipFile(os.path.join(delta_dir, 'word_numtags_map.json.zip'), 'r') as zip_ref:
    zip_ref.extractall(os.getcwd())
    
with zipfile.ZipFile(os.path.join(delta_dir, 'word_cnts.json.zip'), 'r') as zip_ref:
    zip_ref.extractall(os.getcwd())

## Importing pretrained models

In [0]:
model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec.bin', binary=True)

In [0]:
def get_clean_data(path):
    df = pd.read_json(path)
    for c in df.columns:
        df[c] = df[c].apply(lambda x: list(x.values())[0] if type(x) == dict else x)
    return df
    
def get_json(path):
    with open(path, 'rb') as f:
        data = json.load(f)
    return data

def get_pickle(path):
    with open(path, 'rb') as handle:
        pic = pickle.load(handle, encoding='latin1')
    return pic

def check_coverage(all_words, mod):
    unique_words = set(all_words)
    print('Model dictionary size: {}'.format(len(mod.vocab)))
    print('Count of unique listed words: {}'.format(len(unique_words)))
    print('Intersection of the above two: {}'.format(len([w for w in unique_words if w in mod.vocab])))
    print('% of listed words that is in the dictionary: {}'.format(1.0 * len([w for w in unique_words if w in mod.vocab]) / len(unique_words)))
    print('Listings of words (non-unique) that are on our model vocabulary: {}'.format(1.0*len([w for w in all_words if w in mod.vocab])/len(all_words)))
    
def find_words(input_words, mod, n_outputs=1):
    filtered_input = [w for w in input_words if w in mod.vocab]
    if len(filtered_input) > 0:
        return [w[0] for w in mod.most_similar(filtered_input,topn=n_outputs) if w[0].isalpha() and w[0] not in input_words][:n_outputs]
    else:
        return None

def find_rank(lst, dic, weight_dic):
    lsty = [w for w in lst if w in dic]
    lstn = [w for w in lst if w not in dic]
    df = pd.DataFrame([dic[w] for w in lsty], index = lsty, columns = ['cnt'])
    df = pd.concat([df,pd.DataFrame([0]*len(lstn), index = lstn, columns = ['cnt'])])
    # we are sorting all the values at the end, why do we need to rank them? And we are concatenating using axis anyway first
    df['rank'] = df.cnt.rank(ascending=False)
    df['weighted_rank'] = df['rank'] * weight_dic
    return df

def filter_words(input_, lst):
    return [w for w in lst if stemmer.stem(w) not in [stemmer.stem(i) for i in input_]]

def find_filter_rank_words(input_, dic1, dic2, dic3, sorted_dic1, sorted_dic2, sorted_dic3, mod, 
                           weight_dic1=0.5, weight_dic2=0.5, weight_dic3=0.5, n_outputs_ = 5, multiple = 5):
    lst_ = find_words(input_, mod, n_outputs = n_outputs_*multiple)
    if lst_ == None:
        lst_ = sorted_dic1[0:n_outputs_] + sorted_dic2[0:n_outputs_] + sorted_dic3[0:n_outputs_]
        lst_ = [w[0] for w in lst_]
    lst = filter_words(input_, lst_)
    rank_l = find_rank(lst, dic1, weight_dic1)
    rank_s = find_rank(lst, dic2, weight_dic2)
    rank_t = find_rank(lst, dic3, weight_dic3)
    df = pd.concat([rank_l, rank_s, rank_t], axis=1).reset_index()
    df['rank_combine'] = df['weighted_rank'].sum(axis=1)
    #print('rank_l', rank_l, 'rank_s', rank_s, 'rank_t', rank_t, df)

    df['stem'] = df['index'].apply(lambda x: stemmer.stem(x))
    df.drop_duplicates(['stem'],inplace=True)
    #return df.sort_values('rank_combine', ascending=False).iloc[:n_outputs_]['index'].values
    return df.sort_values('rank_combine').iloc[:n_outputs_]['index'].values


def predict_(s, mod):
    try:
        return mod[s]
    except:
        return np.zeros(300)
    
def averaging(arrays):
    arrays_ = pd.Series(arrays).apply(lambda x: np.reshape(x, (-1, 1)))
    return np.mean(np.concatenate(arrays_.values, axis=1), axis=1)

def find_lists(test, list_vecs, listed_words, word_lists, mod, n_outputs=3):
    test_arrays = pd.Series(test).apply(lambda x: predict_(x, mod))
    test_avg = averaging(test_arrays)
    dists = {}
    for wordlist in list_vecs:
        dist = 1 - scipy.spatial.distance.cosine(test_avg, list_vecs[wordlist])
        if dist >= 0 and dist <= 1:
            dists[dist] = wordlist
    ds = sorted(dists.keys(), reverse=True)[:n_outputs]
    lists = [dists[d] for d in ds]
    for bestlist in lists:
        list_name = word_lists[word_lists._id==bestlist].name
        creator = word_lists[word_lists._id==bestlist].createdBy
        listed_words_ = listed_words[listed_words.wordListId==bestlist].lcword.unique()
        print('\n')
        print('List name: {}'.format(list_name.iloc[0]))
        print('Creator: {}'.format(creator.iloc[0]))
        print('Words in list: {}'.format(listed_words_))

In [0]:
def eval_rec_word(rec_word_list, test_word):
    """ assumes recommendation function recommends several words
    see if any of the recommended word (after stemming) matches the test word"""
    stem_test_word = stemmer.stem(test_word)
    stem_rec_word = list(map(lambda w: stemmer.stem(w), rec_word_list))
    
    # find number of words in recommended word matching test word
    cum_precision_rank = 0
    num_rel = 0
    for widx, w in enumerate(stem_rec_word):
        if w == stem_test_word:
            num_rel += 1
        cum_precision_rank += num_rel/(widx + 1)
    
    score = cum_precision_rank/num_rel if num_rel != 0 else 0

    return score

In [0]:
listed_words = get_clean_data('new_valid_list_data/valid_listed_words')
pop_listed = dict(listed_words.lcword.value_counts())
pop_search = get_json('word_cnts.json')
pop_tag = get_json('word_numtags_map.json')

In [0]:
sort_dic1 = sorted(pop_search.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
sort_dic1[0:5]

[('', 95066),
 ('upgrade', 88188),
 ('cat', 81693),
 ('usage', 16205),
 ('recent', 15214)]

In [0]:
sort_dic2 = sorted(pop_listed.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
sort_dic2[0:5]

[('schadenfreude', 559),
 ('ephemeral', 515),
 ('quixotic', 505),
 ('lugubrious', 504),
 ('sanguine', 492)]

In [0]:
sort_dic3 = sorted(pop_tag.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
sort_dic3[0:5]

[('admir', 214),
 ('about', 201),
 ('adobeair', 197),
 ('addon', 195),
 ('nectareousraconteusenectarouscourtesan', 96)]

## EDA: Checking vocubulary coverage

In [0]:
all_words = listed_words.lcword.values
check_coverage(all_words, mod = model)

Model dictionary size: 999994
Count of unique listed words: 484355
Intersection of the above two: 98715
% of listed words that is in the dictionary: 0.20380712493935232
Listings of words (non-unique) that are on our model vocabulary: 0.6240700087610888


In [0]:
del listed_words
del all_words
gc.collect()

7

## Recommending New Words

In [0]:
test = ['delta','analytics','data','analysis','model','analyze','statistics','database']

In [0]:
#find_filter_rank_words(test, pop_listed, pop_search, pop_tag, mod = model, n_outputs_ = 10, multiple = 5)
find_filter_rank_words(test, pop_listed, pop_search, pop_tag, sort_dic1, sort_dic2, sort_dic3, mod = model, n_outputs_ = 10, multiple = 10)

  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




array(['paradigm', 'tool', 'approach', 'stuff', 'process', 'function',
       'team', 'logic', 'research', 'method'], dtype=object)

## Caclulate Accuracy for Recommended Words ##

The weights with the highest scores are (pop_listed, pop_search, pop_tag) = (1, 0, 0)
The weights with the second highest score are (pop_listed, pop_search, pop_tag) = (3, 2, 1)

In [0]:
train_lists = get_json('train_list.json')
test_lists = get_json('test_list.json')

In [0]:
num_splits = 10
batch_size = int(len(train_lists)/num_splits)

In [0]:
start_idx = 0
wd1, wd2, wd3 = (0, 0, 0)
accum_score = 0
for split in range(num_splits):
    list_score = {}
    for idx in range(start_idx, start_idx + batch_size):
        rec_word_list = find_filter_rank_words(train_lists[idx], pop_listed, pop_search, pop_tag, sort_dic1, sort_dic2, sort_dic3, mod = model, 
                                               weight_dic1 = wd1, weight_dic2 = wd2, weight_dic3 = wd3, 
                                               n_outputs_ = 10, multiple = 10)
        list_score[idx] = eval_rec_word(rec_word_list, test_lists[idx])
    avg_score = np.sum(list(list_score.values()))/batch_size
    accum_score += avg_score
    print('Batch average precision score is', avg_score)
    with open(os.path.join(*[delta_dir, 'weights_0_0_0', str(split)+'.json']), 'w') as f:
        j = json.dumps(list_score)
        f.write(j)
    # write result to disk
    start_idx += batch_size

  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.i

Batch average precision score is 0.0034961443206124056


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.i

Batch average precision score is 0.012840538106495552


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort 

Batch average precision score is 0.011542693909715187


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if np.issub

In [0]:
 accum_score = 0
for split in range(num_splits):
    with open(os.path.join(*[delta_dir, 'weights_0_0_0', str(split)+'.json']), 'r') as f:
        j = json.load(f)
        accum_score += np.sum(list(j.values()))/batch_size
print('average score', accum_score/num_splits)

average score 0.009859253630530227


In [0]:
idx = 1642
rec_words = find_filter_rank_words(train_lists[idx], pop_listed, pop_search, pop_tag, sort_dic1, sort_dic2, sort_dic3, mod = model, 
                                               weight_dic1 = 1, weight_dic2 = 0, weight_dic3 = 0, 
                                               n_outputs_ = 10, multiple = 10)
print('training lists', train_lists[idx], 'recommended words', rec_words, 'test words', test_lists[idx])


training lists ['agitated', 'inspired', 'detatched', 'baffled', 'beflustered', 'befuddled'] recommended words ['nonplussed' 'incredulous' 'discombobulated' 'indignant' 'despondent'
 'flabbergasted' 'irate' 'aghast' 'bemused' 'flummoxed'] test words perplexed


  if np.issubdtype(vec.dtype, np.int):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [0]:
eval_rec_word(rec_words, test_lists[idx])

1.0956349206349207

training lists ['agitated', 'inspired', 'detatched', 'baffled', 'beflustered', 'befuddled'] recommended words ['aghast' 'curious' 'anxious' 'perplexed' 'bewildered' 'vexed'
 'distraught' 'confused' 'incredulous' 'nonplussed'] test words perplexed

## Recommending Existing Lists

In [0]:
word_lists = get_clean_data('new_valid_list_data/valid_list_metadata')
listed_words = get_clean_data('new_valid_list_data/valid_listed_words')
list_vecs = get_pickle('listvecs.pickle')

In [0]:
find_lists(test, list_vecs, listed_words, word_lists, model, n_outputs=3)