In [1]:
import gc
import json
import os
import re
import unicodedata
from collections import defaultdict

import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn.metrics.pairwise import cosine_similarity

stemmer = PorterStemmer()

In [None]:
from gensim.models.wrappers import FastText

In [None]:
# load and store the word2vec model in binary format for faster loading in the future
# model = KeyedVectors.load_word2vec_format('../wiki-news-300d-1M.vec')
# model.save_word2vec_format('../wiki-news-300d-1M.vec.bin', binary=True)

In [4]:
# Load the word vectors from the binary word2vec file (the commented-out cell
# above shows how that .bin file was produced from the text .vec file).
model = KeyedVectors.load_word2vec_format('../wiki-news-300d-1M.vec.bin', binary=True)

In [8]:
# Look up the single nearest neighbour of 'schadenfreude' in the loaded vectors.
model.most_similar('schadenfreude', topn=1)

[('Schadenfreude', 0.7849401831626892)]

## Create train and val data ##

In [2]:
# NOTE (original author): needed to consider splitting words when there's only one list in word

# Root folder of the Wordnik export. The default is the original author's local
# path; set the WORDNIK_DATA_DIR environment variable to use a different copy
# without editing the notebook.
data_dir = os.environ.get('WORDNIK_DATA_DIR', 'H:\\Users\\noyana\\Documents\\Projects\\wordnik')
valid_listed_word_path = os.path.join(data_dir, 'new_valid_list_data', 'new_valid_list_data', 'valid_listed_words')

# The file is parsed as JSON; the cell that consumes it reads the 'word' and
# 'wordListId' fields of every record.
with open(valid_listed_word_path, 'r', encoding="utf8") as f:
    valid_listed_words = json.load(f)

In [4]:
# Any digit or punctuation mark in this class disqualifies a word; letters and
# whitespace are allowed. Written as one character class because the previous
# alternation contained `[|{|]|];`, which the regex engine reads as the class
# "| or {" followed by the literal "];" -- so '[', ']', '}' and ';' slipped through.
NON_ALPHA_RE = re.compile(r'''[0-9~!@#$%^&*()\-_+=\[\]{};:"',<>./?\\|]''')

# wordListId (int) -> words on that list, in input order
all_listed_words = defaultdict(list)
for w in valid_listed_words:
    # ids are nested under a '$numberLong' key and converted with int() below
    wordListId = w['wordListId']['$numberLong']
    #word = stemmer.stem(w['word'])

    # NFKD compatibility normalisation, e.g. the no-break space \xa0 becomes a
    # plain space.
    # NOTE(review): NFKD also decomposes accented letters into base letter +
    # combining mark; confirm that this matches the embedding vocabulary.
    word = unicodedata.normalize('NFKD', w['word'])
    if NON_ALPHA_RE.search(word):
        continue
    all_listed_words[int(wordListId)].append(word)

In [5]:
# Number of distinct word-list ids that received at least one accepted word.
len(all_listed_words)

30620

In [6]:
# keep only the lists that hold at least two words
word_lists = [words for words in all_listed_words.values() if len(words) > 1]

In [7]:
# Peek at the first retained word list.
word_lists[0]

['phatic',
 'macerate',
 'amanuenses',
 'theophagy',
 'seraglio',
 'geophagy',
 'metaphone',
 'anastrophe',
 'neologism',
 'tetragrammaton',
 'bête noire',
 'ablutophobia',
 'picayune',
 'colophon',
 'huzzah',
 'embiggen',
 'steganography',
 'breezer',
 'consigliere',
 'polari',
 'mook',
 'synechdoche',
 'shrubbery',
 'interrobang',
 'nychthemeron',
 'lagniappe',
 'piss and vinegar',
 'poetaster',
 'shoegazer',
 'errata',
 'bollocks',
 'bookmarklet',
 'titification',
 'psychopomp',
 'gloaming',
 'dirigible',
 'twee',
 'epeolatry',
 'strappleberry',
 'hemidemisemiquaver',
 'prepicenter',
 'faineant',
 'misandrist',
 'festivus',
 'cephalophore',
 'noosphere',
 'taw',
 'fulking',
 'ghoti',
 'bracket',
 'smurph',
 'refenestration',
 'westing',
 'provocateur',
 'pâté',
 'pâte',
 'pate',
 'john',
 'clown',
 'poutine',
 'hierophant',
 'bivy',
 'nonce',
 'natches',
 'bummalo',
 'entomologist',
 'etymologist',
 'headword',
 'rfe',
 'mitzvah',
 'anhedonia',
 'piker',
 'surcease',
 'hedcut',


In [8]:
# The raw records and the id -> words mapping are no longer needed once
# word_lists exists; drop both names and ask the collector to run.
del all_listed_words, valid_listed_words
gc.collect()

0

## split into train and test ##

In [9]:
# Toy lists kept for quick manual experiments; the split below does not use them.
tmp_list = [
    ['michael', 'romeo', 'juliet', 'flowers', 'poet', 'shakespear'],
    ['goodday', 'french', 'spanish', 'table', 'golden'],
    ['harvard', 'machine', 'learning']
]
train_list = []
test_list = []

for wl in word_lists:
    # lists with fewer than 4 words are left out of the split entirely
    if len(wl) < 4:
        continue
    # Hold out the last word as the target and train on the rest. Slicing
    # (rather than wl.pop()) leaves word_lists untouched, so re-running this
    # cell produces the same split instead of holding out yet another word.
    test_list.append(wl[-1])
    train_list.append(wl[:-1])

print(len(test_list))
print(len(train_list))


28208
28208


In [10]:
# Persist the split so it can be reused without recomputing it.
with open('train_list.json', 'w') as f:
    json.dump(train_list, f)
with open('test_list.json', 'w') as f:
    json.dump(test_list, f)

# word_lists is deliberately NOT deleted here: the word-count, batch-prediction
# and single-example evaluation cells further down still read it, so removing
# it makes a fresh Restart & Run All fail with NameError.
gc.collect()

64

In [None]:
# write lists to json
# with open('word_lists.json', 'w') as f:
#     json.dump(word_lists, f)

## Create evaluation metric ##

In [20]:
def eval_rec_word(rec_word_list, test_word):
    """Score a set of recommendations against the held-out word.

    Every recommended word and the test word are reduced to their stem with the
    notebook-level ``stemmer``; the score is the fraction of recommendations
    whose stem equals the test word's stem.

    Parameters
    ----------
    rec_word_list : iterable of str
        Words produced by the recommender.
    test_word : str
        The held-out word the recommender should have found.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when there are no recommendations (previously this
        case raised ZeroDivisionError).
    """
    stem_rec_word = [stemmer.stem(w) for w in rec_word_list]
    if not stem_rec_word:
        # nothing was recommended, so nothing can match
        return 0.0

    stem_test_word = stemmer.stem(test_word)
    n_matches = sum(1 for stem in stem_rec_word if stem == stem_test_word)
    return n_matches / len(stem_rec_word)

# Quick check: of the three recommendations only 'played' shares a stem with
# 'playing', so the expected score is 1/3.
eval_rec_word(['studded', 'played', 'numbers'], 'playing')

play
['stud', 'play', 'number']
['play'] 0.3333333333333333


## Create word count and tag count in word lists ##

In [None]:
# stem -> number of times any surface form with that stem occurs across all
# word lists. The count is both read and written under the stem; previously it
# was read under the stem but stored under the raw word, so the totals were
# incoherent and the stem-keyed lookups in rank_words mostly missed.
word_cnt_map = {}
for wl in word_lists:
    for word in wl:
        stem = stemmer.stem(word)
        word_cnt_map[stem] = word_cnt_map.get(stem, 0) + 1


In [None]:
# inspect top 20 words listed in word lists
sorted_words = sorted(word_cnt_map.items(), key=lambda kv: kv[1], reverse=True)
# [:20] yields 20 entries; the earlier [0:19] stopped one short of the stated 20
sorted_words[:20]

In [None]:
# Any digit or punctuation mark in this class disqualifies a word; letters and
# whitespace are allowed. Written as one character class because the previous
# alternation contained `[|{|]|];`, which the regex engine reads as the class
# "| or {" followed by the literal "];" -- so '[', ']', '}' and ';' slipped through.
NON_ALPHA_RE = re.compile(r'''[0-9~!@#$%^&*()\-_+=\[\]{};:"',<>./?\\|]''')

# read tags
with open('../clean_data/allTaggedItems-06Jan2019.json', 'rb') as f:
    tagitem = json.load(f)

# stemmed word -> number of tag records whose 'object_id' reduces to it
word_numtags_map = {}
for i in tagitem:
    # get default word form
    word = stemmer.stem(i['object_id'])
    # NFKD compatibility normalisation, e.g. the no-break space \xa0 becomes a plain space
    word = unicodedata.normalize('NFKD', word)
    # remove words that are not composed by alphabets (spaces are ok)
    if NON_ALPHA_RE.search(word):
        continue
    word_numtags_map[word] = word_numtags_map.get(word, 0) + 1

# cache the counts on disk, then release the raw tag records
with open('../clean_data/word_numtags_map.json', 'w') as f:
    json.dump(word_numtags_map, f)
del tagitem

In [None]:
# inspect top 20 words being tagged
sorted_wordtags = sorted(word_numtags_map.items(), key=lambda kv: kv[1], reverse=True)
# [:20] yields 20 entries; the earlier [0:19] stopped one short of the stated 20
sorted_wordtags[:20]

In [None]:
# the sorted views were only needed for inspection
del sorted_words, sorted_wordtags

## Word2Vec's Limitation: Word2Vec may return the exact same word as the query word ##

In [None]:
# Loads the same binary vector file as the load cell near the top of the notebook.
# NOTE(review): redundant if `model` is still in memory from that cell -- confirm before re-running.
model = KeyedVectors.load_word2vec_format('../wiki-news-300d-1M.vec.bin', binary=True)

In [None]:
# Nearest neighbour of the correctly spelled query word.
model.most_similar('schadenfreude', topn=1)

In [None]:
# word2vec may return the exact same word
# (query is 'schadenfreud', without the final 'e'; topn is left at the library default)
model.most_similar('schadenfreud')

In [None]:
# Second example query for the limitation named in the section heading.
model.most_similar('fracked')

## Word recommender that filters similar words and ranks the remaining words ##

In [None]:
# Find similar words in a word list
def get_wl_word2vec(model, wl, debug=True):
    """Return the mean embedding of the words in ``wl`` that ``model`` knows.

    Parameters
    ----------
    model : mapping-like
        Object supporting ``model[word]`` and raising ``KeyError`` for words it
        does not contain.
    wl : iterable of str
        The word list to summarise.
    debug : bool
        Accepted so the signature lines up with ``get_sim_words``; not used here.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. When none of the
        words is known, a zero vector whose length is ``model.vector_size``
        (falling back to 300 if the model has no such attribute).
    """
    vectors = []
    for w in wl:
        try:
            vectors.append(model[w])
        except KeyError:
            # out-of-vocabulary word: leave it out of the average
            continue

    if not vectors:
        return np.zeros(getattr(model, 'vector_size', 300))
    return np.mean(np.asarray(vectors), axis=0)


def get_sim_words(model, wl, topn, debug=True):
    """Return the model's ``topn`` nearest neighbours of the word list.

    The list is first collapsed to a single vector with ``get_wl_word2vec``;
    whatever ``model.similar_by_vector`` returns for that vector is passed
    back unchanged.
    """
    return model.similar_by_vector(
        get_wl_word2vec(model, wl, debug=debug),
        topn=topn,
    )

In [None]:
# rank all the return words
def rank_words(stemmer, wl, word_numtags_map, word_cnt_map, word_cnt_weight=0.8, word_tag_weight=0.2, debug=True):
    """Pick the highest-scoring word from ``wl``.

    Each word is stemmed, and its score is
    ``word_cnt_map[stem] * word_cnt_weight + word_numtags_map[stem] * word_tag_weight``
    with a missing stem counting as 0 in either map. The first word with the
    maximal score is returned. ``wl`` must be non-empty. With ``debug`` on, the
    score list and the winning index are printed.
    """
    assert len(wl) > 0

    wl_scores = []
    for candidate in wl:
        stem = stemmer.stem(candidate)
        list_count = word_cnt_map.get(stem, 0)
        tag_count = word_numtags_map.get(stem, 0)
        wl_scores.append(list_count * word_cnt_weight + tag_count * word_tag_weight)

    maxidx = np.argmax(wl_scores)
    if debug:
        print(wl_scores, maxidx)
    return wl[maxidx]

In [None]:
def filter_unique(stemmer, wl, sim_wl):
    """Return, as a list, the entries of ``sim_wl`` whose stem does not equal
    the stem of any word in ``wl``; the order of ``sim_wl`` is preserved."""
    known_stems = {stemmer.stem(word) for word in wl}
    return [candidate for candidate in sim_wl if stemmer.stem(candidate) not in known_stems]

In [None]:
def recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, topn=10, word_cnt_weight=0.8, word_tag_weight=0.2, debug=True):
    """Recommend one word to add to the word list ``wl``.

    Steps:
      1. fetch the ``topn`` nearest neighbours of the list with ``get_sim_words``;
      2. drop neighbours that share a stem with a word already in ``wl``
         (``filter_unique``);
      3. return the best remaining candidate according to ``rank_words``.

    Parameters
    ----------
    model, stemmer
        Passed through to the helper functions.
    wl : sequence of str
        Non-empty word list to extend.
    word_numtags_map, word_cnt_map : dict
        Count tables consulted by ``rank_words``.
    topn : int
        Number of neighbours to request. Previously ignored (10 was hard-coded).
    word_cnt_weight, word_tag_weight : float
        Ranking weights. Previously ignored (0 and 1 were hard-coded); pass
        ``word_cnt_weight=0, word_tag_weight=1`` to reproduce the old ranking.
    debug : bool
        Print the intermediate candidate lists.

    Returns
    -------
    str
        The recommended word. If every neighbour was filtered out, the ranking
        falls back to ``wl`` itself, so the result is then a word already in
        the list.
    """
    assert len(wl) > 0, "Word list must contain at least one word"

    # find similar words to word list; each entry is a (word, similarity) pair
    sim_wl = get_sim_words(model, wl, topn=topn, debug=debug)
    sim_words = [word for word, _similarity in sim_wl]
    if debug:
        print('similar word list', sim_words)

    # filter duplicating words
    filtered_wl = filter_unique(stemmer, wl, sim_words)
    if debug:
        print('Filtered word list', filtered_wl)

    # rank words; an empty candidate set falls back to the input list
    candidates = filtered_wl if len(filtered_wl) > 0 else wl
    return rank_words(
        stemmer,
        candidates,
        word_numtags_map,
        word_cnt_map,
        word_cnt_weight=word_cnt_weight,
        word_tag_weight=word_tag_weight,
        debug=debug,
    )

In [None]:
# Example input: a list that mixes plain words with punctuated, hyphenated and
# non-ASCII entries.
wl = ['esper', 'espers', 'B.A.B.E.L.', 'Hyōbu' ,'magic-user', 'kekkai',
 'shapechanging', 'magic-using', 'paopei', 'meta-human']
# Alternative, smaller input:
#wl = ['schadenfreude', 'ephemeral']
# debug is left at its default (True), so the intermediate candidate lists are printed.
recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, topn=10, word_cnt_weight=0.8, word_tag_weight=0.2)

In [None]:
# one recommendation per word list, with debug printing switched off
pred_words = [
    recommend_words(model, stemmer, words, word_numtags_map, word_cnt_map, debug=False)
    for words in word_lists
]

In [None]:
def eval_pred(model, word_cnt_map, word_numtag_map, pred_w, true_w):
    """Compare a predicted word with the true held-out word on three metrics.

    1. Cosine similarity between the two word vectors (range -1..1). It is 0
       when either word is missing from ``model`` or has an all-zero vector.
    2. Relative word-count difference ``|true - pred| / true`` using
       ``word_cnt_map``.
    3. Relative tag-count difference ``|true - pred| / true`` using
       ``word_numtag_map`` (previously this metric read ``word_cnt_map`` by
       mistake and therefore duplicated metric 2).

    For metrics 2 and 3 the result is 0 when the true word is absent from the
    map or its count is 0; a missing predicted word counts as 0.

    NOTE(review): both maps are looked up with the words exactly as passed in.
    Elsewhere in this notebook the maps appear to be keyed by stemmed words, so
    callers may need to pass stems -- confirm.

    Parameters
    ----------
    model : mapping-like
        Supports ``model[word]`` and raises ``KeyError`` for unknown words.
    word_cnt_map, word_numtag_map : dict
        Count tables.
    pred_w, true_w : str
        Predicted and true word.

    Returns
    -------
    tuple
        ``(cos_sim, cnt_sim, tagcnt_sim)``. The same values are also printed,
        as before; earlier versions returned nothing.
    """
    def _lookup(word):
        # an unknown word simply has no vector
        try:
            return model[word]
        except KeyError:
            return None

    def _relative_diff(count_map, true_label, pred_label):
        if true_w not in count_map:
            return 0
        true_cnt = count_map[true_w]
        pred_cnt = count_map.get(pred_w, 0)
        print(true_label, true_cnt, pred_label, pred_cnt)
        if true_cnt == 0:
            # avoid dividing by zero; treated like a missing true word
            return 0
        return abs(true_cnt - pred_cnt) / true_cnt

    pred_w2vec = _lookup(pred_w)
    true_w2vec = _lookup(true_w)
    if (pred_w2vec is None or true_w2vec is None
            or not np.any(pred_w2vec) or not np.any(true_w2vec)):
        cos_sim = 0
    else:
        cos_sim = np.dot(pred_w2vec, true_w2vec) / (np.linalg.norm(pred_w2vec) * np.linalg.norm(true_w2vec))

    cnt_sim = _relative_diff(word_cnt_map, 'true_w_wcnt', 'pred_w_wcnt')
    tagcnt_sim = _relative_diff(word_numtag_map, 'true_w_tagcnt', 'pred_w_tagcnt')

    print('Cosine similarity', cos_sim, 'cnt_sim', cnt_sim, 'tagcnt_sim', tagcnt_sim)
    return cos_sim, cnt_sim, tagcnt_sim

In [None]:
# Hold out the last word of list #100 and try to recover it from the rest.
held_out_list = word_lists[100]
wl_len = len(held_out_list)
true_w = held_out_list[-1]
wl = held_out_list[:wl_len - 1]
print('Word List:', wl)
print('Missing word', true_w)
pred_w = recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, topn=10, word_cnt_weight=0.8, word_tag_weight=0.2)
print('predicted word', pred_w)
# evaluate prediction
eval_pred(model, word_cnt_map, word_numtags_map, pred_w, true_w)