In [7]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import json
import unicodedata
from nltk.stem.porter import *
stemmer = PorterStemmer()
import os
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gc

In [4]:
from gensim.models.wrappers import FastText

In [6]:
# load and store the word2vec model in binary format for fasting loading in the future
# model = KeyedVectors.load_word2vec_format('../wiki-news-300d-1M.vec')
# model.save_word2vec_format('../wiki-news-300d-1M.vec.bin', binary=True)

In [4]:
model = KeyedVectors.load_word2vec_format('../wiki-news-300d-1M.vec.bin', binary=True)

In [8]:
model.most_similar('schadenfreude', topn=1)

[('Schadenfreude', 0.7849401831626892)]

## Create train and val data ##

In [13]:
# # needed to consider splitting words when there's only one list in word
data_dir = os.getcwd()
valid_listed_word_path = os.path.join(*[data_dir, 'new_valid_list_data', 'valid_listed_words'])
valid_listed_words = []
with open(valid_listed_word_path, 'r', encoding="utf8") as f:
    valid_listed_words = json.load(f)    

In [14]:
all_listed_words = defaultdict(list)
for w in valid_listed_words:
    wordListId = w['wordListId']['$numberLong']
    creatorId = w['creatorId']['$numberLong']
    word = stemmer.stem(w['word'])
    
    # replace all unicode space \xa0 with space
    word = unicodedata.normalize('NFKD', word) 
    if len(re.sub('[0-9]|~|!|@|#|\$|%|\^|&|\*|\(|\)|-|_|\+|=|[|{|]|];|:|\"|\'|,|<|>|\.|\/|\?|\\\\|\|', '', word)) != len(word):
        continue
    all_listed_words[int(wordListId)].append(word)

In [15]:
len(all_listed_words)

30620

In [16]:
# filter out word lists that contain only one word
word_lists = [wl for wordlistId, wl in all_listed_words.items() if len(wl) > 1]
del all_listed_words
gc.collect()

0

In [17]:
len(word_lists)

30541

In [18]:
# write lists to json
with open('word_lists.json', 'w') as f:
    json.dump(word_lists, f)

## Create word count and tag count in word lists ##

In [19]:
word_cnt_map = {}
for wl in word_lists:
    for word in wl:
        word_cnt_map[word] = word_cnt_map.get(stemmer.stem(word), 0) + 1


In [20]:
# inspect top 20 words listed in word lists
sorted_words = sorted(word_cnt_map.items(), key=lambda kv: kv[1], reverse=True)
sorted_words[0:19]

[('schadenfreud', 559),
 ('defenestr', 530),
 ('sanguin', 524),
 ('quixot', 523),
 ('lugubri', 509),
 ('melliflu', 507),
 ('love', 488),
 ('obsequi', 487),
 ('insouci', 479),
 ('loquaci', 476),
 ('lacon', 459),
 ('loveli', 443),
 ('ennui', 440),
 ('ether', 436),
 ('serendip', 433),
 ('caprici', 428),
 ('inchoat', 422),
 ('loving', 409),
 ('serendipiti', 408)]

In [26]:
# read tags
with open('../clean_data/allTaggedItems-06Jan2019.json', 'rb') as f:
    tagitem = json.load(f)
    
word_numtags_map = {}
for i in tagitem:
    # get default word form
    word = stemmer.stem(i['object_id'])
    # replace all unicode space \xa0 with space
    word = unicodedata.normalize('NFKD', word) 
    # remove words that are not composed by alphabets (spaces are ok)
    if len(re.sub('[0-9]|~|!|@|#|\$|%|\^|&|\*|\(|\)|-|_|\+|=|[|{|]|];|:|\"|\'|,|<|>|\.|\/|\?|\\\\|\|', '', word)) != len(word):
        continue
    word_numtags_map[word] = word_numtags_map.get(word, 0) + 1


In [29]:
del tagitem
gc.collect()

49

In [30]:
# inspect top 20 words being tagged
sorted_wordtags = sorted(word_numtags_map.items(), key=lambda kv: kv[1], reverse=True)
sorted_wordtags[0:19]

[('admir', 214),
 ('about', 201),
 ('adobeair', 197),
 ('addon', 195),
 ('nectareousraconteusenectarouscourtesan', 96),
 ('love', 58),
 ('overtag', 53),
 ('red', 49),
 ('divers', 48),
 ('object', 45),
 ('good', 38),
 ('wordnik', 35),
 ('beauti', 34),
 ('fistfuck', 32),
 ('isi', 32),
 ('reflect', 30),
 ('project', 29),
 ('express', 29),
 ('fuck', 28)]

In [31]:
del sorted_words
del sorted_wordtags
gc.collect()

0

## Word2Vec's Limitation: Word2Vec may return the exact same word as the query word ##

In [33]:
# word2vec may return the exact same word
model.most_similar('schadenfreude')

[('Schadenfreude', 0.7849401831626892),
 ('epicaricacy', 0.6519575715065002),
 ('gloating', 0.594894528388977),
 ('smugness', 0.5815554857254028),
 ('glee', 0.5808501243591309),
 ('weltschmerz', 0.5751596093177795),
 ('hate-watching', 0.5631588697433472),
 ('irony', 0.556583821773529),
 ('self-mockery', 0.5543162822723389),
 ('pearl-clutching', 0.5435289144515991)]

In [34]:
model.most_similar('fracked')

[('frack', 0.7199307680130005),
 ('fracks', 0.6991308927536011),
 ('fracking', 0.6949862241744995),
 ('frackers', 0.6732550859451294),
 ('fraccing', 0.659175455570221),
 ('Fracked', 0.6529004573822021),
 ('fracing', 0.6463837027549744),
 ('Fracking', 0.6427013278007507),
 ('shale-gas', 0.6260367631912231),
 ('fraking', 0.6246780157089233)]

## Word recommender that filteres similar words and rank remaining words ##

In [35]:
# Find similar words in a word list
def get_wl_word2vec(model, wl, debug=True):
    """Take average word2vec for each word in word list"""
    wl_w2vec = []
    # for each word get word2vec representation
    
    for w in wl:
        try:
            wvec = model[w]
            wl_w2vec.append(wvec)
        except:
            continue
    
    if len(wl_w2vec) == 0:
        return np.zeros((300))
    else:
        return np.mean(np.asarray(wl_w2vec), axis=0)


def get_sim_words(model, wl, topn, debug=True):
    # retrieve the word2vec representation of the word list
    wl_w2vec = get_wl_word2vec(model, wl, debug=debug)
    
    # return top n similar words
    return model.similar_by_vector(wl_w2vec, topn = topn)

In [36]:
# rank all the return words
def rank_words(stemmer, wl, word_numtags_map, word_cnt_map, word_cnt_weight=0.8, word_tag_weight=0.2, debug=True):
    assert(len(wl) > 0)
    def score_word(w):
        stem_w = stemmer.stem(w)
        word_cnt = word_cnt_map.get(stem_w, 0)
        word_numtag = word_numtags_map.get(stem_w, 0)
        score = word_cnt * word_cnt_weight + word_numtag * word_tag_weight
        return score
        
    wl_scores = list(map(lambda w: score_word(w), wl))
    maxidx = np.argmax(wl_scores)
    if debug:
        print(wl_scores, maxidx)
    return wl[np.argmax(wl_scores)]

In [37]:
def filter_unique(stemmer, wl, sim_wl):
    """ Remove similar words that share the same words in word list"""
    stem_wl = set(map(lambda w: stemmer.stem(w), wl))
    return list(filter(lambda w: stemmer.stem(w) not in stem_wl, sim_wl))

In [38]:
def recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, topn=10, word_cnt_weight=0.8, word_tag_weight=0.2, debug=True):    
    """Recommand a word to add based on word list"""
    assert len(wl) > 0, "Length of word list needs to be greater than 1"
    
    # find similar words to word list
    sim_wl = get_sim_words(model, wl, topn=10, debug=debug)
    np_sim_wl = np.array(sim_wl)[:,0]
    if debug:
        print('similar word list', np_sim_wl)

    # filter duplicating words
    filtered_wl = filter_unique(stemmer, wl, np_sim_wl)
    if debug:
        print('Filtered word list', filtered_wl)

    # rank words
    if len(filtered_wl) == 0:
        top_w = rank_words(stemmer, wl, word_numtags_map, word_cnt_map, word_cnt_weight=0, word_tag_weight=1, debug=debug)
    else:
        top_w = rank_words(stemmer, filtered_wl, word_numtags_map, word_cnt_map, word_cnt_weight=0, word_tag_weight=1, debug=debug)
    return top_w

In [39]:
wl = ['esper', 'espers', 'B.A.B.E.L.', 'Hyōbu' ,'magic-user', 'kekkai',
 'shapechanging', 'magic-using', 'paopei', 'meta-human']
#wl = ['schadenfreude', 'ephemeral']
recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, topn=10, word_cnt_weight=0.8, word_tag_weight=0.2)

similar word list ['esper' 'shapechanging' 'espers' 'magic-using' 'kekkai' 'paopei' 'Hyōbu'
 'magic-user' 'B.A.B.E.L.' 'Tokine']
Filtered word list ['Tokine']
[0] 0


'Tokine'

In [40]:
pred_words = list(map(lambda wl: recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, debug=False), word_lists))

KeyboardInterrupt: 

In [50]:
def eval_pred(model, word_cnt_map, word_numtag_map, pred_w, true_w):
    """
    Performs 3 metrics evaluations
    1. cosine similarity between predicted word versus true word. range -1~1
    2. word count difference percentage. 0~1
    3. tag count difference percentage. 0~1
    """
    try:
        pred_w2vec = model[pred_w]
    except:
        pred_w2vec = np.zeros((300))
    try:
        true_w2vec = model[true_w]
    except:
        true_w2vec = np.zeros((300))
    if not np.any(pred_w2vec) or not np.any(true_w2vec):
        cos_sim = 0
    else:
        cos_sim = np.dot(pred_w2vec, true_w2vec)/(np.linalg.norm(pred_w2vec) * np.linalg.norm(true_w2vec))
    
    try:
        true_w_wcnt = word_cnt_map[true_w]
        pred_w_wcnt = word_cnt_map.get(pred_w, 0)
        print('true_w_wcnt', true_w_wcnt, 'pred_w_wcnt', pred_w_wcnt)
        cnt_sim = abs(true_w_wcnt - pred_w_wcnt)/true_w_wcnt
    except:
        cnt_sim = 0
    
    try:
        true_w_tagcnt = word_cnt_map[true_w]
        pred_w_tagcnt = word_cnt_map.get(pred_w, 0)
        print('true_w_tagcnt', true_w_tagcnt, 'pred_w_tagcnt', pred_w_tagcnt)
        tagcnt_sim = abs(true_w_tagcnt - pred_w_tagcnt)/true_w_tagcnt
    except:
        tagcnt_sim = 0

    print('Cosine similarity', cos_sim, 'cnt_sim', cnt_sim, 'tagcnt_sim', tagcnt_sim)

In [51]:
true_w = word_lists[100][-1]
wl_len = len(word_lists[100])
wl = word_lists[100][0:wl_len-1]
print('Word List:', wl)
print('Missing word', true_w)
pred_w = recommend_words(model, stemmer, wl, word_numtags_map, word_cnt_map, topn=10, word_cnt_weight=0.8, word_tag_weight=0.2)
print('predicted word', pred_w)
# evaluate prediction
eval_pred(model, word_cnt_map, word_numtags_map, pred_w, true_w)

Word List: ['juggernaut', 'jute', 'bungalow', 'pajama', 'loot', 'pundit', 'sherbet', 'avatar', 'guru']
Missing word cushi
similar word list ['pundit' 'guru' 'bungalow' 'pajama' 'outfit' 'outfits' 'juggernaut'
 'avatar' 'superstar' 'gurus']
Filtered word list ['outfit', 'outfits', 'superstar']
[0, 0, 2] 2
predicted word superstar
true_w_wcnt 10 pred_w_wcnt 9
true_w_tagcnt 10 pred_w_tagcnt 9
Cosine similarity 0 cnt_sim 0.1 tagcnt_sim 0.1
