# Importing Python libraries and project datasets

In [199]:
import pandas as pd
import numpy as np
import scipy
import gensim
import pickle

In [209]:
def _unwrap_single_value(cell):
    """Return the first value of a non-empty dict cell; pass anything else through.

    ``next(iter(...))`` is used instead of ``cell.values()[0]`` because dict
    views are not subscriptable on Python 3. An empty dict is returned
    unchanged instead of raising.
    """
    if isinstance(cell, dict) and cell:
        return next(iter(cell.values()))
    return cell


# Listed words: one row per word.
# NOTE(review): dict-valued cells are presumably single-key wrappers produced
# by the JSON export - confirm against the raw files.
path = 'new_valid_list_data/valid_listed_words'
df = pd.read_json(path)
for c in df.columns:
    df[c] = df[c].apply(_unwrap_single_value)

# Word-list metadata, flattened the same way.
path1 = 'new_valid_list_data/valid_list_metadata'
df1 = pd.read_json(path1)
for c in df1.columns:
    df1[c] = df1[c].apply(_unwrap_single_value)

# Importing pretrained models

`model`: FastText embeddings trained on English Wikipedia documents

`model2`: Word2Vec embeddings trained on Google News documents

In [12]:
# FastText vectors, stored in the text (non-binary) word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    './model/wiki.en/wiki.en.vec',
    binary=False,
)

In [154]:
# Google News Word2Vec vectors, stored in the binary word2vec format.
model2 = gensim.models.KeyedVectors.load_word2vec_format(
    './model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

# EDA: Checking vocabulary coverage

In [None]:
# Distinct values of the `lcword` column across all listed words.
all_words = df['lcword'].unique()

In [239]:
def check_coverage(all_words=all_words, mod = model):
    """Print how many of ``all_words`` appear in the vocabulary of ``mod``.

    Parameters
    ----------
    all_words : sequence
        Words to look up. Defaults to the notebook-level ``all_words`` as it
        was bound when this cell was executed.
    mod : gensim KeyedVectors-like
        Model exposing its vocabulary as ``key_to_index`` (gensim >= 4) or
        ``vocab`` (older gensim).

    Returns
    -------
    None
        Four lines are printed: vocabulary size, number of words checked,
        number found, and the found/checked ratio. The last line is labelled
        "%" but is a fraction between 0 and 1.
    """
    # gensim 4 replaced ``vocab`` with ``key_to_index``; accept either.
    vocab = getattr(mod, 'key_to_index', None)
    if vocab is None:
        vocab = mod.vocab

    n_words = len(all_words)
    # Count the overlap once instead of rebuilding the list for every print.
    n_covered = sum(1 for w in all_words if w in vocab)
    # An empty word list reports 0.0 rather than raising ZeroDivisionError.
    ratio = 1.0 * n_covered / n_words if n_words else 0.0

    print('Model dictionary size: {}'.format(len(vocab)))
    print('Count of unique listed words: {}'.format(n_words))
    print('Intersection of the above two: {}'.format(n_covered))
    print('% of listed words that is in the dictionary: {}'.format(ratio))

### Coverage of FastText pretrained model

In [240]:
# Coverage of the listed words by the FastText vocabulary.
check_coverage(mod=model)

Model dictionary size: 2519370
Count of unique listed words: 484355
Intersection of the above two: 151439
% of listed words that is in the dictionary: 0.312661167945


### Coverage of Word2Vec pretrained model

In [241]:
# Coverage of the listed words by the Word2Vec vocabulary.
check_coverage(mod=model2)

Model dictionary size: 3000000
Count of unique listed words: 484355
Intersection of the above two: 76517
% of listed words that is in the dictionary: 0.157977103571


# Recommendation 1

Input: a list of one or more words.

Output: a list of `n_outputs` recommended words.

Recommended words are selected from the whole pretrained vocabulary, ranked by their similarity to the input words.


In [243]:
def find_words(input_words = ['testing'], n_outputs = 1, mod = model):
    """Recommend up to ``n_outputs`` words similar to ``input_words``.

    Parameters
    ----------
    input_words : list of str
        Query words; those missing from the model vocabulary are ignored.
        The default list is never mutated.
    n_outputs : int
        Maximum number of recommendations to return.
    mod : gensim KeyedVectors-like
        Model providing ``most_similar`` and a vocabulary (``key_to_index``
        on gensim >= 4, ``vocab`` on older gensim).

    Returns
    -------
    list of str
        Purely alphabetic neighbours that are not query words, best first.
        May be shorter than ``n_outputs``; empty when no query word is in the
        vocabulary.
    """
    # gensim 4 replaced ``vocab`` with ``key_to_index``; accept either.
    vocab = getattr(mod, 'key_to_index', None)
    if vocab is None:
        vocab = mod.vocab

    known = [w for w in input_words if w in vocab]
    if not known:
        # most_similar cannot rank against an empty query, so return nothing.
        return []

    excluded = set(input_words)
    # Fetch 10 extra candidates so that filtering out non-alphabetic tokens
    # and the query words usually still leaves n_outputs results.
    candidates = mod.most_similar(known, topn=n_outputs + 10)
    picked = [word for word, _score in candidates
              if word.isalpha() and word not in excluded]
    return picked[:n_outputs]


In [250]:
# Sample query words used to exercise the recommenders.
test = [
    'data',
    'computer',
    'engineering',
    'software',
    'machine',
    'computing',
]

### Testing of Recommendation 1 

In [251]:
# Single best FastText recommendation for the sample query.
find_words(input_words=test, n_outputs=1, mod=model)

[u'computery']

# Recommendation 2

Input: a list of one or more words.

Output: the `n_outputs` recommended word lists.

In [178]:
# Precomputed vectors keyed by word-list id (this is how find_lists reads it).
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
with open('listvecs.pickle', 'rb') as pickle_file:
    dic = pickle.load(pickle_file)

In [253]:
def predict_(s, mod=model):
    """Return the embedding of ``s`` from ``mod``, or zeros if it is unknown.

    Parameters
    ----------
    s : str
        Word to look up.
    mod : gensim KeyedVectors-like
        Model indexed by word; defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        ``mod[s]`` when the word is known. Otherwise a zero vector whose
        length is ``mod.vector_size`` (300 if the model does not expose it).
    """
    try:
        return mod[s]
    except KeyError:
        # Only the out-of-vocabulary lookup is tolerated; other errors
        # (and KeyboardInterrupt) now propagate instead of being swallowed.
        return np.zeros(getattr(mod, 'vector_size', 300))
def averaging(arrays):
    """Return the element-wise mean of a collection of equally sized vectors.

    Parameters
    ----------
    arrays : iterable of array-like
        Vectors to average (a list, a pandas Series of arrays, or a 2-D
        array whose rows are the vectors). Each is flattened to a column.

    Returns
    -------
    numpy.ndarray
        1-D array holding the mean across the input vectors.

    Raises
    ------
    ValueError
        If ``arrays`` is empty, or the vectors differ in size.
    """
    # Plain iteration replaces the pandas Series round-trip; the numpy
    # operations are unchanged so results stay numerically identical.
    columns = [np.reshape(vec, (-1, 1)) for vec in arrays]
    if not columns:
        raise ValueError('averaging() needs at least one vector')
    return np.mean(np.concatenate(columns, axis=1), axis=1)
def find_lists(test, n_outputs=3, dic=dic):
    """Print the ``n_outputs`` word lists most similar to the query words.

    The query vector is the mean of the embeddings of ``test`` (as returned
    by ``predict_`` with its default model). Each entry of ``dic`` is scored
    by cosine similarity against it, and the best-scoring lists are printed
    with their name, creator and words.

    Parameters
    ----------
    test : list of str
        Query words.
    n_outputs : int
        Maximum number of word lists to print.
    dic : dict
        Maps a word-list id to that list's vector. Defaults to the
        notebook-level ``dic`` as bound when this cell was executed.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    test_arrays = pd.Series(test).apply(predict_)
    test_avg = averaging(test_arrays)

    # Collect (similarity, id) pairs instead of keying a dict by similarity,
    # so lists with identical scores no longer overwrite each other.
    scored = []
    for list_id, list_vec in dic.items():
        sim = 1 - scipy.spatial.distance.cosine(test_avg, list_vec)
        # Drop undefined scores (e.g. an all-zero vector) and negative ones.
        # Values marginally above 1 from rounding are kept, so a perfect
        # match is not discarded.
        if np.isnan(sim) or sim < 0:
            continue
        scored.append((sim, list_id))
    # Stable sort: equal scores keep the iteration order of ``dic``.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    for _sim, bestlist in scored[:n_outputs]:
        # NOTE(review): the original read from ``df1_``, which no cell in this
        # notebook defines. ``df1`` (the metadata frame loaded earlier) is
        # assumed to have the same ``_id``/``name``/``createdBy`` columns -
        # confirm.
        meta = df1[df1._id == bestlist]
        listed_words = df[df.wordListId == bestlist].lcword.unique()
        print('\n')
        print('List name: {}'.format(meta['name'].iloc[0]))
        print('Creator: {}'.format(meta['createdBy'].iloc[0]))
        print('Words in list: {}'.format(listed_words))

### Testing of Recommendation 2

In [254]:
# Five closest word lists for the sample query.
find_lists(test=test, n_outputs=5)



List name: Gleichen
Creator: heidikraut
Words in list: [u'information' u'computer' u'manipulation' u'hand']


List name: computer words
Creator: edwardvielmetti
Words in list: [u'computer' u'floppy disk' u'keyboard' u'feed' u'memex' u'recursive'
 u'kernel' u'debug' u'dogfooding' u'bootstrapping' u'ansible']


List name: Technofancy
Creator: pliu32
Words in list: [u'chipset' u'kernel' u'motherboard' u'network' u'bios' u'microprocessor'
 u'ram' u'rom' u'register' u'pipelining' u'smartphone' u'programmable'
 u'laptop' u'wireless' u'anti-aliasing' u'anisotropic' u'latency' u'lag'
 u'ping' u'deinterlace' u'spatiotemporal' u'biometric' u'encryption'
 u'authentication' u'cipher' u'hypervisor' u'stack' u'heap' u'debug'
 u'pointer' u'dereference' u'operand' u'packet' u'firewall'
 u'multiplexing' u'symlink']


List name: tech words
Creator: paulosuzart
Words in list: [u'soa' u'environment' u'production' u'architecture' u'architect'
 u'language' u'java' u'application' u'integration' u'deploy' u