In [38]:
import numpy as np
import tensorflow as tf
from collections import deque
import functools
from time import gmtime, strftime
import seaborn as sns
from os import listdir
from os.path import isfile, join
from scipy.stats.stats import pearsonr
import random as rand
import pyproj
import functools
import pickle 
import matplotlib.pyplot as plt
import pandas
import scipy
import os
import gc
from collections import defaultdict
import random
%matplotlib inline

# Seed every random number generator this notebook draws from, so a fresh
# "Restart & Run All" reproduces the same shuffles and batches.
np.random.seed(0)
tf.set_random_seed(0)
# The stdlib generator (not numpy's) performs the random.shuffle of the
# vocabulary and the random.randint draws in get_batch, so it needs its own seed.
random.seed(0)


    
def memoize(obj, maxlen = 2000):
    """Wrap ``obj`` in a bounded cache keyed on its first positional argument.

    The cache dict and the insertion-order deque are attached to ``obj`` as
    ``obj.cache`` and ``obj.deck``.  Only ``args[0]`` is used as the key; any
    further positional or keyword arguments are forwarded to ``obj`` but do
    not distinguish cache entries.  Once ``maxlen`` results are stored, the
    oldest inserted entry is evicted before a new one is computed.
    """
    obj.cache = {}
    obj.deck = deque([], maxlen = maxlen)
    cache = obj.cache
    deck = obj.deck

    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        key = args[0]
        if key in cache:
            return cache[key]

        # Cache miss: make room first (first-in, first-out), then compute.
        if len(deck) == deck.maxlen:
            oldest_key, _ = deck.popleft()
            del cache[oldest_key]

        result = obj(*args, **kwargs)
        cache[key] = result
        deck.append((key, result))
        return result

    return memoizer


# Sample-count constant; nothing in the visible cells reads it.
n_samples = 800000
def xavier_init(fan_in, fan_out, constant=1):
    """Build a (fan_in, fan_out) float32 tensor drawn uniformly from a Xavier range.

    The range is symmetric: +/- constant * sqrt(6 / (fan_in + fan_out)).
    """
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    bound = constant*np.sqrt(6.0/(fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval=-bound,
                             maxval=bound,
                             dtype=tf.float32)


# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point these paths at pickles you produced or otherwise trust.
with open("../training_data/D_cbow_pdw_8B.pkl", 'rb') as f:
    model = pickle.load(f)

with open("../training_data/training_data.pkl", 'rb') as f:
    dictionary = pickle.load(f)

# Widen every stored embedding by four trailing zeros; those four extra slots
# are the indicator dimensions used by the special tokens below.
for word in model:
    model[word] = np.append(model[word], [0,0,0,0])

# Special tokens: all-zero 504-long vectors with a single 1 in slots 500..503.
for slot, token in enumerate(("UNK", "GO", "STOP", "PAD")):
    token_vector = np.zeros([504])
    token_vector[500 + slot] = 1
    model[token] = token_vector


# dictionary[0][i] is paired with dictionary[1][i]; group the second by the first.
dictionary_dict = defaultdict(list)
for i in range(len(dictionary[0])):
    dictionary_dict[dictionary[0][i]].append(dictionary[1][i])
print ("created fast dictionary")



# print (get_hypernyms_to_adjectives("mansion"))
# print (get_hypernyms_to_adjectives("hut"))
# print (get_hypernyms_to_adjectives("hut"))


# Unique entries of dictionary[0], with the special tokens appended at the end.
all_words = list(set(dictionary[0]))
all_words.extend(["PAD","UNK", "STOP",  "GO"])



# Materialise the keys as a list: random.shuffle needs a mutable sequence, and
# dict.keys() is only a list on Python 2 (it is a view on Python 3).
words = list(model.keys())
random.shuffle(words)

# 75% / 25% split of the shuffled vocabulary.
split_at = int(len(words) * .75)
train_words = words[:split_at]
val_words = words[split_at:]



def get_vec(word):
    """Return the vector stored in ``model`` for ``word``, or the 'UNK' vector if absent."""
    return model[word] if word in model else model['UNK']


def cosine_similarity(a, b):
    """Return dot(a, b) divided by the product of the Euclidean norms of a and b."""
    norm_a = np.sqrt(np.sum(np.power(a,2)))
    norm_b = np.sqrt(np.sum(np.power(b,2)))
    dot_ab = np.dot(a,b)
    return dot_ab/(norm_a*norm_b)

def sort_topn(l, n):
    """Return the ``n`` entries of ``l`` with the largest second element, best first.

    Entries are ordered by ``entry[1]``.  The ascending sort is stable and is
    then reversed, so among equal scores the entry that appeared later in
    ``l`` comes out first.
    """
    ranked = sorted(l, key = lambda entry: entry[1])
    ranked.reverse()
    return ranked[:n]

# One row per entry of all_words, in the same order, so a row index can be
# mapped straight back to a word.
all_word_vectors = np.zeros([len(all_words), 504])
for i, word in enumerate(all_words):
    all_word_vectors[i,:] = model[word]

# Scale every row to unit L2 length.
norms = np.linalg.norm(all_word_vectors, ord=2, axis=1)
all_word_vectors = all_word_vectors / norms[:,None]
from scipy.spatial import KDTree

# Split the vocabulary into num_trees contiguous slices and index each slice
# with its own KD-tree.  Tree i starts at row chunk_size * i, which is the
# offset the lookup code adds back to a tree-local row index.
kd_forest = []

num_trees = 7
num_words = len(all_words)
# Floor division keeps the slice bounds integral on Python 2 and Python 3.
chunk_size = num_words // num_trees
for i in range(num_trees):
    start = chunk_size * i
    # The last tree also takes the num_words % num_trees leftover rows.
    # Previously those rows were indexed by no tree at all, and they are
    # exactly where the appended PAD/UNK/STOP/GO tokens live.
    end = num_words if i == num_trees - 1 else start + chunk_size
    word_tree = KDTree(all_word_vectors[start:end,:])
    kd_forest.append(word_tree)

# for i in word_tree.query([model['dog']], k=10)[1][0]:
#     print (all_words[i])
    



def alert(alert_contents):
    """Email ``alert_contents`` via smtp.gmail.com:587 using STARTTLS.

    The message is sent from the notebook's notification account to the
    recipient address hard-coded below.  Any smtplib error raised while
    connecting, logging in or sending propagates to the caller.
    """
    # Imported locally: the notebook's import cell never imports smtplib, so
    # the function previously failed with NameError on its first call.
    import smtplib

    # NOTE(security): the account password is hard-coded in the login call
    # below.  It should be rotated and read from the environment or a secrets
    # store instead of living in the notebook.
    server = smtplib.SMTP( "smtp.gmail.com", 587 )
    try:
        server.starttls()
        server.login( 'processing.update.system@gmail.com', 'updatesystem' )
        server.sendmail( 'processing.update.system@gmail.com', 'ijdykeman@gmail.com', alert_contents )
    finally:
        # Always release the connection, even when login or sending fails.
        try:
            server.quit()
        except smtplib.SMTPException:
            server.close()
    
    
# Number of rows in every encoded definition: get_vectors_for_definition keeps
# at most seq_max_len - 1 word vectors, appends STOP, then pads up to this length.
seq_max_len = 2
# Width of a word vector; equals the 504 used for the special-token vectors and
# the embedding matrix.  Not referenced by name in the visible cells.
word_vector_width = 504


# @memoize


def get_vectors_for_definition(defitnition):
    """Encode a definition as an array with ``seq_max_len`` rows of word vectors.

    The first ``seq_max_len - 1`` tokens of ``defitnition`` are looked up with
    ``get_vec``, the STOP vector is appended, and any remaining rows are
    filled with the PAD vector.

    Args:
        defitnition: iterable of tokens (the misspelled parameter name is kept
            so keyword callers keep working).

    Returns:
        numpy array with one row per position.
    """
    # Truncate *before* the lookup so only the kept tokens are embedded.
    # Indexing the result of map() only works on Python 2, where map returns
    # a list; the comprehension behaves the same on both versions.
    kept_tokens = list(defitnition)[:seq_max_len-1]
    definition = [get_vec(token) for token in kept_tokens]
    definition.append(model["STOP"])

    missing = seq_max_len - len(definition)
    definition = np.array(definition)
    if missing > 0:
        # Build the padding only when the definition is actually short.
        padding = np.array([model["PAD"]]*missing)
        definition = np.append(definition, padding, axis = 0)
    return definition

def get_batch(n, val = False):
    """Draw ``n`` random (word vector, encoded definition, label) examples.

    Returns a tuple ``(words, defs, y)`` of numpy arrays built from the
    sampled rows; every label row is ``[1, 0]``.

    NOTE(review): with ``val=True`` the sampled index is reduced modulo 10, so
    validation batches only ever use the first ten dictionary entries, while
    training indices (random index plus 1..9, wrapped) can land on those same
    entries.  Confirm whether a disjoint train/validation split was intended.
    """
    headwords = dictionary[0]
    definitions = dictionary[1]
    total = len(headwords)

    word_rows, def_rows, labels = [], [], []
    for _ in range(n):
        idx = random.randint(0, total - 1)
        if val:
            idx %= 10
        else:
            idx += random.randint(1, 9)
        # Wrap around so the shifted index stays inside the dictionary.
        idx %= total

        word_rows.append(get_vec(headwords[idx]))
        def_rows.append(get_vectors_for_definition(definitions[idx]))
        labels.append([1,0])

    return np.array(word_rows), np.array(def_rows), np.array(labels)


created fast dictionary


In [52]:
def get_most_similar(vec, exclude = None):
    """Return the vocabulary word whose vector is most cosine-similar to ``vec``.

    Every tree in ``kd_forest`` contributes its single nearest neighbour; the
    candidates are then ranked by cosine similarity against ``vec``.

    Args:
        vec: query vector.
        exclude: optional word to leave out of the ranking.

    Returns:
        The best-scoring word, or "none" when no candidate remains.
    """
    # Tree i indexes the slice of all_words that starts at chunk * i, so a
    # tree-local row is shifted by that amount to get the global row.  Floor
    # division keeps the offset an integer on Python 3 as well.
    chunk = num_words // len(kd_forest) if kd_forest else 0

    words = []
    for i, tree in enumerate(kd_forest):
        local_row = tree.query([vec], k=1)[1][0]
        word = all_words[local_row + chunk * i]
        # Per-tree candidate trace (function-call form parses on Python 2 and 3).
        print (word)
        words.append(word)

    # ("none", -2) scores below any cosine similarity, so it is only returned
    # when every candidate was excluded.
    topn = [("none", -2)]
    for word in words:
        if word != exclude:
            topn.append((word, cosine_similarity(vec, model[word])))

    # Rank once after all candidates are scored instead of on every iteration.
    return sort_topn(topn, 15)[0][0]

# def get_most_similar(vec, exclude = None):
#     topn = [("none", -2)]
    
#     words = []
#     for i in kd_forest[0].query([vec], k=2)[1][0]:
#         words.append(all_words[i])
#     for word in words:
#         if word != exclude:
#             topn.append((word, cosine_similarity(vec, model[word])))
#             topn = sort_topn(topn, 15)
#     return topn[0][0]

# Smoke test: query the forest with the stored 'dog' vector itself.
get_most_similar(model['dog'])

dogs
puppy
pet
kennel
dog
poodle
rabbit


'dog'

In [61]:
vec = model['dog']
def f(i):
    """Return the word nearest to the module-level ``vec`` within tree ``i`` of ``kd_forest``.

    Reads ``vec``, ``kd_forest``, ``all_words`` and ``num_words`` from module
    globals, so pool workers must be able to see them (presumably via a
    fork-started pool; verify on other platforms).
    """
    # Floor division keeps the slice offset an integer on Python 3 as well.
    return all_words[kd_forest[i].query([vec], k=1)[1][0] + num_words//len(kd_forest)*i]

import multiprocessing
# One worker per tree; each worker answers the query for a single tree.
pool = multiprocessing.Pool(len(kd_forest))
try:
    print (pool.map(f, range(len(kd_forest))))
finally:
    # Shut the workers down so they do not linger in the kernel after the cell.
    pool.close()
    pool.join()

['dogs', 'puppy', 'pet', 'kennel', 'dog', 'poodle', 'rabbit']


In [None]:
import numpy as np
import tensorflow as tf
from collections import deque
import functools
from time import gmtime, strftime
import seaborn as sns
from os import listdir
from os.path import isfile, join
from scipy.stats.stats import pearsonr
import random as rand
import pyproj
import functools
import pickle 
import matplotlib.pyplot as plt
import pandas
import scipy
import os
import gc
from collections import defaultdict
import random
%matplotlib inline

# Seed every random number generator this notebook draws from, so a fresh
# "Restart & Run All" reproduces the same shuffles and batches.
np.random.seed(0)
tf.set_random_seed(0)
# The stdlib generator (not numpy's) performs the random.shuffle of the
# vocabulary and the random.randint draws in get_batch, so it needs its own seed.
random.seed(0)


    
def memoize(obj, maxlen = 200):
    """Wrap ``obj`` in a bounded cache keyed on its first positional argument.

    The cache dict and the insertion-order deque are attached to ``obj`` as
    ``obj.cache`` and ``obj.deck``.  Only ``args[0]`` is used as the key; any
    further positional or keyword arguments are forwarded to ``obj`` but do
    not distinguish cache entries.  Once ``maxlen`` results are stored, the
    oldest inserted entry is evicted before a new one is computed.
    """
    obj.cache = {}
    obj.deck = deque([], maxlen = maxlen)
    cache = obj.cache
    deck = obj.deck

    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        key = args[0]
        if key in cache:
            return cache[key]

        # Cache miss: make room first (first-in, first-out), then compute.
        if len(deck) == deck.maxlen:
            oldest_key, _ = deck.popleft()
            del cache[oldest_key]

        result = obj(*args, **kwargs)
        cache[key] = result
        deck.append((key, result))
        return result

    return memoizer


# Sample-count constant; nothing in the visible cells reads it.
n_samples = 800000
def xavier_init(fan_in, fan_out, constant=1):
    """Build a (fan_in, fan_out) float32 tensor drawn uniformly from a Xavier range.

    The range is symmetric: +/- constant * sqrt(6 / (fan_in + fan_out)).
    """
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    bound = constant*np.sqrt(6.0/(fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval=-bound,
                             maxval=bound,
                             dtype=tf.float32)


# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point these paths at pickles you produced or otherwise trust.
with open("../training_data/D_cbow_pdw_8B.pkl", 'rb') as f:
    model = pickle.load(f)

with open("../training_data/training_data.pkl", 'rb') as f:
    dictionary = pickle.load(f)

# Widen every stored embedding by four trailing zeros; those four extra slots
# are the indicator dimensions used by the special tokens below.
for word in model:
    model[word] = np.append(model[word], [0,0,0,0])

# Special tokens: all-zero 504-long vectors with a single 1 in slots 500..503.
for slot, token in enumerate(("UNK", "GO", "STOP", "PAD")):
    token_vector = np.zeros([504])
    token_vector[500 + slot] = 1
    model[token] = token_vector


# dictionary[0][i] is paired with dictionary[1][i]; group the second by the first.
dictionary_dict = defaultdict(list)
for i in range(len(dictionary[0])):
    dictionary_dict[dictionary[0][i]].append(dictionary[1][i])
print ("created fast dictionary")



# print (get_hypernyms_to_adjectives("mansion"))
# print (get_hypernyms_to_adjectives("hut"))
# print (get_hypernyms_to_adjectives("hut"))


# Unique entries of dictionary[0], with the special tokens appended at the end.
all_words = list(set(dictionary[0]))
all_words.extend(["PAD","UNK", "STOP",  "GO"])



# Materialise the keys as a list: random.shuffle needs a mutable sequence, and
# dict.keys() is only a list on Python 2 (it is a view on Python 3).
words = list(model.keys())
random.shuffle(words)

# 75% / 25% split of the shuffled vocabulary.
split_at = int(len(words) * .75)
train_words = words[:split_at]
val_words = words[split_at:]



def get_vec(word):
    """Look ``word`` up in ``model``; unknown words map to the 'UNK' vector."""
    if word not in model:
        return model['UNK']
    return model[word]


def cosine_similarity(a, b):
    """Return dot(a, b) divided by the product of the Euclidean norms of a and b."""
    norm_a = np.sqrt(np.sum(np.power(a,2)))
    norm_b = np.sqrt(np.sum(np.power(b,2)))
    dot_ab = np.dot(a,b)
    return dot_ab/(norm_a*norm_b)

def sort_topn(l, n):
    """Return the ``n`` entries of ``l`` with the largest second element, best first.

    Entries are ordered by ``entry[1]``.  The ascending sort is stable and is
    then reversed, so among equal scores the entry that appeared later in
    ``l`` comes out first.
    """
    ascending = sorted(l, key = lambda entry: entry[1])
    return ascending[::-1][:n]

# One row per entry of all_words, in the same order, so a row index returned
# by the tree maps straight back to a word.
all_word_vectors = np.zeros([len(all_words), 504])
for i, word in enumerate(all_words):
    all_word_vectors[i] = model[word]

# Scale every row to unit L2 length.
norms = np.linalg.norm(all_word_vectors, ord=2, axis=1)
all_word_vectors = all_word_vectors / norms[:, np.newaxis]

from scipy.spatial import KDTree
word_tree = KDTree(all_word_vectors)

# Demo: print the ten rows nearest to the 'dog' vector.
_, nearest_rows = word_tree.query([model['dog']], k=10)
for i in nearest_rows[0]:
    print (all_words[i])
    



def alert(alert_contents):
    """Email ``alert_contents`` via smtp.gmail.com:587 using STARTTLS.

    The message is sent from the notebook's notification account to the
    recipient address hard-coded below.  Any smtplib error raised while
    connecting, logging in or sending propagates to the caller.
    """
    # Imported locally: the notebook's import cell never imports smtplib, so
    # the function previously failed with NameError on its first call.
    import smtplib

    # NOTE(security): the account password is hard-coded in the login call
    # below.  It should be rotated and read from the environment or a secrets
    # store instead of living in the notebook.
    server = smtplib.SMTP( "smtp.gmail.com", 587 )
    try:
        server.starttls()
        server.login( 'processing.update.system@gmail.com', 'updatesystem' )
        server.sendmail( 'processing.update.system@gmail.com', 'ijdykeman@gmail.com', alert_contents )
    finally:
        # Always release the connection, even when login or sending fails.
        try:
            server.quit()
        except smtplib.SMTPException:
            server.close()
    
    
# Number of rows in every encoded definition: get_vectors_for_definition keeps
# at most seq_max_len - 1 word vectors, appends STOP, then pads up to this length.
seq_max_len = 2
# Width of a word vector; equals the 504 used for the special-token vectors and
# the embedding matrix.  Not referenced by name in the visible cells.
word_vector_width = 504


# @memoize
def get_most_similar(vec, exclude = None):
    """Return the vocabulary word whose vector is most cosine-similar to ``vec``.

    The 20 nearest rows of ``word_tree`` are taken as candidates and then
    ranked by cosine similarity against ``vec``.

    Args:
        vec: query vector.
        exclude: optional word to leave out of the ranking.

    Returns:
        The best-scoring word, or "none" when no candidate remains.
    """
    neighbour_rows = word_tree.query([vec], k=20)[1][0]

    # ("none", -2) scores below any cosine similarity, so it is only returned
    # when every candidate was excluded.
    topn = [("none", -2)]
    for i in neighbour_rows:
        # When the tree holds fewer than k points, scipy reports the missing
        # neighbours with an index equal to the number of points; skip those
        # rather than indexing past the end of all_words.
        if i >= len(all_words):
            continue
        word = all_words[i]
        if word != exclude:
            topn.append((word, cosine_similarity(vec, model[word])))

    # Rank once after all candidates are scored instead of on every iteration.
    return sort_topn(topn, 15)[0][0]

def get_vectors_for_definition(defitnition):
    """Encode a definition as an array with ``seq_max_len`` rows of word vectors.

    The first ``seq_max_len - 1`` tokens of ``defitnition`` are looked up with
    ``get_vec``, the STOP vector is appended, and any remaining rows are
    filled with the PAD vector.

    Args:
        defitnition: iterable of tokens (the misspelled parameter name is kept
            so keyword callers keep working).

    Returns:
        numpy array with one row per position.
    """
    # Truncate *before* the lookup so only the kept tokens are embedded.
    # Indexing the result of map() only works on Python 2, where map returns
    # a list; the comprehension behaves the same on both versions.
    kept_tokens = list(defitnition)[:seq_max_len-1]
    definition = [get_vec(token) for token in kept_tokens]
    definition.append(model["STOP"])

    missing = seq_max_len - len(definition)
    definition = np.array(definition)
    if missing > 0:
        # Build the padding only when the definition is actually short.
        padding = np.array([model["PAD"]]*missing)
        definition = np.append(definition, padding, axis = 0)
    return definition

def get_batch(n, val = False):
    """Draw ``n`` random (word vector, encoded definition, label) examples.

    Returns a tuple ``(words, defs, y)`` of numpy arrays built from the
    sampled rows; every label row is ``[1, 0]``.

    NOTE(review): with ``val=True`` the sampled index is reduced modulo 10, so
    validation batches only ever use the first ten dictionary entries, while
    training indices (random index plus 1..9, wrapped) can land on those same
    entries.  Confirm whether a disjoint train/validation split was intended.
    """
    headwords = dictionary[0]
    definitions = dictionary[1]
    total = len(headwords)

    word_rows, def_rows, labels = [], [], []
    for _ in range(n):
        idx = random.randint(0, total - 1)
        if val:
            idx %= 10
        else:
            idx += random.randint(1, 9)
        # Wrap around so the shifted index stays inside the dictionary.
        idx %= total

        word_rows.append(get_vec(headwords[idx]))
        def_rows.append(get_vectors_for_definition(definitions[idx]))
        labels.append([1,0])

    return np.array(word_rows), np.array(def_rows), np.array(labels)
