In [38]:
#from Vocabscraper import VocabularyScraper
import os
import pickle
import pandas
from copy import deepcopy
from collections import Counter
import re
import nltk
from  nltk.tokenize import word_tokenize as tokenizer
import itertools
import string
import numpy as np
from nltk import ngrams
import tensorflow as tf
import time
import csv
from tensorflow.contrib.tensorboard.plugins import projector

In [39]:
def load_data(preprocessed=False):
    
    ''' Helper function to load dataset '''
    data=None
    FILE_PATH= "words.pkl"
    if not os.path.exists(FILE_PATH):
        cursor = VocabularyScraper(None).db_executer
        cursor.execute("SELECT term,definitions,examples FROM terms")
        data = cursor.fetchall()
        with open(FILE_PATH,"wb") as file:
            pickle.dump(data,file)
    else:
        with open(FILE_PATH,"rb") as file:
            data = pickle.load(file)
    return data



In [40]:
def look_up_word(term,data):
    '''Name of the function is self-explanatory '''
        
    for word in data:
        if word[0]==term:
            return word
    return None    

In [41]:
def find_optimum_length(defs_exs,percentiles):
    
    '''Name of the function is self-explanatory '''
    
    stats = [len(ex) for ex in defs_exs]
    df= pandas.DataFrame(stats,columns=["length of samples"])
    return df.describe(percentiles=percentiles)
    
    
    
    

In [42]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

In [43]:
def ex_remove_red_chars(*examples):
    
    ''' Function to remove redundant chars , like punctuation or part of speech markers
        Then to split them using "#$" or "&#" delimiters'''
    
    result=[]
    for example in examples:
        if example is not None:
            punctuation=re.sub("[#$&]*","",string.punctuation)
            punctuation = [char for char in punctuation]
            del_patterns = ["&[\w]+&",'”',"^[\s]","“","adj\n","v\n","n\n","adv\n",]
            del_patterns.extend(punctuation)
            for pattern in del_patterns:
                example = example.replace(pattern,"")
            example = example.replace("—"," ")
            example = example.lower()
            example = re.split(r"[&#$]*;*",example) 
            
            result.append(example)
        else:
            result.append("")
    return tuple(result)


In [44]:
def process_duplicates(data):
    ''' Function to avoid storing unneccessary duplicates'''
    
    terms= set()
    defs_exs=set()
   
    
    for word in data:
        terms.add(word[0])
        definitions,examples = word[1],word[2]
        definitions,examples =  ex_remove_red_chars(definitions,examples)
        defs_exs.update(definitions,examples)
        
    return(terms,defs_exs)    


In [45]:
words=load_data()

In [46]:
look_up_word("word",words)

('word',
 'A word is a unit of language that native speakers can agree upon as a separate and distinct unit of meaning. Languages are made up of words. You must like words since you are here, on this word site. &$ If you say "What`s the good word?" you’re not actually asking for a word — you’re asking, "How are you?" or "What`s doing?" But if you ask "What`s the word on that restaurant?" you`re asking for an opinion. You can also "give your word," but that means that you`re promising something; you`re giving an oath. If your friend asks what you think of his girlfriend, you may want to word your response carefully — to watch your words. &$ n\na unit of language that native speakers can identify &$ n\na brief statement &$ n\na verbal command for action &$ n\nan exchange of views on some topic &$ n\na promise &$ n\na secret word or phrase known only to a restricted group &$ n\ninformation about recent and important events &$ n\na word is a string of bits stored in computer memory &$ v\np

In [47]:
terms , defs_exs = process_duplicates(words)


  return _compile(pattern, flags).split(string, maxsplit)


In [48]:
def create_lookup_dicts(terms,defs_and_exs,threshhold_min,threshhold_max):
    
    ''' Function to create helping dictionaries for future word-to-integers (or conversely) conversion'''
    
    all_text = list(itertools.chain(*[sent.split() for sent in list(defs_exs)]))
    counter=Counter(all_text)

    index=0
    word_to_index=dict()
    
    
    
    for key,indx in counter.items():
        if not hasNumbers(key):
            if indx>threshhold_min and indx<threshhold_max:    
                word_to_index[key]=index
                index+=1
        
    #Terms should be treated differently,avoiding split,that's why we add them seperately
    
    for word in terms:
        if word not in word_to_index:
            word_to_index[word]=index
            index+=1
            
    index_to_word=dict()
    
    for key,index in word_to_index.items():
        index_to_word[index]=key
    return (word_to_index,index_to_word)


    
            

In [15]:
hasNumbers("c12h8nh")

True

In [49]:
word_to_index,index_to_word=create_lookup_dicts(terms,defs_exs,1,100000)

In [17]:
def encode(defs_exs,word2int,unknown_int=-2):
    '''Function to encode words to integers'''
    sentences=[]
    for example in defs_exs:
        sentence=[]
        for word in example.split():
            try:
                sentence.append(word2int[word])
            except KeyError:
                sentence.append(unknown_int)
        sentences.append(sentence) 
    encoded=np.array(sentences)
    return encoded
        

In [50]:
encoded=encode(defs_exs,word_to_index)

In [55]:
encoded

array([list([]), list([137662, 52326, -2, 82799, -2, 41095]),
       list([137662, 28923, 137662, 79590]), ...,
       list([242243, 64282, 320712, 242243, 4484, 30993, 21785, 242243, 13371, 320712, 27606, 54956, 137662, 16080, 4315]),
       list([38405, 31993, -2, 152005, 8850, 46304]),
       list([5040, 137662, 242243, 13371, -2, 74011, 85020, 41739, 24709])], dtype=object)

In [20]:
def skipgrams(sequence, n, k):
    ''' Can be deleted , was used for testing'''
    for ngram in ngrams(sequence, n + k, pad_right=True):
        head = ngram[:1]
        tail = ngram[1:]
        for skip_tail in itertools.combinations(tail, n - 1):
            if skip_tail[-1] is None:
                continue
            yield head + skip_tail

In [57]:
word_to_index

{'': 87561,
 'howard': 3,
 'unsugary': 87567,
 'dickensian': 32810,
 'membranophone': 87570,
 'chemical-formula': 87571,
 'splinters': 6,
 'prohibitively': 87574,
 'situation': 9,
 'congestive heart failure': 87575,
 'come the acid': 87578,
 'cognitive neuroscientist': 262798,
 'eft': 11,
 'shoed': 13,
 'metronomy': 87582,
 'ceibhfhionn': 87584,
 'affrontery': 87585,
 'lozenged': 174874,
 'spatting': 55069,
 'tautophony': 87596,
 'blackfoot daisy': 87597,
 'fronted': 49789,
 'ideogrammatic': 87600,
 'picot': 87602,
 'hem and haw': 87604,
 'Nuclear Regulatory Commission': 87606,
 'unbranded': 26,
 'two-dimensionality': 340120,
 'brass monkey': 87608,
 'tabloidize': 87569,
 'niddah': 87610,
 'dictyopterous insect': 87611,
 'unattempted': 87615,
 'Curietherapy': 313625,
 'microsurgically': 87617,
 'chuiwan': 87619,
 'postfactum': 87621,
 'tetradecanoic': 72447,
 'nonhuman': 36,
 'roundbottomed': 37,
 'eutectic mixture': 87624,
 'briartite': 87626,
 'Kendall rank correlation': 87627,
 'Sal

In [58]:

def generate_batch(sequence,window_size,filler=-1):
    
    '''Function which generates a set of batches from int-like sequence
    default argument filler (-1) could be changed accoring to your needs'''
    
    batchesX = []
    batchesY = []
    
    if len(sequence)>window_size: 
        
        for index,num in enumerate(sequence):
            range1 = sequence[index-window_size:index]
            range2 = sequence[index+1:index+window_size+1]

            if len(range1) is not window_size:
                range1 = [filler]*(window_size-len(range1)) + range1

            if len(range2) is not window_size:
                range2 = range2 + [filler]*(window_size-len(range2))

            batchesX.append(range1+range2)
            batchesY.append(num)
            
    else:
        
        return None,None
        
    return (batchesX,batchesY)



                   


In [59]:
def generate_batches(enc_sequences,window_size=3): 
    
    '''Function which iteratevely generate batches through encoded sequence'''
    
    batchesX=[]
    batchesY=[]
    for value in enc_sequences:
        batchX , batchY = generate_batch(value,window_size)
        if batchX is not None:
            batchesX.append(np.array(batchX,dtype=np.int32))
            batchesY.append(np.array(batchY,dtype=np.int32))
            
    return (batchesX,batchesY)
    
        
              
            
    
        
    
    
    




In [60]:
batchesX,batchesY=generate_batches(encoded)

In [61]:
def pregen_equally_sized_batch(batchesX,batchesY,batch_size=256):
    
    '''First part of generator chain in order to save some computation time' and make batches of the same shape'''
    
    batchX,batchY = generate_empty_batches_array()
    
    for batch in zip(batchesX,batchesY):
        for i in range(len(batch[0])):
            X_sample = batch[0][i] 
            Y_sample = batch[1][i]
            batchX.append(X_sample)
            batchY.append(Y_sample)
            if len(batchX)==batch_size:
                
                batchX,batchY = np.array(batchX,dtype=np.int32),np.array(batchY,dtype=np.int32) 
                
                yield (batchX,batchY)
                batchX,batchY = generate_empty_batches_array()
        
     
        
            
    

In [62]:
def gen_batch(batchesX,batchesY,batch_size=256):
    
    batchX,batchY = generate_empty_batches_array()
    pregen = pregen_equally_sized_batch(batchesX,batchesY,batch_size)
    for batch in pregen:
        for index1,center in enumerate(batch[1]):
            for index2,target in enumerate(batch[0][index1]):
                if target!=-1 and target!=-2 and center!=-2:
                    batchX.append(center)
                    batchY.append(target)
                    if len(batchX)==batch_size:
                        indices = np.random.permutation(batch_size)
                        batchX ,batchY= np.array(batchX),np.array(batchY).reshape(-1,1)
                        batchX , batchY = batchX[indices],batchY[indices]
                        yield(batchX,batchY)
                        batchX,batchY = generate_empty_batches_array()
    
   

In [63]:
def generate_empty_batches_array():
    ''' Name of function is self-explanatory'''
    
    batchesX=[]
    batchesY=[]
    
    return (batchesX,batchesY)

In [66]:

class Word2Vec:
    
    def __init__(self,batch_size,word_to_int,embedding_size,learning_rate,training_steps):
        tf.reset_default_graph()
        self.batch_size=batch_size
        self.vocab_len=len(word_to_int)
        self.word_to_int=word_to_int
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.training_steps=training_steps
        self.init_vars()
       
        
    def init_vars(self):
        
        with tf.name_scope("data"):
            self.center_words = tf.placeholder(dtype=tf.int32,shape=[self.batch_size],name="self.center_words")
            self.target_words = tf.placeholder(dtype=tf.int32,shape=[self.batch_size,1],name="self.target_words")
            self.global_step = tf.Variable(0,trainable=False,name="global_step")
            
        with tf.name_scope("embed"):
            self.embedding_matrix = tf.Variable(tf.random_uniform([self.vocab_len,self.embedding_size],-1.0,1.0),name="embedding_matrix")
           
        with tf.name_scope("loss"):
            self.embedding = tf.nn.embedding_lookup(self.embedding_matrix,self.center_words,name="embed")
            self.nce_weights = tf.Variable(tf.truncated_normal([self.vocab_len,self.embedding_size],stddev=1/(self.embedding_size**0.5)),
                                      name="nce_weights")
            self.nce_biases = tf.Variable(tf.zeros([self.vocab_len]),name="nce_biases")
            
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weights, 
                                            biases=self.nce_biases, 
                                            labels=self.target_words, 
                                            inputs=self.embedding, 
                                            num_sampled=64, 
                                            num_classes=self.vocab_len), name='loss')
                           
                                        
        with tf.name_scope("summary"):
            tb_loss=tf.summary.scalar("loss",self.loss)
            tb_hs_loss=tf.summary.histogram("histogram_loss",self.loss)
            self.summary_op = tf.summary.merge([tb_loss,tb_hs_loss])
                                     
                                     
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss,global_step=self.global_step)
        
        
        
        
        self.batch_gen=gen_batch(batchesX,batchesY,self.batch_size)
        self.average_loss=0.0
        self.saver = tf.train.Saver()
       
    def visualize_embeddings(self):
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                self.saver.restore(sess,save_path="checkpoint_dir/word2vec-283331")
                for v in tf.trainable_variables():

                    if v.name=="embed/embedding_matrix:0":
                        result=v


                final_embed_matrix = sess.run(result)
                print(final_embed_matrix)


                # # it has to variable. constants don't work here. you can't reuse model.embed_matrix
                embedding_var = tf.Variable(final_embed_matrix, name='embedding')
                sess.run(embedding_var.initializer)

                config = projector.ProjectorConfig()
                summary_writer = tf.summary.FileWriter('processed')

                 # add embedding to the config file
                embedding = config.embeddings.add()
                embedding.tensor_name = embedding_var.name

                # # link this tensor to its metadata file, in this case the first 500 words of vocab
                embedding.metadata_path = 'vocab_all_words.tsv'

               # saves a configuration file that TensorBoard will read during startup.
                projector.visualize_embeddings(summary_writer, config)
                saver_embed = tf.train.Saver([embedding_var])
                saver_embed.save(sess, 'processed/model.ckpt', 1)
                summary_writer.close()

                
    def save(self,sess,global_step):
        directory = "checkpoint_dir/word2vec"
        self.saver.save(sess,directory,global_step)
        return global_step
        

   

    def execute_training(self):
            
            with tf.Session() as sess:

                sess.run(tf.global_variables_initializer())
                time0 = time.time()
                self.writer=tf.summary.FileWriter("./stats",sess.graph)

                for i in range(self.training_steps):

                    try:
                        batch = next(self.batch_gen)
                        center_word = batch[0]
                        target_word = batch[1]
                    except StopIteration:
                        self.global_step=self.save(sess,i)
                        self.writer.close()
                        break

                    feed_dict = {self.center_words:center_word,self.target_words:target_word}

                    loss_get,_,summary= sess.run([self.loss,self.optimizer,self.summary_op],feed_dict = feed_dict)

                    self.average_loss+=loss_get
                    self.writer.add_summary(summary,global_step = i)
                    

                    if (i+1) % 10000==0:
                        time1 = time.time()
                        print("Average loss at timestep {} is {:5.1f} , time to calculate {} s ".format(i+1,
                                                                           self.average_loss/i+1,round(time1-time0,3)))
                        time0 =time.time()
                    
                    if (i+1) %self.training_steps==0:
                        self.global_step = self.save(sess,i)



                self.writer.close()
                
                
    def load_words_to_file(self):
           
            FILE_PATH = "processed/vocab_all_words.tsv" 
            
            if not os.path.exists(FILE_PATH):
                words = self.word_to_int.keys()
                with open(FILE_PATH,"w") as file:
                       for word in words:
                            print(word,file=file)
                        


            

                          

In [67]:
model = Word2Vec(batch_size=64,embedding_size=128,word_to_int=word_to_index,learning_rate=2.0,training_steps=800000)

In [68]:
model.execute_training()

Average loss at timestep 10000 is 144.6 , time to calculate 29.023 s 
Average loss at timestep 20000 is 115.6 , time to calculate 25.849 s 
Average loss at timestep 30000 is  98.8 , time to calculate 33.087 s 
Average loss at timestep 40000 is  87.3 , time to calculate 34.543 s 
Average loss at timestep 50000 is  78.6 , time to calculate 35.67 s 
Average loss at timestep 60000 is  71.9 , time to calculate 29.793 s 
Average loss at timestep 70000 is  66.3 , time to calculate 32.281 s 
Average loss at timestep 80000 is  61.9 , time to calculate 26.501 s 
Average loss at timestep 90000 is  58.4 , time to calculate 28.757 s 
Average loss at timestep 100000 is  55.6 , time to calculate 32.926 s 
Average loss at timestep 110000 is  53.2 , time to calculate 32.783 s 
Average loss at timestep 120000 is  50.9 , time to calculate 29.787 s 
Average loss at timestep 130000 is  48.9 , time to calculate 32.295 s 
Average loss at timestep 140000 is  47.1 , time to calculate 31.933 s 
Average loss at 

In [30]:
model.load_words_to_file()

In [33]:
model.visualize_embeddings()

INFO:tensorflow:Restoring parameters from checkpoint_dir/word2vec-283331
[[ 0.06376919 -1.6537497  -1.26960015 ..., -0.82139474 -1.40726221
  -0.59549248]
 [-0.93887901  0.22774372 -0.82737404 ..., -0.16128071 -1.50214553
  -0.50000304]
 [ 0.03260875 -0.23994614  0.22925663 ..., -0.10199402 -0.9900254
  -0.12052806]
 ..., 
 [-0.28072786  0.28797293 -0.94665623 ..., -0.88811731  0.5270896
  -0.0510006 ]
 [-0.12864733 -0.41069007 -0.44026446 ...,  0.28916502  0.83757448
   0.17500448]
 [-0.92102695 -0.35959029  0.70663214 ...,  0.84626937  0.05432224
  -0.6420176 ]]


In [273]:
gen = gen_batch(batchesX,batchesY)



In [309]:
model.visualize_embeddings = visualize_embeddings

In [31]:
word_to_index

{'': 87560,
 'embulk': 87561,
 'incompetency': 0,
 'Decumaria barbara': 87563,
 'handshake': 4,
 'pazinaclone': 87570,
 'preservatives': 43884,
 'clearance': 8,
 'claiming': 51215,
 'podder': 87574,
 'Malawi': 262441,
 'escape valve': 87576,
 'Fritz W. Meissner': 87565,
 'bp': 65936,
 'change-of-life': 87581,
 'disapprovingly': 14,
 'conveys': 16,
 'befly': 87585,
 'nonsepticemic': 87589,
 'decadently': 87587,
 'barberiite': 87596,
 'vitreous humor': 87597,
 'egad': 87599,
 'US Post Office': 87600,
 'contractedly': 87604,
 'frog`s-bit': 87606,
 'roosevelt': 31,
 'nonearthbound': 87567,
 'Transylvania': 87607,
 'crested wheatgrass': 87610,
 'hermaphroditic': 81609,
 'parkinson': 87614,
 'inserted': 36,
 'twenty percent': 87617,
 'decrepit': 42,
 'sebastes': 44,
 'take in charge': 87623,
 'surface fire': 87624,
 'heptadecenoic': 45,
 'intermembranous': 87628,
 'honeylike': 55904,
 'outthink': 87631,
 'fila': 87573,
 'goalscorer': 23751,
 'temporary state': 87635,
 'leaderette': 87637,
 '