In [1]:
from Vocabscraper import VocabularyScraper
import os
import pickle
import pandas
from copy import deepcopy
from collections import Counter
import re
import nltk
from  nltk.tokenize import word_tokenize as tokenizer
import itertools
import string
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from nltk import ngrams
import tensorflow as tf


Using TensorFlow backend.


In [2]:
def load_data(preprocessed=False):
    
    ''' Helper function to load dataset '''
    data=None
    FILE_PATH= "words.pkl"
    if not os.path.exists(FILE_PATH):
        cursor = VocabularyScraper(None).db_executer
        cursor.execute("SELECT term,definitions,examples FROM terms")
        data = cursor.fetchall()
        with open(FILE_PATH,"wb") as file:
            pickle.dump(data,file)
    else:
        with open(FILE_PATH,"rb") as file:
            data = pickle.load(file)
    return data



In [3]:
def look_up_word(term,data):
    '''Name of the function is self-explanatory '''
        
    for word in data:
        if word[0]==term:
            return word
    return None    

In [4]:
def find_optimum_length(defs_exs,percentiles):
    
    '''Name of the function is self-explanatory '''
    
    stats = [len(ex) for ex in defs_exs]
    df= pandas.DataFrame(stats,columns=["length of samples"])
    return df.describe(percentiles=percentiles)
    
    
    
    

In [5]:
def ex_remove_red_chars(*examples):
    
    ''' Function to remove redundant chars , like punctuation or part of speech markers
        Then to split them using "#$" or "&#" delimiters'''
    
    result=[]
    for example in examples:
        if example is not None:
            punctuation=re.sub("[#$&]*","",string.punctuation)
            punctuation = [char for char in punctuation]
            del_patterns = ["&[\w]+&",'”',"^[\s]","“","adj\n","v\n","n\n","adv\n"]
            del_patterns.extend(punctuation)
            for pattern in del_patterns:
                example = example.replace(pattern,"")
            example = re.split(r"[&#$]*;*",example) 
            
            result.append(example)
        else:
            result.append("")
    return tuple(result)


In [6]:
def process_duplicates(data):
    ''' Function to avoid storing unneccessary duplicates'''
    
    terms= set()
    defs_exs=set()
   
    
    for word in data:
        terms.add(word[0])
        definitions,examples = word[1],word[2]
        definitions,examples =  ex_remove_red_chars(definitions,examples)
        defs_exs.update(definitions,examples)
        
    return(terms,defs_exs)    


In [7]:
words=load_data()

In [8]:
terms , defs_exs = process_duplicates(words)


  return _compile(pattern, flags).split(string, maxsplit)


In [9]:
def create_lookup_dicts(terms,defs_and_exs):
    
    ''' Function to create helping dictionaries for future word-to-integers (or conversely) conversion'''
    
    all_text = list(itertools.chain(*[sent.split() for sent in list(defs_exs)]))
    counter=Counter(all_text)
    index=0
    word_to_index=dict()
    
    for key,indx in counter.items():
        word_to_index[key]=index
        index+=1
        
    #Terms should be treated differently,avoiding split,that's why we add them seperately
    
    for word in terms:
        if word not in word_to_index:
            word_to_index[word]=index
            index+=1
            
    index_to_word=dict()
    
    for key,index in word_to_index.items():
        index_to_word[index]=key
    return (word_to_index,index_to_word)    
            

In [10]:
word_to_index,index_to_word = create_lookup_dicts(terms,defs_exs)

In [11]:
look_up_word("god",words)

('god',
 'A god is a supreme being or deity, and it`s spelled with a lowercase g when you`re not referring to the God of Christian, Jewish, or Muslim tradition. The ancient Greeks had many gods — including Zeus, Apollo, and Poseidon. &$ A physical representation of a deity is also called a god. If you go to Hawaii, you can even buy a god in a gift shop — a statue or idol that represents one of the Hawaiian gods, like a figure of the god Pele. The word god also refers to a man of superior quality or exceptional beauty. Elvis Presley was considered a god by many teenage girls in the late 1950s. &$ n\nany supernatural being worshipped as controlling some part of the world or some aspect of life or who is the personification of a force &$ n\na man of such superior qualities that he seems like a deity to other people &$ n\na material effigy that is worshipped &$ n\nthe supernatural being conceived as the perfect and omnipotent and omniscient originator and ruler of the universe; the object 

In [12]:
def encode(defs_exs,word2int):
    '''Function to encode words to integers'''
    sentences=[]
    for example in defs_exs:
        sentence=[]
        for word in example.split():
            sentence.append(word2int[word])
        sentences.append(sentence) 
    encoded=np.array(sentences)
    return encoded
        

In [13]:
encoded=encode(defs_exs,word_to_index)

In [14]:
encoded

array([[], [0, 1, 2, 3, 4, 5],
       [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], ...,
       [547, 9, 659, 50, 8422, 136477, 40090, 56, 2266, 157, 3773, 141, 7692, 26436],
       [9316, 2754, 24958, 302, 2322, 1987, 3973, 5169],
       [1851, 4490, 13909, 6010, 56, 0, 9073, 96793, 0, 9073, 867]], dtype=object)

In [15]:
def skipgrams(sequence, n, k):
    ''' Can be deleted , was used for testing'''
    for ngram in ngrams(sequence, n + k, pad_right=True):
        head = ngram[:1]
        tail = ngram[1:]
        for skip_tail in itertools.combinations(tail, n - 1):
            if skip_tail[-1] is None:
                continue
            yield head + skip_tail

In [16]:
encoded[:10]

array([[], [0, 1, 2, 3, 4, 5],
       [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [15, 7, 20, 21, 22, 0, 23],
       [0, 24, 25, 7, 26, 27, 28, 29, 30, 28, 31, 32, 33, 34],
       [35, 36, 13, 37, 38, 39, 40, 41, 42, 0, 43, 44, 36, 45, 46, 47, 48, 49, 50, 51, 13, 43, 50, 52, 53, 54, 55, 56, 57, 9, 13, 58, 59, 60, 61],
       [62, 63, 64, 65, 66],
       [13, 67, 38, 68, 69, 15, 70, 71, 72, 32, 13, 73, 38, 74, 75, 76, 50, 77],
       [78, 79, 80, 60, 81, 82, 60, 83, 9, 84], [0, 85, 9, 86, 38, 87]], dtype=object)

In [17]:

def generate_batch(sequence,window_size,filler=-1):
    
    '''Function which generates a set of batches from int-like sequence
    default argument filler (-1) could be changed accoring to your needs'''
    
    batchesX = []
    batchesY = []
    
    if len(sequence)>window_size: 
        
        for index,num in enumerate(sequence):
            range1 = sequence[index-window_size:index]
            range2 = sequence[index+1:index+window_size+1]

            if len(range1) is not window_size:
                range1 = [filler]*(window_size-len(range1)) + range1

            if len(range2) is not window_size:
                range2 = range2 + [filler]*(window_size-len(range2))

            batchesX.append(range1+range2)
            batchesY.append(num)
            
    else:
        
        return None,None
        
    return (batchesX,batchesY)



                   


In [18]:
def generate_batches(enc_sequences,window_size=3): 
    
    '''Function which iteratevely generate batches through encoded sequence'''
    
    batchesX=[]
    batchesY=[]
    for value in enc_sequences:
        batchX , batchY = generate_batch(value,window_size)
        if batchX is not None:
            batchesX.append(np.array(batchX,dtype=np.int32))
            batchesY.append(np.array(batchY,dtype=np.int32))
            
    return (batchesX,batchesY)
    
        
              
            
    
        
    
    
    




In [19]:
batchesX,batchesY=generate_batches(encoded)

In [20]:
def gen_batch(batchesX,batchesY,batch_size=256):
    
    '''Batch generator in order to save some computation time'''
    batches=generate_empty_2D_batch_array()
    for batch in zip(batchesX,batchesY):
        for i in range(len(batch[0])):
            X_sample = batch[0][i] 
            Y_sample = batch[1][i]
            one_batch = np.array([[X_sample,Y_sample]])
            batches=np.append(batches,one_batch,axis=0)
            if len(batches)==batch_size:
                yield batches
                batches=generate_empty_2D_batch_array()
        
     
        
            
    

In [21]:
def generate_empty_2D_batch_array():
    ''' Name of function is self-explanatory'''
    
    arr = np.array([])
    arr = arr.reshape(-1,2)
    return arr

In [32]:

class Word2Vec:
    
    def __init__(self,batch_size,word_to_int,embedding_size,learning_rate,training_steps):
        
        self.batch_size=batch_size
        self.vocab_len=len(word_to_int)
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        self.training_steps=training_steps
       
        
    def train_word2vec(self):
        with tf.name_scope("data"):
            center_words = tf.placeholder(dtype=tf.int32,shape=[self.batch_size],name="center_words")
            target_words = tf.placeholder(dtype=tf.int32,shape=[self.batch_size,1],name="target_words")
        with tf.name_scope("embed"):
            embedding_matrix = tf.Variable(tf.random_uniform([self.vocab_len,self.embedding_size],-1.0,1.0),name="embedding_matrix")
            
        with tf.name_scope("loss"):
            embedding = tf.nn.embedding_lookup(embedding_matrix,center_words,name="embed")
            nce_weights = tf.Variable(tf.truncated_normal([self.vocab_len,self.embedding_size],stddev=1/(self.embedding_size**0.5)),
                                      name="nce_weights")
            nce_biases = tf.Variable(tf.zeros([self.vocab_len]),name="nce_biases")
            
            loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, 
                                            biases=nce_biases, 
                                            labels=target_words, 
                                            inputs=embedding, 
                                            num_sampled=64, 
                                            num_classes=self.vocab_len), name='loss')
                                     
                                     
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(loss)
        
        
        
        
        batch_gen = gen_batch(batchesX,batchesY,self.batch_size)
        average_loss=0.0
        
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(self.training_steps):
                batch = next(batch_gen)
                center_word = batch[:,1]
                target_word = batch[:,0]
                loss_get,_ = sess.run([loss,optimizer],feed_dict={center_words:center_word,
                                                              target_words:target_word})
                average_loss+=loss_get
                
                if (i+1) % 100==0:
                    print("Average loss at timestep {} is {:5.1f} ".format(i+1,
                                                                           average_loss/i+1))
                          

In [30]:
model = Word2Vec(batch_size=256,embedding_size=300,word_to_int=word_to_index,learning_rate=0.001,training_steps=1000)

In [31]:
model.train_word2vec()

ValueError: Cannot feed value of shape (256, 6) for Tensor 'data_2/target_words:0', which has shape '(256, 1)'

In [None]:
batches

In [34]:
gen=gen_batch(batchesX,batchesY)

In [35]:
val=next(gen)

In [56]:
np.array(val[:,0].tolist())

array([[ -1,  -1,  -1,   1,   2,   3],
       [ -1,  -1,  -1,   2,   3,   4],
       [ -1,  -1,  -1,   3,   4,   5],
       ..., 
       [ 38, 175, 176,  13, 177, 178],
       [175, 176,  38, 177, 178, 112],
       [176,  38,  13, 178, 112, 179]], dtype=int32)

In [86]:
assert all(x.shape == (6,) for x in val[:,0])