In [1]:
from Vocabscraper import VocabularyScraper
import os
import pickle
import pandas
from copy import deepcopy
from collections import Counter
import re
import nltk
from  nltk.tokenize import word_tokenize as tokenizer
import itertools
import string
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from nltk import ngrams


Using TensorFlow backend.


In [20]:
def load_data():
    
    ''' Helper function to load dataset '''
    data=None
    FILE_PATH= "words.pkl"
    if not os.path.exists(FILE_PATH):
        cursor = VocabularyScraper(None).db_executer
        cursor.execute("SELECT term,definitions,examples FROM terms")
        data = cursor.fetchall()
        with open(FILE_PATH,"wb") as file:
            pickle.dump(data,file)
    else:
        with open(FILE_PATH,"rb") as file:
            data = pickle.load(file)
    return data



In [49]:
def look_up_word(term,data):
    '''Name of the function is self-explanatory '''
        
    for word in data:
        if word[0]==term:
            return word
    return None    

In [71]:
def find_optimum_length(defs_exs,percentiles):
    
    '''Name of the function is self-explanatory '''
    
    stats = [len(ex) for ex in defs_exs]
    df= pandas.DataFrame(stats,columns=["length of samples"])
    return df.describe(percentiles=percentiles)
    
    
    
    

In [41]:
def ex_remove_red_chars(*examples):
    
    ''' Function to remove redundant chars , like punctuation or part of speech markers
        Then to split them using "#$" or "&#" delimiters'''
    
    result=[]
    for example in examples:
        if example is not None:
            punctuation=re.sub("[#$&]*","",string.punctuation)
            punctuation = [char for char in punctuation]
            del_patterns = ["&[\w]+&",'”',"^[\s]","“","adj\n","v\n","n\n","adv\n"]
            del_patterns.extend(punctuation)
            for pattern in del_patterns:
                example = example.replace(pattern,"")
            example = re.split(r"[&#$]*;*",example) 
            
            result.append(example)
        else:
            result.append("")
    return tuple(result)


In [42]:
def process_duplicates(data):
    ''' Function to avoid storing unneccessary duplicates'''
    
    terms= set()
    defs_exs=set()
   
    
    for word in data:
        terms.add(word[0])
        definitions,examples = word[1],word[2]
        definitions,examples =  ex_remove_red_chars(definitions,examples)
        defs_exs.update(definitions,examples)
        
    return(terms,defs_exs)    


In [35]:
terms , defs_exs = process_duplicates(words)


  return _compile(pattern, flags).split(string, maxsplit)


In [57]:
def create_lookup_dicts(terms,defs_and_exs):
    
    ''' Function to create helping dictionaries for future word-to-integers (or conversely) conversion'''
    
    all_text = list(itertools.chain(*[sent.split() for sent in list(defs_exs)]))
    counter=Counter(all_text)
    index=0
    word_to_index=dict()
    
    for key,indx in counter.items():
        word_to_index[key]=index
        index+=1
        
    #Terms should be treated differently,avoiding split,that's why we add them seperately
    
    for word in terms:
        if word not in word_to_index:
            word_to_index[word]=index
            index+=1
            
    index_to_word=dict()
    
    for key,index in word_to_index.items():
        index_to_word[index]=key
    return (word_to_index,index_to_word)    
            

In [59]:
word_to_index,index_to_word = create_lookup_dicts(terms,defs_exs)

In [52]:
look_up_word("person",words)

('person',
 'a human body (usually including the clothing &$ A single human being; an individual &$ A linguistic category used to distinguish between the speaker of an utterance and those to whom or about whom he is speaking. See grammatical person &$ one of three relations or conditions (that of speaking, that of being spoken to, and that of being spoken of) pertaining to a noun or a pronoun, and thence also to the verb of which it may be the subject &$ a living, self-conscious being, as distinct from an animal or a thing; a moral agent; a human being; a man, woman, or child &$ Someone who likes or has an affinity for (a specified thing &$ a character or part, as in a play; a specific kind or manifestation of individual character, whether in real life, or in literary or dramatic representation; an assumed character &$ the bodily form of a human being; body; outward appearance; as, of comely person &$ a grammatical category used in the classification of pronouns, possessive determiners

In [61]:
def encode(defs_exs,word2int):
    '''Function to encode words to integers'''
    sentences=[]
    for example in defs_exs:
        sentence=[]
        for word in example.split():
            sentence.append(word2int[word])
        sentences.append(sentence) 
    encoded=np.array(sentences)
    return encoded
        

In [62]:
encoded=encode(defs_exs,word_to_index)

In [63]:
encoded

array([[], [0, 1, 2, 3], [4, 5, 6, 7, 8, 9, 10, 11], ...,
       [238, 13, 16695, 2048, 163, 244, 13, 18, 6117, 163, 3185, 183800],
       [9998, 9, 5047, 34, 7182, 14323], [71499, 79241, 2517]], dtype=object)

In [74]:
def skipgrams(sequence, n, k):
    ''' Can be deleted , was used for testing'''
    for ngram in ngrams(sequence, n + k, pad_right=True):
        head = ngram[:1]
        tail = ngram[1:]
        for skip_tail in itertools.combinations(tail, n - 1):
            if skip_tail[-1] is None:
                continue
            yield head + skip_tail

In [174]:
encoded[:10]

array([[], [0, 1, 2, 3], [4, 5, 6, 7, 8, 9, 10, 11],
       [12, 13, 14, 15, 13, 16, 17, 13, 18, 19, 20, 21, 22, 23, 24, 25],
       [26, 27, 28, 29, 30, 31, 14, 32, 33, 34, 35],
       [4, 36, 37, 38, 14, 39, 40, 26, 41, 14, 42], [43, 9, 44, 14, 45],
       [27, 46, 47, 48, 13, 49, 50],
       [51, 14, 52, 53, 34, 54, 14, 55, 9, 56, 57], [58, 59, 60, 18, 61]], dtype=object)

In [188]:

def generate_batch(sequence,window_size,filler=-1):
    
    '''Function which generates a set of batches from int-like sequence
    default argument filler (-1) could be changed accoring to your needs'''
    
    batchesX = []
    batchesY = []
    
    if len(sequence)>window_size: 
        
        for index,num in enumerate(sequence):
            range1 = sequence[index-window_size:index]
            range2 = sequence[index+1:index+window_size+1]

            if len(range1) is not window_size:
                range1 = [filler]*(window_size-len(range1)) + range1

            if len(range2) is not window_size:
                range2 = range2 + [filler]*(window_size-len(range2))

            batchesX.append(range1+range2)
            batchesY.append(num)
            
    else:
        
        return None,None
        
    return (batchesX,batchesY)



                   


In [230]:
def get_batches(enc_sequence,window_size=3): 
    
    '''Function which iteratevely generate batches through encoded sequence'''
    
    batchesX=[]
    batchesY=[]
    
    for value in enc_sequence:
       
        batchX , batchY = generate_batch(value,window_size)
        
        
        if batchX is not None:
            batchesX.append(np.array(batchX))
            batchesY.append(np.array(batchY))
            
    return (np.array(batchesX),np.array(batchesY))
    
        
              
            
    
        
    
    
    




In [231]:
batchesX,batchesY=get_batches(encoded)

In [247]:
batchesX

array([ array([[-1, -1, -1,  1,  2,  3],
       [-1, -1, -1,  2,  3, -1],
       [-1, -1, -1,  3, -1, -1],
       [ 0,  1,  2, -1, -1, -1]]),
       array([[-1, -1, -1,  5,  6,  7],
       [-1, -1, -1,  6,  7,  8],
       [-1, -1, -1,  7,  8,  9],
       [ 4,  5,  6,  8,  9, 10],
       [ 5,  6,  7,  9, 10, 11],
       [ 6,  7,  8, 10, 11, -1],
       [ 7,  8,  9, 11, -1, -1],
       [ 8,  9, 10, -1, -1, -1]]),
       array([[-1, -1, -1, 13, 14, 15],
       [-1, -1, -1, 14, 15, 13],
       [-1, -1, -1, 15, 13, 16],
       [12, 13, 14, 13, 16, 17],
       [13, 14, 15, 16, 17, 13],
       [14, 15, 13, 17, 13, 18],
       [15, 13, 16, 13, 18, 19],
       [13, 16, 17, 18, 19, 20],
       [16, 17, 13, 19, 20, 21],
       [17, 13, 18, 20, 21, 22],
       [13, 18, 19, 21, 22, 23],
       [18, 19, 20, 22, 23, 24],
       [19, 20, 21, 23, 24, 25],
       [20, 21, 22, 24, 25, -1],
       [21, 22, 23, 25, -1, -1],
       [22, 23, 24, -1, -1, -1]]),
       ...,
       array([[   -1,    -1,    -1, 

In [150]:
%timeit list(range(10)).append(list(range(10)))

1000000 loops, best of 3: 1.55 µs per loop


In [229]:

%timeit 

1 loop, best of 3: 4.14 s per loop
