In [1]:
import numpy as np
from __future__ import division

# Path of the pre-trained word-embedding text file read by loadembeddings():
# one word per line followed by its space-separated vector components.
filename = 'glove.6B.50d.txt'
# (GloVe data set from: https://nlp.stanford.edu/projects/glove/)

# Alternative embedding file; uncomment to use it instead of GloVe.
#filename = 'numberbatch-en.txt'
#(https://github.com/commonsense/conceptnet-numberbatch)

def loadembeddings(filename):
    """Read a space-delimited word-embedding text file.

    Every non-blank line is split on single spaces; the first token is the
    word and the remaining tokens are its vector components.

    Args:
        filename: path of the embedding file to read.

    Returns:
        A ``(vocab, embd)`` tuple. ``vocab`` is the list of words in file
        order; ``embd`` is the parallel list of vectors, each one a list of
        the *string* tokens found on the line (callers convert to floats).

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    vocab = []
    embd = []
    # `with` closes the handle even if reading fails part-way, and iterating
    # the handle streams the file instead of materialising every line at once.
    with open(filename, 'r') as embedding_file:
        for line in embedding_file:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise add an empty word and an empty
                # vector, making the rows ragged.
                continue
            row = stripped.split(' ')
            vocab.append(row[0])
            embd.append(row[1:])
    print('Word vector embeddings Loaded.')
    return vocab, embd

# Pre-trained word embedding
vocab, embd = loadembeddings(filename)

word_vec_dim = len(embd[0])  # word_vec_dim = dimension of each word vector

# Small constant offset added to copies of the 'unk' / 'eos' vectors so the
# special tokens get vectors that differ from those two ordinary words.
e = np.zeros((word_vec_dim,), np.float32) + 0.0001

# Convert the pre-trained rows (lists of numeric strings) to float32 once.
# Building the final matrix from a mixed list of string rows and float arrays
# would go through a much larger intermediate string array.
_pretrained = np.asarray(embd, np.float32)

# vocab.index raises ValueError if the embedding file has no 'unk' / 'eos' word.
embdunk = _pretrained[vocab.index('unk')] + e   # vector for <UNK>
embdeos = _pretrained[vocab.index('eos')] + e   # vector for <EOS>
embdpad = np.zeros(word_vec_dim, np.float32)    # vector for <PAD> (all zeros)

# flag1 / flag2 report whether an existing pre-trained row is exactly equal to
# the new <UNK> / <EOS> vector (checked for all rows at once).
flag1 = int(np.all(_pretrained == embdunk, axis=1).any())
flag2 = int(np.all(_pretrained == embdeos, axis=1).any())

if flag1:
    print("FLAG1")
if flag2:
    print("FLAG2")

# The original per-row loop left its loop variable `vec` bound to the last
# pre-trained row, and the save cell at the end of the notebook still reads
# that name, so keep it bound to the same value.
vec = embd[-1]

# Always add the three special tokens to BOTH vocab and embd, in the same
# order. Skipping a vector on a collision (as the flags used to do) while
# still adding its word would shift every later row, so vocab.index(word)
# would no longer select that word's vector.
vocab.append('<UNK>')  #<UNK> represents unknown word
vocab.append('<EOS>')  #<EOS> represents end of sentence
vocab.append('<PAD>')  #<PAD> represents paddings
embd.append(embdunk)
embd.append(embdeos)
embd.append(embdpad)

# Final lookup matrix: row k is the vector of vocab[k].
embedding = np.vstack([_pretrained, embdunk, embdeos, embdpad]).astype(np.float32)
del _pretrained

Word vector embeddings Loaded.


In [2]:
# Lookup cache for word2vec: the vocab list the index was built from, that
# list's length at build time, and the word -> row-index mapping itself.
_word2vec_cache = {'vocab': None, 'size': -1, 'index': {}}


def word2vec(word):  # converts a given word into its vector representation
    """Return the embedding row for ``word``, or the ``'<UNK>'`` row if unknown.

    Lookups go through a dict built from the module-level ``vocab`` list
    instead of scanning that list twice (``in`` then ``.index``) per call.
    The dict is rebuilt whenever ``vocab`` is rebound to another list or its
    length changes; replacing an entry in place without changing the length
    is not detected.

    Args:
        word: the token to look up (must be hashable).

    Returns:
        ``embedding[k]`` where ``k`` is the first position of ``word`` in
        ``vocab``, or the first position of ``'<UNK>'`` when ``word`` is absent.

    Raises:
        ValueError: if ``word`` is unknown and ``'<UNK>'`` is not in ``vocab``.
    """
    cache = _word2vec_cache
    if cache['vocab'] is not vocab or cache['size'] != len(vocab):
        index = {}
        for position, known_word in enumerate(vocab):
            # setdefault keeps the first occurrence, matching list.index().
            index.setdefault(known_word, position)
        cache['vocab'] = vocab
        cache['size'] = len(vocab)
        cache['index'] = index

    index = cache['index']
    row = index.get(word)
    if row is None:
        row = index.get('<UNK>')
        if row is None:
            # Same exception type list.index() raised for a missing '<UNK>'.
            raise ValueError("'<UNK>' is not in vocab")
    return embedding[row]


In [3]:
def most_similar_cosine(x):
    """Rank every row of ``embedding`` by cosine similarity to ``x``.

    Args:
        x: 1-D query vector with as many components as a row of the
            module-level ``embedding`` matrix.

    Returns:
        1-D array of row indices into ``embedding``, most similar first.
        Rows whose cosine is undefined because a norm is zero (the all-zero
        padding row, or every row when ``x`` itself is all zeros) are given a
        similarity of ``-inf`` so they rank last. Left as 0/0 = NaN they
        would sort to the end of ``argsort`` and, after the flip, come out
        as the *most* similar entries.
    """
    embed = embedding
    xdoty = np.sum(np.multiply(embed, x), 1)        # row-wise dot products
    xlen = np.sqrt(np.sum(np.square(x), 0))         # ||x||
    ylen = np.sqrt(np.sum(np.square(embed), 1))     # ||row|| for every row
    xlenylen = np.multiply(xlen, ylen)

    undefined = xlenylen == 0
    # Divide by 1 where the denominator is zero (avoids the 0/0 warning), then
    # overwrite those slots with -inf.
    xlenylen[undefined] = 1
    cosine_similarities = np.divide(xdoty, xlenylen)
    cosine_similarities[undefined] = -np.inf

    return np.flip(np.argsort(cosine_similarities), 0)

def most_similar_eucli(x):
    """Rank every row of ``embedding`` by Euclidean distance to ``x``.

    Args:
        x: query vector, broadcast against the rows of the module-level
            ``embedding`` matrix.

    Returns:
        1-D array of row indices into ``embedding``, nearest row first.
    """
    offsets = np.subtract(embedding, x)
    distances = np.sqrt(np.sum(np.square(offsets), 1))
    return np.argsort(distances)

# Sanity check of the embedding: list the nearest neighbours of one word.
word = 'frog'

# Row indices of `embedding`, nearest (Euclidean) first; index 0 is the word itself.
most_similars = most_similar_eucli(word2vec(word))

# Single-argument print(...) behaves the same on Python 2 and 3.
print("TOP TEN MOST SIMILAR WORDS TO '"+str(word)+"':\n")
# min(...) keeps the loop in range if the vocabulary has fewer than ten words.
for i in range(min(10, len(most_similars))):
    print(str(i+1)+". "+str(vocab[most_similars[i]]))
    

TOP TEN MOST SIMILAR WORDS TO 'frog':

1. frog
2. snake
3. ape
4. toad
5. monkey
6. spider
7. lizard
8. tarantula
9. cat
10. spiny


In [4]:
def vec2word(vec):   # converts a given vector representation into the represented word
    """Return the vocabulary word whose embedding is nearest to ``vec``.

    ``vec`` is converted to a float32 array and ranked with
    ``most_similar_eucli``; the word at the top-ranked index is returned.
    """
    query = np.asarray(vec, np.float32)
    nearest_first = most_similar_eucli(query)
    return vocab[nearest_first[0]]

In [5]:
import csv
import nltk
from nltk import word_tokenize
import string

# Parallel lists filled by the CSV loading loop below: summaries[k] holds the
# tokens of the cleaned summary belonging to the cleaned review in texts[k].
summaries = []
texts = []

# Characters that survive clean(): everything in string.printable except
# punctuation. Built once instead of on every call.
_CLEAN_KEEP = frozenset(string.printable) - frozenset(string.punctuation)


def clean(text):
    """Lower-case ``text`` and drop non-printable and punctuation characters.

    Produces the same result as the earlier ``filter`` + two-argument
    ``str.translate`` version on Python 2 byte strings, but also works on
    unicode / Python 3 strings, where that form raises ``TypeError``.

    Args:
        text: the raw string to normalise.

    Returns:
        The lower-cased string containing only characters that are in
        ``string.printable`` and not in ``string.punctuation``.
    """
    return ''.join(ch for ch in text.lower() if ch in _CLEAN_KEEP)

# Upper bound on the number of reviews read from the CSV.
max_data = 100000
# Binary mode is what the Python 2 csv module expects for its input file.
with open('Reviews.csv', 'rb') as csvfile: #Data from https://www.kaggle.com/snap/amazon-fine-food-reviews
    Reviews = csv.DictReader(csvfile)
    count=0
    for row in Reviews:
        if count >= max_data:
            # Enough rows collected: stop instead of parsing the rest of the
            # file only to discard every remaining row.
            break
        clean_text = clean(row['Text'])
        clean_summary = clean(row['Summary'])
        # Appended together so summaries[k] always belongs to texts[k].
        summaries.append(word_tokenize(clean_summary))
        texts.append(word_tokenize(clean_text))
        count+=1

In [6]:
# Length limits (in tokens) for a review and for its summary.
max_len_text = 80
max_len_sum = 4

texts_v2 = []
summaries_v2 = []

# Walk the two parallel lists together instead of tracking a manual index.
for text, summary in zip(texts, summaries):
    if len(text) <= max_len_text and len(summary) <= max_len_sum:
        #remove data pairs with review length more than max_len_text
        #or summary length more than max_len_sum
        texts_v2.append(text)
        summaries_v2.append(summary)

# Single-argument print(...) behaves the same on Python 2 and 3.
print("Current size of data: "+str(len(texts_v2)))

Current size of data: 48478


In [8]:
# Set copy of the vocabulary: membership tests on a set are constant time,
# whereas `word not in vocab` scans the whole vocabulary list for every word.
_embedded_words = set(vocab)

texts_v3 = []
summaries_v3 = []

for text, summary in zip(texts_v2, summaries_v2):
    #Remove summary and its corresponding text
    #if out of vocabulary word present in summary
    if all(word in _embedded_words for word in summary):
        summaries_v3.append(summary)
        texts_v3.append(text)

# Single-argument print(...) behaves the same on Python 2 and 3.
print("Current size of data: "+str(len(texts_v3)))

Current size of data: 44413


In [9]:
#REDUCE DATA (FOR SPEEDING UP THE NEXT STEPS)
# Keep only the first MAXIMUM_DATA_NUM filtered pairs. Both lists are cut at
# the same position, so summaries[k] still belongs to texts[k].
# Note that this rebinds `texts` / `summaries`, which until here held the
# unfiltered data.

MAXIMUM_DATA_NUM = 20000

texts = texts_v3[:MAXIMUM_DATA_NUM]
summaries = summaries_v3[:MAXIMUM_DATA_NUM]

In [10]:
import random

# Show one randomly chosen (text, summary) pair as a spot check of the
# cleaning and tokenizing steps. No seed is set, so the sample differs per run.
index = random.randint(0, len(texts) - 1)

print("SAMPLE CLEANED & TOKENIZED TEXT: \n\n" + str(texts[index]))
print("\nSAMPLE CLEANED & TOKENIZED SUMMARY: \n\n" + str(summaries[index]))

SAMPLE CLEANED & TOKENIZED TEXT: 

['our', 'boston', 'terrier', 'loves', 'these', 'bones', 'we', 'give', 'them', 'to', 'her', 'as', 'a', 'treat', 'or', 'to', 'keep', 'her', 'busy', 'when', 'we', 'have', 'company', 'for', 'a', '16', 'lbs', 'dog', 'shes', 'a', 'mighty', 'chewer', 'and', 'these', 'last', 'her', 'a', 'couple', 'of', 'hours', 'with', 'breaks', 'to', 'investigate', 'if', 'shes', 'missing', 'anything', 'well', 'buy', 'more', 'of', 'these']

SAMPLE CLEANED & TOKENIZED SUMMARY: 

['chloe', 'loves', 'them']


In [11]:
# vocab_limit: the words that occur in the data AND have a pre-trained
# embedding, in order of first appearance.
vocab_limit = []

# Set companions make each membership test constant time; testing against the
# lists themselves re-scans them for every token of every text.
_limit_words = set()
_embedded_words = set(vocab)

for text in texts:
    for word in text:
        if word not in _limit_words and word in _embedded_words:
            _limit_words.add(word)
            vocab_limit.append(word)

In [12]:
# Extend vocab_limit with summary words not already in it, keeping
# first-appearance order. The sets are built here from the current lists so
# each membership test is constant time instead of a list scan.
_limit_words = set(vocab_limit)
_embedded_words = set(vocab)

for summary in summaries:
    for word in summary:
        if word not in _limit_words and word in _embedded_words:
            _limit_words.add(word)
            vocab_limit.append(word)

In [13]:
# Add the special tokens to the limited vocabulary, in this fixed order.
# The membership guard makes the cell safe to re-run: without it every re-run
# would append the three tokens again and grow vocab_limit.
for special_token in ('<EOS>', '<UNK>', '<PAD>'):
    if special_token not in vocab_limit:
        vocab_limit.append(special_token)

In [14]:
# Token count of every text, in the same order as `texts`.
lentexts = [len(text) for text in texts]

sortedindex = np.argsort(lentexts)
#sort indexes according to the sequence length of corresponding texts.
#sort indexes according to the sequence length of corresponding texts. 

In [15]:
batch_size = 50

# batches_x[b] / batches_y[b] hold the texts / summaries of batch b; both use
# the same indices, so the pairs stay aligned.
batches_x = []
batches_y = []

# Walk the length-sorted order in strides of batch_size and emit one batch per
# stride. Only full batches are produced: the stop value excludes a trailing
# remainder shorter than batch_size. Unlike a flush-at-the-top-of-the-loop
# scheme, the last full batch is emitted too, so no samples are lost when the
# data size is a multiple of batch_size.
for start in range(0, len(texts) - batch_size + 1, batch_size):
    batch_order = sortedindex[start:start + batch_size]
    batch_x = [texts[int(k)] for k in batch_order]
    batch_y = [summaries[int(k)] for k in batch_order]
    batches_x.append(batch_x)
    batches_y.append(batch_y)
    

In [16]:
import math

# vec_batches_x[b]    : float32 array (texts in batch, padded length, word_vec_dim)
# vec_batches_x_pe[b] : same, with a positional encoding added to every real
#                       (non-padding) token vector.
vec_batches_x = []
vec_batches_x_pe = []

# The positional encoding depends only on the token position, so each one is
# computed once and reused rather than rebuilt for every token.
_pe_by_position = {}


def _positional_encoding(pos):
    """Return the float32 positional-encoding vector for token position ``pos``.

    Component k is sin(pos / 10000 ** (2k / word_vec_dim)).
    """
    if pos not in _pe_by_position:
        pe = np.zeros((word_vec_dim,), np.float32)
        for k in range(word_vec_dim):
            # 2.0 forces true division even without `from __future__ import division`.
            pe[k] = math.sin(pos / math.pow(10000, (2.0 * k / word_vec_dim)))
        _pe_by_position[pos] = pe
    return _pe_by_position[pos]


for batch in batches_x:

    # Longest text in the batch; every text is padded up to this length.
    # Computed explicitly rather than read from a fixed position in the batch.
    max_len_x = max(len(text) for text in batch)
    pad_vec = word2vec('<PAD>')

    vec_texts = []
    vec_texts_pe = []

    for text in batch:

        vec_text = []
        vec_text_pe = []

        for pos, word in enumerate(text):
            word_vec = word2vec(word)  # looked up once, used for both variants
            vec_text.append(word_vec)
            vec_text_pe.append(np.asarray(word_vec, np.float32) + _positional_encoding(pos))

        # Padding positions get the plain <PAD> vector in both variants
        # (no positional encoding is added to padding).
        padding = [pad_vec] * (max_len_x - len(vec_text))
        vec_text.extend(padding)
        vec_text_pe.extend(padding)

        vec_texts.append(vec_text)
        vec_texts_pe.append(vec_text_pe)

    vec_batches_x.append(np.asarray(vec_texts, np.float32))
    vec_batches_x_pe.append(np.asarray(vec_texts_pe, np.float32))
    

In [17]:
# vec_batches_y[b] is a float32 array holding, for every summary of batch b,
# its word vectors followed by the <EOS> vector and then <PAD> vectors up to
# max_len_sum + 1 rows.
vec_batches_y = []

for batch in batches_y:

    max_len_y = max_len_sum+1
    vec_summaries = []

    for summary in batch:

        vec_summary = [word2vec(word) for word in summary]
        vec_summary.append(word2vec('<EOS>'))

        # Fill the remaining slots (if any) with padding vectors.
        for _ in range(max_len_y - len(vec_summary)):
            vec_summary.append(word2vec('<PAD>'))

        vec_summaries.append(vec_summary)

    vec_batches_y.append(np.asarray(vec_summaries,np.float32))

In [18]:
#Saving processed data in another file.

import pickle

# Everything written to disk, in this fixed order.
# NOTE(review): `vec` is not assigned anywhere in this cell; it is whatever
# value an earlier cell left bound to that name. Confirm that this sixth
# element is really meant to be saved before relying on it.
PICK = [
    vocab_limit,
    batch_size,
    vec_batches_x,
    vec_batches_y,
    vec_batches_x_pe,
    vec,
]

with open('AmazonPICKLE', 'wb') as fp:
    pickle.dump(PICK, fp)