In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

In [2]:
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

In [3]:
# NLTK and its stop-word corpus reader
import nltk
from nltk.corpus import stopwords

# English stop words, stored as a set.
stop_words = set(stopwords.words('english'))

In [4]:
# tokenizing
from nltk import word_tokenize,sent_tokenize

In [5]:
#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

Using TensorFlow backend.


In [6]:
# Toy corpus: three short sentences, one per document.
corp = [
    "bitty bought a bit of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
]

# Keep the individual sentences available under their own names as well.
sample_text_1, sample_text_2, sample_text_3 = corp

# Number of documents in the corpus.
no_docs = len(corp)

In [7]:
# Integer-encode every document with Keras' hashing-based `one_hot`.
# `vocab_size` is the size of the hash space; the same variable is later used
# as the Embedding layer's `input_dim`, so it is passed here by name instead
# of repeating the literal (the two must stay in sync).
# NOTE: different words can receive the same index -- in the recorded output
# below, the first word of document 1 and the fourth word of document 3 are
# both encoded as 45.
vocab_size = 50
encod_corp = []
for i, doc in enumerate(corp):
    # Encode once and reuse the result for both storing and printing.
    encoded_doc = one_hot(doc, vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

The encoding for document 1  is :  [45, 10, 30, 41, 14, 34]
The encoding for document 2  is :  [17, 13, 41, 14, 34, 21, 30, 41, 22]
The encoding for document 3  is :  [24, 4, 10, 45, 46, 34, 8, 4, 13, 22, 34, 46]


In [8]:
# Length of the longest document; every sequence is padded to this length.
# It is measured on the encoded sequences themselves (the exact data that
# pad_sequences receives) rather than on a separate NLTK tokenization, so the
# padding length can never be shorter than a sequence and cause truncation.
# `default=-1` keeps the previous result for an empty corpus.
maxlen = max((len(encoded_doc) for encoded_doc in encod_corp), default=-1)
print("The maximum number of words in any document is : ",maxlen)

The maximum number of words in any document is :  12


In [9]:
# To build embeddings, all documents must have the same length, so each
# encoded document is padded at the end ('post') with zeros up to `maxlen`.
pad_corp = pad_sequences(
    encod_corp,
    maxlen=maxlen,
    padding='post',
    value=0.0,
)
print("No of padded documents: ", len(pad_corp))

No of padded documents:  3


In [10]:
# Show each padded sequence, numbering the documents from 1.
for doc_no, padded_doc in enumerate(pad_corp, start=1):
    print("The padded encoding for document", doc_no, " is : ", padded_doc)

The padded encoding for document 1  is :  [45 10 30 41 14 34  0  0  0  0  0  0]
The padded encoding for document 2  is :  [17 13 41 14 34 21 30 41 22  0  0  0]
The padded encoding for document 3  is :  [24  4 10 45 46 34  8  4 13 22 34 46]


In [11]:
# Symbolic Keras input whose per-sample shape is (no_docs, maxlen).
# NOTE(review): the name `input` shadows Python's built-in input(), and no
# other cell visible here refers to this tensor -- confirm it is still needed.
input = Input(
    shape=(no_docs, maxlen),
    dtype='float64',
)

In [12]:
# Shape of the input:
# each document has `maxlen` elements (words) -- 12 for this corpus.
# The elements are word indices, so the input is declared with an integer
# dtype, which is what an Embedding lookup consumes.
word_input = Input(shape=(maxlen,), dtype='int32')

# Creating the embedding: one 8-dimensional vector per word position, looked
# up from a table with `vocab_size` rows.
word_embedding = Embedding(input_dim=vocab_size, output_dim=8, input_length=maxlen)(word_input)

# Flatten the (maxlen, 8) block of each document into a single vector.
word_vec = Flatten()(word_embedding)

# Combine everything into a Keras model: padded indices in, flat vectors out.
embed_model = Model([word_input], word_vec)

'\nshape of input. \neach document has 12 element or words which is the value of our maxlen variable.\n\n'

In [13]:
# Compile the model: Adam optimizer (learning rate 1e-3), binary
# cross-entropy loss and accuracy as the reported metric.
# These parameters can be tuned as always.
embed_model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [14]:
# Inspect the embedding layer's symbolic output: its Python type on the
# first line, the tensor itself on the second.
print(type(word_embedding), word_embedding, sep="\n")

<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(?, 12, 8), dtype=float32)


In [15]:
# Summary of the model. `summary()` prints the table itself and returns
# None, so it is called directly; wrapping it in print() only appended a
# stray "None" line to the output.
embed_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 12)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 12, 8)             400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 96)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Feed the padded documents through the model to obtain the embeddings:
# one flattened row of word vectors per document.
embeddings = embed_model.predict(pad_corp)




In [17]:
# Report the shape of the flattened embeddings, then the values themselves.
print("Shape of embeddings : ", embeddings.shape)
print(embeddings)

Shape of embeddings :  (3, 96)
[[-0.0260285  -0.02942736 -0.0190503   0.01000122 -0.04945574  0.01387862
   0.01306662  0.02715177 -0.02688781  0.02109814 -0.00466151 -0.04597318
  -0.04593098 -0.01970681  0.00707368  0.04694458 -0.01870829  0.03552436
  -0.0456105  -0.00230848 -0.04606737 -0.00733572  0.02697546 -0.0380674
  -0.00057902 -0.01489658 -0.02773886  0.00620048 -0.04539012 -0.02591311
   0.04654991  0.0150856  -0.00275113 -0.01618331 -0.00174323 -0.027293
   0.04865906  0.04321125  0.00504465 -0.03549979 -0.00733699  0.01378724
   0.01717141  0.0188974   0.03243555  0.0045498  -0.04819179 -0.03433678
   0.04971239 -0.0002454  -0.04866038 -0.02172011  0.04668837  0.04231835
   0.04568959 -0.02697194  0.04971239 -0.0002454  -0.04866038 -0.02172011
   0.04668837  0.04231835  0.04568959 -0.02697194  0.04971239 -0.0002454
  -0.04866038 -0.02172011  0.04668837  0.04231835  0.04568959 -0.02697194
   0.04971239 -0.0002454  -0.04866038 -0.02172011  0.04668837  0.04231835
   0.045689

In [18]:
# Un-flatten to (documents, words per document, embedding size).
# The last axis is inferred (-1) instead of hard-coding the embedding size,
# so this cell keeps working if the Embedding layer's output_dim is changed.
# Re-running the cell on the already reshaped array is a no-op.
embeddings = embeddings.reshape(embeddings.shape[0], maxlen, -1)
print("Shape of embeddings : ",embeddings.shape)
print(embeddings)

Shape of embeddings :  (3, 12, 8)
[[[-0.0260285  -0.02942736 -0.0190503   0.01000122 -0.04945574
    0.01387862  0.01306662  0.02715177]
  [-0.02688781  0.02109814 -0.00466151 -0.04597318 -0.04593098
   -0.01970681  0.00707368  0.04694458]
  [-0.01870829  0.03552436 -0.0456105  -0.00230848 -0.04606737
   -0.00733572  0.02697546 -0.0380674 ]
  [-0.00057902 -0.01489658 -0.02773886  0.00620048 -0.04539012
   -0.02591311  0.04654991  0.0150856 ]
  [-0.00275113 -0.01618331 -0.00174323 -0.027293    0.04865906
    0.04321125  0.00504465 -0.03549979]
  [-0.00733699  0.01378724  0.01717141  0.0188974   0.03243555
    0.0045498  -0.04819179 -0.03433678]
  [ 0.04971239 -0.0002454  -0.04866038 -0.02172011  0.04668837
    0.04231835  0.04568959 -0.02697194]
  [ 0.04971239 -0.0002454  -0.04866038 -0.02172011  0.04668837
    0.04231835  0.04568959 -0.02697194]
  [ 0.04971239 -0.0002454  -0.04866038 -0.02172011  0.04668837
    0.04231835  0.04568959 -0.02697194]
  [ 0.04971239 -0.0002454  -0.04866038 

In [19]:
# Walk through every document and every word position (both numbered from 1)
# and print the vector stored for that position.
for doc_no, doc_vectors in enumerate(embeddings, start=1):
    for word_no, word_vector in enumerate(doc_vectors, start=1):
        print("The encoding for ", word_no, "th word", "in", doc_no, "th document is : \n\n", word_vector)

The encoding for  1 th word in 1 th document is : 

 [-0.0260285  -0.02942736 -0.0190503   0.01000122 -0.04945574  0.01387862
  0.01306662  0.02715177]
The encoding for  2 th word in 1 th document is : 

 [-0.02688781  0.02109814 -0.00466151 -0.04597318 -0.04593098 -0.01970681
  0.00707368  0.04694458]
The encoding for  3 th word in 1 th document is : 

 [-0.01870829  0.03552436 -0.0456105  -0.00230848 -0.04606737 -0.00733572
  0.02697546 -0.0380674 ]
The encoding for  4 th word in 1 th document is : 

 [-0.00057902 -0.01489658 -0.02773886  0.00620048 -0.04539012 -0.02591311
  0.04654991  0.0150856 ]
The encoding for  5 th word in 1 th document is : 

 [-0.00275113 -0.01618331 -0.00174323 -0.027293    0.04865906  0.04321125
  0.00504465 -0.03549979]
The encoding for  6 th word in 1 th document is : 

 [-0.00733699  0.01378724  0.01717141  0.0188974   0.03243555  0.0045498
 -0.04819179 -0.03433678]
The encoding for  7 th word in 1 th document is : 

 [ 0.04971239 -0.0002454  -0.04866038