In [1]:
import nltk
from nltk.corpus import brown
nltk.download('brown')

import numpy as np

#from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Model
from keras.layers import Input

from keras.losses import CosineSimilarity

from annoy import AnnoyIndex
import random

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
corpus = brown.sents()

In [3]:
max_sequence_length = 32

## The function "createDic:"
* Input: list of lists (sentences).
* Output: dictionary contians the words where each word has an index.

In [4]:
def createDic(listOflists):
    i=1
    dic={}
    for lst in listOflists:
        for word in lst:
            if word.lower() not in dic:
                dic[word.lower()] = i
                i+=1
    return dic

In [5]:
myDic = createDic(corpus)
myDic

{'the': 1,
 'fulton': 2,
 'county': 3,
 'grand': 4,
 'jury': 5,
 'said': 6,
 'friday': 7,
 'an': 8,
 'investigation': 9,
 'of': 10,
 "atlanta's": 11,
 'recent': 12,
 'primary': 13,
 'election': 14,
 'produced': 15,
 '``': 16,
 'no': 17,
 'evidence': 18,
 "''": 19,
 'that': 20,
 'any': 21,
 'irregularities': 22,
 'took': 23,
 'place': 24,
 '.': 25,
 'further': 26,
 'in': 27,
 'term-end': 28,
 'presentments': 29,
 'city': 30,
 'executive': 31,
 'committee': 32,
 ',': 33,
 'which': 34,
 'had': 35,
 'over-all': 36,
 'charge': 37,
 'deserves': 38,
 'praise': 39,
 'and': 40,
 'thanks': 41,
 'atlanta': 42,
 'for': 43,
 'manner': 44,
 'was': 45,
 'conducted': 46,
 'september-october': 47,
 'term': 48,
 'been': 49,
 'charged': 50,
 'by': 51,
 'superior': 52,
 'court': 53,
 'judge': 54,
 'durwood': 55,
 'pye': 56,
 'to': 57,
 'investigate': 58,
 'reports': 59,
 'possible': 60,
 'hard-fought': 61,
 'won': 62,
 'mayor-nominate': 63,
 'ivan': 64,
 'allen': 65,
 'jr.': 66,
 'only': 67,
 'a': 68,
 'r

In [6]:
index_word = {v: k for k, v in myDic.items()}
index_word

{1: 'the',
 2: 'fulton',
 3: 'county',
 4: 'grand',
 5: 'jury',
 6: 'said',
 7: 'friday',
 8: 'an',
 9: 'investigation',
 10: 'of',
 11: "atlanta's",
 12: 'recent',
 13: 'primary',
 14: 'election',
 15: 'produced',
 16: '``',
 17: 'no',
 18: 'evidence',
 19: "''",
 20: 'that',
 21: 'any',
 22: 'irregularities',
 23: 'took',
 24: 'place',
 25: '.',
 26: 'further',
 27: 'in',
 28: 'term-end',
 29: 'presentments',
 30: 'city',
 31: 'executive',
 32: 'committee',
 33: ',',
 34: 'which',
 35: 'had',
 36: 'over-all',
 37: 'charge',
 38: 'deserves',
 39: 'praise',
 40: 'and',
 41: 'thanks',
 42: 'atlanta',
 43: 'for',
 44: 'manner',
 45: 'was',
 46: 'conducted',
 47: 'september-october',
 48: 'term',
 49: 'been',
 50: 'charged',
 51: 'by',
 52: 'superior',
 53: 'court',
 54: 'judge',
 55: 'durwood',
 56: 'pye',
 57: 'to',
 58: 'investigate',
 59: 'reports',
 60: 'possible',
 61: 'hard-fought',
 62: 'won',
 63: 'mayor-nominate',
 64: 'ivan',
 65: 'allen',
 66: 'jr.',
 67: 'only',
 68: 'a',
 69

In [7]:
num_words = len(myDic)+1
num_words

49816

## The function "string_to_model_input:"
* Input: list of words.
* Output: tuple of two vectors.
 * create a list of indexes "ids" depending on the dictionary "myDic" the take the first max_sequence_length numbers.
 * remove the last index from ids list, if the length is less than max_sequence_length then put zeros.
 * remove the first index from ids list, if the length is less than max_sequence_length then put zeros.

In [8]:
def string_to_model_input(sentence):
    ids = [myDic[word.lower()] if word.lower() in myDic else 0 for word in sentence][:max_sequence_length]
    l=len(ids)
    X = ids[:-1]+[0]*(max_sequence_length-l+1)
    Y = ids[1:]+[0]*(max_sequence_length-l+1)
    return (np.array(X),np.array(Y))

## Calling the function "string_to_model_input" and preparing the data.

In [9]:
DataSet = [string_to_model_input(sentence)  for sentence in corpus]

In [10]:
inputData = np.array([tup[0] for tup in DataSet])
outputData = np.array([tup[1] for tup in DataSet])

In [11]:
inputData

array([[   1,    2,    3, ...,    0,    0,    0],
       [   1,    5,   26, ...,   30,   10,    0],
       [   1,   47,   48, ...,   51,   63,    0],
       ...,
       [   1, 4077,   10, ...,   27, 1019,    0],
       [3023,   45,   68, ...,   40,    1,    0],
       [ 255,  551,  724, ...,    0,    0,    0]])

In [12]:
outputData

array([[   2,    3,    4, ...,    0,    0,    0],
       [   5,   26,    6, ...,   10,   42,    0],
       [  47,   48,    5, ...,   63,   64,    0],
       ...,
       [4077,   10,    1, ..., 1019,  536,    0],
       [  45,   68, 1400, ...,    1, 2947,    0],
       [ 551,  724,   45, ...,    0,    0,    0]])

In [13]:
inputData.shape, outputData.shape

((57340, 32), (57340, 32))

## Define the keras model.
* An Input layer
* An Embeddings layer.
* An LSTM layer.		
* A Dense layer with a softmax activation for the output.
 * The outputs will be a list contians the output of the model and state cells of the LSTM layer. 

In [27]:
visible = Input(shape=max_sequence_length)
hidden1 = Embedding(num_words, 128, input_length=max_sequence_length)(visible)
hidden2 , state_h , state_c = LSTM(32, return_state=True)(hidden1)
output = Dense(max_sequence_length, activation='softmax')(hidden2)
model = Model(inputs=visible, outputs=[output, state_c])
model.compile(loss='CategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])
# summarize layers
print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 128)           6376448   
_________________________________________________________________
lstm_1 (LSTM)                [(None, 32), (None, 32),  20608     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
Total params: 6,398,112
Trainable params: 6,398,112
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
# fit the keras model on the dataset
model.fit(inputData, outputData, epochs=5  , batch_size=512)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x22154776be0>

In [16]:
def get_cell_state(word_id_vector):
    predictions = model.predict(word_id_vector)
    return predictions[1]

In [17]:
def predict_next_word(word_sequence):
    ids = string_to_model_input(word_sequence)
    predictions = model.predict(ids[0])
    return index_word[int(max(predictions[0][-1])*num_words)]

In [18]:
def inputField(): 
    sentence = input('Enter your sentence: ')
    lst = sentence.split(' ')
    return lst

## Here you can input a sentence and the model predicts the next word.

In [33]:
lst = inputField()
print("Your sentence:" , ' '.join(lst))
print("Your sentence with a possible suffix word:" , ' '.join(lst)+' '+ predict_next_word(lst))

Enter your sentence: I am thinking about
Your sentence: I am thinking about
Your sentence with a possible suffix word: I am thinking about facing


In [20]:
def cosine_similarity(a,b):
    cosine_loss = CosineSimilarity(axis=1)
    return cosine_loss(a,b).numpy()

## Here you can input two sentences and you will get the similarity between them.

In [21]:
def cosineSimField():
    lst1 = inputField()
    lst2 = inputField()
    ids1 = string_to_model_input(lst1)
    ids2 = string_to_model_input(lst1)
    a = get_cell_state(ids1[0])
    b = get_cell_state(ids2[0])
    value = cosine_similarity(a,b)
    print("\nThe first sentence:",' '.join(lst1))
    print("The second sentence:", ' '.join(lst2))
    print("The similarity between the two sentences is:" , value)
cosineSimField()

Enter your sentence: He plays football
Enter your sentence: I read a book

The first sentence: He plays football
The second sentence: I read a book
The similarity between the two sentences is: -0.9999999


## Mini search engine.

In [22]:
f = max_sequence_length
t = AnnoyIndex(f, 'angular')  

In [23]:
for ind,sen in enumerate(corpus):
    lst = []
    for word in sen:
        v = myDic[word.lower()]
        lst.append(v)
    lst = lst[:f]
    L = len(lst)
    lst = lst + [0]*(f-L)
    t.add_item(ind, lst)

In [24]:
t.build(10) # 10 trees
t.save('test.ann')
t.load('test.ann') 

True

## Here you can input two sentences and the search engine will give you the 5 nearest neighbors from the index.

In [25]:
def FiveNearest():
    lst3 = inputField()
    lst3toNum = [myDic[word.lower()] for word in lst3][:f]
    L3 = len(lst3toNum)
    lst3toNum = lst3toNum + [0]*(f-L3)
    fiveInd = t.get_nns_by_vector(lst3toNum, 5, search_k=-1, include_distances=False)
    print(fiveInd)
    for i in fiveInd:
        print('\n' , ' '.join(corpus[i]))
FiveNearest()

Enter your sentence: This is my book
[14293, 20569, 24172, 49326, 38156]

 there is every indication it will continue .

 It was a worker .

 He got her dozens of them .

 I can see Dan .

 `` I can fix him something later in the afternoon when we get home '' .
