In [1]:
#Importing the libraries
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM

In [2]:
# Load the corpus: a tab-separated file without a header row, so the
# columns are labelled by position (0, 1, 2, 3).
data = pd.read_csv('raw_in_domain_train.tsv', sep='\t', header=None)

In [3]:
# Bare last expression: the notebook renders a truncated preview of the frame
data

Unnamed: 0,0,1,2,3
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.
...,...,...,...,...
8546,ad03,0,*,Poseidon appears to own a dragon
8547,ad03,0,*,Digitize is my happiest memory
8548,ad03,1,,It is easy to slay the Gorgon.
8549,ad03,1,,I had the strangest feeling that I knew you.


In [4]:
## Column 3 holds the sentences; extract it as a plain Python list
texts = data.loc[:, 3].tolist()

In [5]:
##Function for cleaning texts
def clean_text(text):
    """Remove basic punctuation and stand-alone numbers from one sentence.

    Steps, in order:
      1. every run of '?' / '!' is replaced by a single space;
      2. a number that has whitespace on both sides is dropped together with
         the whitespace that follows it, so the neighbouring words stay
         separated (numbers glued to other characters, or sitting at the very
         start or end of the string at this point, are left untouched);
      3. every '.' and ',' is removed;
      4. leading and trailing whitespace is stripped.

    Args:
        text: the sentence to clean (a ``str``).

    Returns:
        The cleaned sentence. Letter case is not changed here.
    """
    # Raw strings keep regex escapes such as \s and \d from being read as
    # (invalid) Python string escapes.
    text = re.sub(r"[?!]+", " ", text)
    # The look-behind leaves the preceding whitespace in place; removing it as
    # well would fuse the words on either side of the number.
    text = re.sub(r"(?<=\s)\d+\s+", "", text)
    text = re.sub(r"[.,]+", "", text)
    return text.strip()

In [6]:
##Function for processing texts
def process_sentence(texts):
    """Clean and lowercase every sentence.

    Args:
        texts: an iterable of sentences.

    Returns:
        A new list holding, for each input sentence and in the same order,
        the result of ``clean_text`` converted to lower case.
    """
    return [clean_text(sentence).lower() for sentence in texts]

In [7]:
# Run the cleaning + lowercasing pipeline over every sentence, then preview the first five results
clean_texts=process_sentence(texts)
clean_texts[:5]

["our friends won't buy this analysis let alone the next one we propose",
 "one more pseudo generalization and i'm giving up",
 "one more pseudo generalization or i'm giving up",
 'the more we study verbs the crazier they get',
 'day by day the facts are getting murkier']

In [8]:
#Tokenization: build the vocabulary from the cleaned texts and encode each text as a sequence of word indices
max_words = 6000 # Value passed to the Tokenizer as num_words
tokenizer = Tokenizer(num_words=max_words) # NOTE(review): num_words presumably caps encoding to the most frequent words - confirm against the Keras docs
tokenizer.fit_on_texts(clean_texts) # Build the word -> index mapping (tokenizer.word_index) from the corpus
sequences = tokenizer.texts_to_sequences(clean_texts) # Encode each text as a list of word indices
vocab_size = len(tokenizer.word_index) # Number of distinct words in the fitted word index

In [9]:
# Preview the first five encoded sentences together with the vocabulary size
sequences[:5],vocab_size

([[221, 271, 253, 164, 28, 698, 753, 1288, 1, 699, 79, 29, 3571],
  [79, 25, 2624, 2625, 9, 144, 1478, 49],
  [79, 25, 2624, 2625, 138, 144, 1478, 49],
  [1, 25, 29, 448, 3572, 1, 1289, 30, 210],
  [427, 34, 427, 1, 3573, 45, 902, 3574]],
 5524)

In [10]:
# Length of the longest encoded sentence
max(map(len, sequences))

42

In [11]:
#Left-pad every encoded text with zeros to a fixed length of 43,
#i.e. one more than the longest sequence length (42) reported by the previous cell
padded_texts=pad_sequences(sequences,maxlen=43,padding='pre')
padded_texts[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,  221,  271,  253,
         164,   28,  698,  753, 1288,    1,  699,   79,   29, 3571],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   79,   25, 2624, 2625,    9,  144, 1478,   49],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   79,   25, 2624, 2625,  138,  144, 1478,   49],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    

In [12]:
#Obtaining the input vector X and the label vector Y
import keras.utils as ku

# Target: the last column of each padded row (the final word of the sentence),
# one-hot encoded over vocab_size classes.
# NOTE(review): the padded matrix uses 0 for padding and the encoded sentences
# shown earlier start at index 1, so the largest word index may equal
# vocab_size, which does not fit into num_classes=vocab_size. Confirm that no
# target hits that index; widening num_classes would also require widening
# the model's output layer.
label = ku.to_categorical(padded_texts[:,-1], num_classes=vocab_size)
y=label
# Input: every column except the last one (the left-padded preceding words).
x=padded_texts[:,:-1]
# A fixed random_state makes the 90/10 train/test partition identical on every
# run, so the evaluation figures are comparable between runs.
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.1,random_state=42)

In [18]:
# Network: embedding -> LSTM -> dense hidden layer -> softmax over vocab_size classes
model = Sequential([
    Embedding(input_dim=vocab_size+1, output_dim=10, input_length=42),
    LSTM(40, activation='relu'),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 42, 10)            55250     
                                                                 
 lstm_1 (LSTM)               (None, 40)                8160      
                                                                 
 dense_2 (Dense)             (None, 100)               4100      
                                                                 
 dense_3 (Dense)             (None, 5524)              557924    
                                                                 
Total params: 625,434
Trainable params: 625,434
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Categorical cross-entropy against the one-hot targets, Adam optimizer, accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the training split: 100 epochs, mini-batches of 64 samples
model.fit(
    train_x,
    train_y,
    batch_size=64,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2357ce6d520>

In [20]:
#Test Evaluation: score the trained model on the held-out test split
#(the output lists two numbers; given the compile settings in this notebook they are presumably [loss, accuracy] - confirm)
model.evaluate(test_x,test_y)



[0.24387529492378235, 0.9217289686203003]

In [21]:
#Evaluation for overfitting: score the model on the training split so the
#result can be compared with the test-split figures from the previous cell
model.evaluate(train_x,train_y)



[0.23521637916564941, 0.9272254705429077]

In [24]:
#function to generate text from a seed text, a number of words to generate, and a model
def generate_text(text, words_generated, model, max_len=42):
    """Extend ``text`` one word at a time with the model's top prediction.

    On each step the current text is encoded with the notebook-level
    ``tokenizer``, left-padded to ``max_len`` and passed to ``model.predict``.
    The word whose index equals the argmax of the prediction is appended,
    separated by a single space.

    Args:
        text: seed text to continue.
        words_generated: maximum number of words to append.
        model: object whose ``predict`` returns an array-like with ``argmax``.
        max_len: length the encoded text is padded to before prediction;
            defaults to 42, the value previously hard-coded here.

    Returns:
        The seed text followed by the generated words. Generation stops early
        when the predicted index has no entry in ``tokenizer.word_index``:
        the text would not change, so every later step would repeat the same
        prediction and only append blank separators.
    """
    # Built once per call instead of scanning word_index (and recomputing the
    # argmax) for every vocabulary entry on every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(words_generated):
        token_list = tokenizer.texts_to_sequences([text])[0]  # encode the current text
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')  # left-pad to the model input length
        predicted = model.predict(token_list)

        output_word = index_to_word.get(int(predicted.argmax()), "")
        if not output_word:
            break
        text += " " + output_word
    return text

In [25]:
#Text Generation: extend the seed text by up to 4 predicted words using the trained model
generate_text('i dont remember',4,model)



'i dont remember sugar fool helen medicine'