In [16]:
import numpy as np 
from numpy import array
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [7]:
#Function to read the text data into memory 
import re

def load_doc(filename, encoding=None):
    """Read an entire text file into memory and return it as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform default, which is what this function used
            before the parameter existed.

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/read/close sequence did not guarantee.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [3]:
# Path of the text file that feeds the rest of the notebook.
# NOTE(review): the file name suggests pre-built training sequences rather
# than the raw book text - confirm this is the intended input.
in_filename = "republic_sequences.txt"
# Read the whole file, then keep only its first 100,000 characters.
doc = load_doc(in_filename)[:100000]

In [5]:
def clean_doc(doc):
    """Turn raw text into a list of lower-cased, purely alphabetic words.

    Every character that is neither a word character nor whitespace is
    deleted first, so "well-known" becomes "wellknown". The text is then
    split on whitespace, and any piece that is not entirely alphabetic
    (for example one containing a digit or an underscore) is dropped.

    Args:
        doc: The text to clean.

    Returns:
        A list of the surviving words, lower-cased, in original order.
    """
    without_punctuation = re.sub(r'[^\w\s]','',doc)
    return [
        piece.lower()
        for piece in without_punctuation.split()
        if piece.isalpha()
    ]

In [8]:
# Clean the loaded text: `tokens` becomes the list of lower-cased alphabetic
# words that clean_doc returns, in document order.
tokens = clean_doc(doc)

In [9]:
# Sanity check: show the first 100 cleaned tokens.
print(tokens[:100])

['introduction', 'and', 'analysis', 'the', 'republic', 'of', 'plato', 'is', 'the', 'longest', 'of', 'his', 'works', 'with', 'the', 'exception', 'of', 'the', 'laws', 'and', 'is', 'certainly', 'the', 'greatest', 'of', 'them', 'there', 'are', 'nearer', 'approaches', 'to', 'modern', 'metaphysics', 'in', 'the', 'philebus', 'and', 'in', 'the', 'sophist', 'the', 'politicus', 'or', 'statesman', 'is', 'more', 'ideal', 'the', 'form', 'and', 'institutions', 'and', 'analysis', 'the', 'republic', 'of', 'plato', 'is', 'the', 'longest', 'of', 'his', 'works', 'with', 'the', 'exception', 'of', 'the', 'laws', 'and', 'is', 'certainly', 'the', 'greatest', 'of', 'them', 'there', 'are', 'nearer', 'approaches', 'to', 'modern', 'metaphysics', 'in', 'the', 'philebus', 'and', 'in', 'the', 'sophist', 'the', 'politicus', 'or', 'statesman', 'is', 'more', 'ideal', 'the', 'form', 'and']


In [None]:
# X                                   Y
# --------------------000000000000000 -
# --------------000000000000000000000 -

In [10]:
# Each training example is a window of 51 consecutive tokens:
# the first 50 words are the input (X) and the 51st word is the target (y).
length = 50+1

# Slide the window one token at a time; the window starting at `start`
# covers tokens[start:start+length] and is stored as a space-joined string.
# NOTE(review): the last start index is len(tokens)-length-1, so the final
# possible window (tokens[-length:]) is not emitted - confirm this is intended.
sequences = [
    " ".join(tokens[start:start+length])
    for start in range(len(tokens)-length)
]

print("The total no. of sequences", len(sequences))

The total no. of sequences 17676


In [14]:
# Sanity check: count the words in the first sequence string.
print("Each sequence is of length:",len(sequences[0].split()))

Each sequence is of length: 51


In [17]:
# Encode the sequences of words: fit a Keras Tokenizer on the text sequences
# so that it builds its word -> integer index mapping (tokenizer.word_index).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)

In [18]:
# Number of distinct words plus one. Keras Tokenizer word indices start at 1
# (0 is reserved), so the extra slot lets layers sized by vocab_size address
# every index from 0 up to the largest word index.
vocab_size = len(tokenizer.word_index)+1

In [21]:
# Replace each text sequence with its list of integer word indices.
# NOTE: this rebinds `sequences`, so re-running this cell on its own would
# feed the integer lists back into the tokenizer; re-run the cell that
# builds the text sequences first.
sequences = tokenizer.texts_to_sequences(sequences)

In [23]:
# Stack the encoded sequences into one array (one row per sequence).
sequences = array(sequences)
# Every column except the last is model input.
X = sequences[:,:-1]
# The last column is the target word index, one-hot encoded over the vocabulary.
y = to_categorical(sequences[:,-1],num_classes = vocab_size)
# Number of input words per example, taken from the width of X.
seq_length = X.shape[1]

In [62]:
# Display the vocabulary size computed from the fitted tokenizer.
vocab_size

205

In [25]:
# Define the model: a word embedding, two stacked LSTM layers, a dense hidden
# layer, and a softmax output with one unit per vocabulary index.
model = Sequential([
    # 50-dimensional embedding for each of the vocab_size word indices.
    Embedding(vocab_size,50,input_length = seq_length),
    # The first LSTM returns its full output sequence so the second can consume it.
    LSTM(100,return_sequences = True),
    LSTM(100),
    Dense(100,activation= 'relu'),
    Dense(vocab_size,activation = 'softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            10250     
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 205)               20705     
Total params: 181,855
Trainable params: 181,855
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)
# Train for 10 epochs in mini-batches of 128; the History object returned by
# fit is the cell's displayed value.
model.fit(X,y,epochs = 10,batch_size = 128)

Train on 17676 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdb246669e8>

In [59]:
from tensorflow.keras.models import load_model 
from pickle import load 
from tensorflow.keras.preprocessing.sequence import pad_sequences


#Generating Seq from our Language model 
def generate_seq(model,tokenizer,seq_length,in_text,n_words):
    """Extend ``in_text`` with ``n_words`` words predicted one at a time.

    On every step the text generated so far is encoded with ``tokenizer``,
    padded or truncated to ``seq_length`` indices, and passed to ``model``;
    the highest-scoring word index is mapped back to a word and appended.

    Args:
        model: Object whose ``predict(encoded, verbose=0)`` returns one row
            of per-index scores for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        seq_length: Number of word indices fed to the model per step.
        in_text: Seed text to continue.
        n_words: Number of words to append.

    Returns:
        ``in_text`` followed by the generated words, separated by spaces.
        If a predicted index has no entry in ``word_index``, an empty word
        is appended for that step.
    """
    # Build the index -> word table once instead of scanning word_index
    # linearly for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(n_words):
        # Encode the text generated so far as integer word indices.
        encoded = tokenizer.texts_to_sequences([in_text])
        # Keep only the most recent seq_length indices (older ones are cut
        # from the front); shorter inputs are padded.
        encoded = pad_sequences(encoded,maxlen = seq_length,truncating = 'pre')
        # Model.predict_classes is not available in newer TensorFlow
        # releases; taking the arg-max of predict() yields the same class id.
        scores = model.predict(encoded,verbose = 0)
        # Convert to a plain int so the dictionary lookup is unambiguous.
        yhat = int(np.argmax(scores,axis = -1)[0])
        # Map the predicted index back to its word.
        out_word = index_to_word.get(yhat,'')
        in_text += " "+out_word
    return in_text

In [60]:
# Seed text for generation.
# NOTE(review): seed words that are absent from the fitted vocabulary are
# presumably skipped when the tokenizer encodes the text - confirm.
in_text = 'Hi I am taking classes at inceptez'

# Append 9 predicted words to the seed text.
generated = generate_seq(model,tokenizer,seq_length,in_text,9)

In [61]:
# Display the seed text followed by the generated words.
generated

'Hi I am taking classes at inceptez there a deeper who conceived a method of knowledge'