In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re

In [3]:
#load the data into memory 
def load_doc(filename):
  #open the file in read only mode 
  file = open(filename,'r')
  #read all the text
  text = file.read()
  #close the file
  file.close()
  return text

In [4]:
def clean_doc(doc):
  #replace '--' with a space ' '
  doc = doc.replace('--', ' ')
  #remove all punctuations
  doc = re.sub(r'[^\w\s]','',doc)
  #word tokenization 
  tokens = doc.split()
  #remove remaining tokens that are not alphanumeric
  tokens = [word for word in tokens if word.isalpha()]
  #make lower case 
  tokens = [word.lower() for word in tokens]
  return tokens

In [11]:
#load the document
in_filename = 'republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:202])

#clean the document
tokens = clean_doc(doc)
print(tokens[:200])

﻿INTRODUCTION AND ANALYSIS.

The Republic of Plato is the longest of his works with the exception
of the Laws, and is certainly the greatest of them. There are nearer
approaches to modern metaphysics in
['introduction', 'and', 'analysis', 'the', 'republic', 'of', 'plato', 'is', 'the', 'longest', 'of', 'his', 'works', 'with', 'the', 'exception', 'of', 'the', 'laws', 'and', 'is', 'certainly', 'the', 'greatest', 'of', 'them', 'there', 'are', 'nearer', 'approaches', 'to', 'modern', 'metaphysics', 'in', 'the', 'philebus', 'and', 'in', 'the', 'sophist', 'the', 'politicus', 'or', 'statesman', 'is', 'more', 'ideal', 'the', 'form', 'and', 'institutions', 'of', 'the', 'state', 'are', 'more', 'clearly', 'drawn', 'out', 'in', 'the', 'laws', 'as', 'works', 'of', 'art', 'the', 'symposium', 'and', 'the', 'protagoras', 'are', 'of', 'higher', 'excellence', 'but', 'no', 'other', 'dialogue', 'of', 'plato', 'has', 'the', 'same', 'largeness', 'of', 'view', 'and', 'the', 'same', 'perfection', 'of', 'style',

In [None]:
tokens

In [13]:
#Train - 50words - Predict 51th word
#First 50- X, 51st word - y - BPTT
length = 50+1
sequences = list()
#51 ------> len(no. words in corpus)
for i in range(length, len(tokens)):
  #Make sequence of tokens 
  #First i - length = 0, 51
  seq = tokens[i-length:i]
  #convert this into a line 
  line = " ".join(seq)
  sequences.append(line)

print("Total number of sequences: %d" %len(sequences))
print("The each sequence is of length %d" % len(sequences[0].split()))


Total number of sequences: 216640
The each sequence is of length 51


In [14]:
from numpy import array 
from pickle import dump 
from keras.preprocessing.text import Tokenizer 
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential 
from keras.layers import Dense
from keras.layers import LSTM 
from keras.layers import Embedding

In [15]:
#interger encoding of sequence of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)

In [16]:
vocab_size = len(tokenizer.word_index) + 1

In [17]:
vocab_size

10437

In [18]:
sequences = tokenizer.texts_to_sequences(sequences)

In [20]:
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y,num_classes=vocab_size)
seq_length = X.shape[1]

In [21]:
#define the model 
model = Sequential()
model.add(Embedding(vocab_size,50,input_length = seq_length))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100,activation = 'relu'))
model.add(Dense(vocab_size,activation= 'softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            521850    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10437)             1054137   
Total params: 1,726,887
Trainable params: 1,726,887
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam',metrics = ['accuracy'])
#fit model 
model.fit(X,y,batch_size= 128,epochs = 100)

Epoch 1/100


KeyboardInterrupt: ignored

In [None]:
#save a keras model 
model.save('model.h5')

#save the tokenizer 
dump(tokenizer,open('tokenizer.pkl','wb'))

In [25]:
#load a model 
from keras.models import load_model 
from pickle import load 
from keras.preprocessing.sequence import pad_sequences

#loading a model 
model = load_model('model.h5')


#loading a tokenizer 
tokenizer = load(open('tokenizer.pkl','rb'))

In [33]:
#Generating Seq from our language model 
def generate_seq(model,tokenizer,seq_length,in_text,n_words):
  results = list()
  #generate a fixed number of words
  for _ in range(n_words):
    #encode the text to integers
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    #trunctae the sequences to fixed length
    encoded = pad_sequences([encoded],maxlen= seq_length,truncating = 'pre') 
    #predict probability for each word
    yhat = model.predict_classes(encoded,verbose = 0)

    #map the predicted index to word
    out_word = ''
    for word,index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
    in_text += ' '+out_word
    results.append(out_word)
  return ' '.join(results)

In [34]:
tokenizer.word_index.items()

dict_items([('the', 1), ('of', 2), ('and', 3), ('to', 4), ('is', 5), ('in', 6), ('a', 7), ('he', 8), ('that', 9), ('be', 10), ('which', 11), ('not', 12), ('or', 13), ('are', 14), ('they', 15), ('i', 16), ('as', 17), ('will', 18), ('but', 19), ('have', 20), ('we', 21), ('you', 22), ('by', 23), ('his', 24), ('them', 25), ('for', 26), ('said', 27), ('with', 28), ('their', 29), ('who', 30), ('this', 31), ('one', 32), ('there', 33), ('all', 34), ('at', 35), ('has', 36), ('may', 37), ('what', 38), ('from', 39), ('if', 40), ('when', 41), ('then', 42), ('other', 43), ('no', 44), ('him', 45), ('state', 46), ('good', 47), ('an', 48), ('any', 49), ('was', 50), ('only', 51), ('would', 52), ('more', 53), ('man', 54), ('do', 55), ('true', 56), ('on', 57), ('our', 58), ('than', 59), ('like', 60), ('must', 61), ('yes', 62), ('were', 63), ('been', 64), ('can', 65), ('should', 66), ('into', 67), ('such', 68), ('us', 69), ('men', 70), ('say', 71), ('so', 72), ('it', 73), ('also', 74), ('own', 75), ('life

In [35]:
in_text = "Hi i am taking classes at inceptez"

#generate new text 
generated = generate_seq(model,tokenizer,seq_length,in_text,5)
print(generated)



the commencement of the admiration
