In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.regularizers import l2

with open("corpus.txt", 'r', encoding='utf-8') as myfile:
    mytext = myfile.read()

In [4]:
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])
total_words = len(mytokenizer.word_index) + 1

In [3]:
mytokenizer.word_index

{'the': 1,
 'of': 2,
 'a': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'we': 7,
 'that': 8,
 'for': 9,
 'on': 10,
 'deep': 11,
 'is': 12,
 'neural': 13,
 'networks': 14,
 'learning': 15,
 'with': 16,
 'by': 17,
 'this': 18,
 'as': 19,
 'network': 20,
 'training': 21,
 'be': 22,
 'are': 23,
 'can': 24,
 'an': 25,
 'model': 26,
 'our': 27,
 'data': 28,
 'using': 29,
 'from': 30,
 'which': 31,
 'models': 32,
 'show': 33,
 'based': 34,
 'performance': 35,
 'have': 36,
 'gradient': 37,
 'it': 38,
 'layers': 39,
 'such': 40,
 'new': 41,
 'dnn': 42,
 'results': 43,
 'layer': 44,
 'method': 45,
 'methods': 46,
 'approach': 47,
 'these': 48,
 'or': 49,
 'recognition': 50,
 'more': 51,
 'algorithm': 52,
 'parameters': 53,
 'function': 54,
 'at': 55,
 'each': 56,
 'not': 57,
 'state': 58,
 'art': 59,
 'paper': 60,
 'number': 61,
 'large': 62,
 'optimization': 63,
 'rnn': 64,
 'input': 65,
 'speech': 66,
 'has': 67,
 'stochastic': 68,
 'propose': 69,
 'convolutional': 70,
 'features': 71,
 'used': 72,
 'cl

In [6]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(mytokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
# Reading corpus the text file
with open("chunk_1.txt", 'r', encoding='utf-8') as myfile:
    mytext1 = myfile.read()

In [46]:
my_input_sequences = []
for line in mytext1.split('\n'):
    token_list = mytokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        my_n_gram_sequence = token_list[:i+1]
        my_input_sequences.append(my_n_gram_sequence)

In [47]:
max_sequence_len = max([len(seq) for seq in my_input_sequences])
input_sequences = np.array(pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding='pre'))

In [21]:
input_sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [22]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [23]:
X[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [24]:
y

array([ 998,    5, 1453, ...,   17, 1323,  167])

In [25]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [26]:
y[1]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [12]:
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.5))  
model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.5))  
model.add(Dense(total_words, activation='softmax', kernel_regularizer=l2(0.01)))  
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 294, 100)          261900    
                                                                 
 bidirectional_2 (Bidirectio  (None, 294, 300)         301200    
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 294, 300)          0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 300)              541200    
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 300)               0         
                                                                 
 dense_1 (Dense)             (None, 2619)             

In [28]:

model.fit(X, y, epochs=25, verbose=1,batch_size=64,callbacks=[reduce_lr, early_stopping])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1ee3cdbda30>

In [30]:
input_text = "Neural Network"
predict_next_words= 6

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(token_list)
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in mytokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word

print(input_text)

[13, 20]
[13, 20, 593]
[13, 20, 593, 593]
[13, 20, 593, 593, 593]
[13, 20, 593, 593, 593, 593]
[13, 20, 593, 593, 593, 593, 593]
Neural Network overhead overhead overhead overhead overhead overhead


In [32]:
# Save the model
model.save('my_model')




INFO:tensorflow:Assets written to: my_model\assets


INFO:tensorflow:Assets written to: my_model\assets


In [36]:
from keras.models import load_model
loaded_model = load_model('my_model')
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
with open("chunk_4.txt", 'r', encoding='utf-8') as myfile:
    mytext2 = myfile.read()

In [11]:
my_input_sequences = []
for line in mytext2.split('\n'):
    token_list = mytokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        my_n_gram_sequence = token_list[:i+1]
        my_input_sequences.append(my_n_gram_sequence)

In [12]:
max_sequence_len = max([len(seq) for seq in my_input_sequences])
input_sequences = np.array(pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding='pre'))

In [13]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [14]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [42]:
loaded_model.fit(X, y, epochs=25, verbose=1,batch_size=64,callbacks=[reduce_lr, early_stopping])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1c222050310>

In [48]:
loaded_model.save('my_model')



INFO:tensorflow:Assets written to: my_model\assets


INFO:tensorflow:Assets written to: my_model\assets


In [None]:
from keras.models import load_model
loaded_model = load_model('my_model')
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
input_text = "Neural Network" 
predict_next_words= 4

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(loaded_model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in mytokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word

print(input_text)

overfitting the network is used
