In [None]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np

# load the dataset
data = gutenberg.raw('bryant-stories.txt')
# save to file
with open('bryant-stories.txt','w') as file:
    file.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Wayne\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# load the dataset
with open('bryant-stories.txt','r') as file:
    text = file.read().lower()

# tokenize the text
# created indexs for words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text]) # builds a word-> index dict
total_words = len(tokenizer.word_index)+1
total_words

3943

In [5]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'a': 4,
 'he': 5,
 'of': 6,
 'was': 7,
 'in': 8,
 'it': 9,
 'little': 10,
 'his': 11,
 'that': 12,
 'i': 13,
 'you': 14,
 'said': 15,
 'she': 16,
 'they': 17,
 'for': 18,
 'as': 19,
 'but': 20,
 'so': 21,
 'him': 22,
 'had': 23,
 'her': 24,
 'with': 25,
 'when': 26,
 'on': 27,
 'at': 28,
 'is': 29,
 'all': 30,
 'not': 31,
 'there': 32,
 'out': 33,
 'were': 34,
 'came': 35,
 'me': 36,
 'then': 37,
 'them': 38,
 'one': 39,
 'up': 40,
 'be': 41,
 'this': 42,
 'from': 43,
 'could': 44,
 'very': 45,
 'have': 46,
 'will': 47,
 'down': 48,
 'who': 49,
 'my': 50,
 'king': 51,
 'went': 52,
 'do': 53,
 'what': 54,
 'their': 55,
 'would': 56,
 'no': 57,
 'great': 58,
 'day': 59,
 'are': 60,
 'more': 61,
 'old': 62,
 'by': 63,
 'if': 64,
 'time': 65,
 'see': 66,
 'man': 67,
 'like': 68,
 'saw': 69,
 'away': 70,
 'made': 71,
 'into': 72,
 'your': 73,
 'mother': 74,
 'good': 75,
 'come': 76,
 'over': 77,
 'about': 78,
 'after': 79,
 'jackal': 80,
 'did': 81,
 'where'

In [6]:
inputsequences=[]
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        inputsequences.append(n_gram_sequence)
        


In [7]:
inputsequences

[[404, 3],
 [404, 3, 237],
 [404, 3, 237, 3],
 [404, 3, 237, 3, 169],
 [404, 3, 237, 3, 169, 63],
 [404, 3, 237, 3, 169, 63, 2280],
 [404, 3, 237, 3, 169, 63, 2280, 2281],
 [404, 3, 237, 3, 169, 63, 2280, 2281, 2282],
 [404, 3, 237, 3, 169, 63, 2280, 2281, 2282, 2283],
 [97, 10],
 [97, 10, 2284],
 [97, 10, 2284, 8],
 [97, 10, 2284, 8, 2285],
 [1634, 4],
 [1634, 4, 134],
 [1634, 4, 134, 12],
 [1634, 4, 134, 12, 13],
 [1634, 4, 134, 12, 13, 2286],
 [389, 6],
 [389, 6, 10],
 [389, 6, 10, 2287],
 [10, 1106],
 [10, 1106, 6],
 [10, 1106, 6, 606],
 [10, 1106, 6, 606, 17],
 [10, 1106, 6, 606, 17, 1107],
 [2, 323],
 [2, 323, 2288],
 [2, 323, 2288, 45],
 [2, 323, 2288, 45, 467],
 [43, 124],
 [43, 124, 3],
 [43, 124, 3, 124],
 [43, 124, 3, 124, 5],
 [43, 124, 3, 124, 5, 1635],
 [4, 962],
 [4, 962, 238],
 [4, 962, 238, 2],
 [4, 962, 238, 2, 2290],
 [2, 841],
 [2, 841, 9],
 [2, 841, 9, 2291],
 [2, 841, 9, 2291, 104],
 [2, 841, 9, 2291, 104, 2292],
 [5, 2293],
 [5, 2293, 607],
 [5, 2293, 607, 8],
 [

In [8]:
# padding the sequences
max_sequence_len = max([len(i) for i in inputsequences])
max_sequence_len

19

In [9]:
input_sequence = np.array(pad_sequences(inputsequences, maxlen=max_sequence_len, padding='pre'))
input_sequence

array([[   0,    0,    0, ...,    0,  404,    3],
       [   0,    0,    0, ...,  404,    3,  237],
       [   0,    0,    0, ...,    3,  237,    3],
       ...,
       [   0,    0,    0, ...,  704,   82,  202],
       [   0,    0,    0, ...,   82,  202, 3941],
       [   0,    0,    0, ...,  202, 3941,   60]],
      shape=(42214, 19), dtype=int32)

In [10]:
# create predictors and labels
import tensorflow as tf
x,y=input_sequence[:,:-1],input_sequence[:,-1]

In [11]:
x

array([[   0,    0,    0, ...,    0,    0,  404],
       [   0,    0,    0, ...,    0,  404,    3],
       [   0,    0,    0, ...,  404,    3,  237],
       ...,
       [   0,    0,    0, ...,  645,  704,   82],
       [   0,    0,    0, ...,  704,   82,  202],
       [   0,    0,    0, ...,   82,  202, 3941]],
      shape=(42214, 18), dtype=int32)

In [12]:
y

array([   3,  237,    3, ...,  202, 3941,   60],
      shape=(42214,), dtype=int32)

In [13]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(42214, 3943))

In [14]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=42)

In [None]:
# define early stopping 
from tensorflow.keras.callbacks import EarlyStopping

earlystopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True # it rewinds to the last epoch with the lowest loss
)

In [43]:
# train the LSTM RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout

# define the model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len))
model.add(LSTM(200,return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.build(input_shape=(None, max_sequence_len))

In [44]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [45]:
model.summary()

In [46]:
history=model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test),verbose=1, callbacks=[earlystopping])

Epoch 1/10
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 34ms/step - accuracy: 0.0788 - loss: 6.2704 - val_accuracy: 0.0916 - val_loss: 6.0508
Epoch 2/10
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 40ms/step - accuracy: 0.0977 - loss: 5.8031 - val_accuracy: 0.1011 - val_loss: 5.8954
Epoch 3/10
[1m1056/1056[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 37ms/step - accuracy: 0.1167 - loss: 5.5292 - val_accuracy: 0.1189 - val_loss: 5.7962


In [20]:
model.save('LSTM_model_1.h5')



In [48]:
import tensorflow as tf
from tensorflow.keras.models import load_model
model=load_model('LSTM_model_1.h5')



In [49]:
# predict function
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list=tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list=token_list[-(max_sequence_len-1):]
    token_list=pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index==predicted_word_index:
            return word
    return None

In [59]:
input_text = "what is the last word"
print(f"Input text: {input_text}")
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next word prediction: {next_word}")

Input text: what is the last word
Next word prediction: and


In [29]:
# save the tokenizer
import pickle
with open("tokenizer.pkl",'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Hyperparameter tuninig

In [47]:
def build_lstm_model(hp):
    model = Sequential()

    # Tunable hyperparameters
    embed_dim = hp.Choice("embed_dim", [50, 100, 150])
    lstm_units = hp.Choice("lstm_units", [100, 150, 200])
    dropout_rate = hp.Choice("dropout_rate", [0.1, 0.2, 0.3])

    model.add(Embedding(total_words, embed_dim, input_length=max_sequence_len-1))
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(lstm_units // 2))
    model.add(Dense(total_words, activation="softmax"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam"
    )

    return model


In [None]:
import keras_tuner as kt

tuner = kt.RandomSearch(
    build_lstm_model,
    objective="val_loss",
    max_trials=5,
    overwrite=True,
    directory="lstm_tuner",
    project_name="textgen"
)

earlystop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

tuner.search(x, y, epochs=15, validation_split=0.1, callbacks=[earlystop])


Trial 5 Complete [00h 03m 40s]
val_loss: 6.400148868560791

Best val_loss So Far: 6.349607944488525
Total elapsed time: 00h 16m 49s
