In [3]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np

[nltk_data] Downloading package gutenberg to C:\Users\Himank
[nltk_data]     K\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [4]:
data = gutenberg.raw('shakespeare-hamlet.txt')
with open('hamlet.txt', 'w') as file:
    file.write(data)

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split




In [6]:
with open('hamlet.txt', 'r') as file:
    text = file.read().lower()

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

In [7]:
tokenizer.word_counts

OrderedDict([('the', 993),
             ('tragedie', 4),
             ('of', 610),
             ('hamlet', 100),
             ('by', 105),
             ('william', 1),
             ('shakespeare', 1),
             ('1599', 1),
             ('actus', 2),
             ('primus', 1),
             ('scoena', 1),
             ('prima', 1),
             ('enter', 85),
             ('barnardo', 8),
             ('and', 862),
             ('francisco', 2),
             ('two', 22),
             ('centinels', 1),
             ("who's", 2),
             ('there', 76),
             ('fran', 8),
             ('nay', 26),
             ('answer', 9),
             ('me', 228),
             ('stand', 15),
             ('vnfold', 3),
             ('your', 253),
             ('selfe', 68),
             ('bar', 7),
             ('long', 17),
             ('liue', 15),
             ('king', 171),
             ('he', 196),
             ('you', 522),
             ('come', 104),
             ('most', 77),
  

In [8]:
input_sequences = []
for line in text.split('\n:'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [9]:
len(input_sequences)

29697

In [10]:
max_sequence_len = max([len(i) for i in input_sequences])
max_sequence_len

29698

In [12]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [13]:
input_sequences[0]

array([  0,   0,   0, ...,   0,   1, 687])

In [14]:
import tensorflow as tf
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [15]:
y

array([ 687,    4,   45, ..., 1047,    4,  193])

In [16]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout 
from tensorflow.keras.callbacks import EarlyStopping

In [22]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation="softmax"))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 29697, 100)        481800    
                                                                 
 lstm_2 (LSTM)               (None, 100)               80400     
                                                                 
 dense_1 (Dense)             (None, 4818)              486618    
                                                                 
Total params: 1048818 (4.00 MB)
Trainable params: 1048818 (4.00 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stopping_callback = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

In [24]:
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=100, verbose=1
                     )

Epoch 1/100


  1/743 [..............................] - ETA: 18:12:58 - loss: 8.4805 - accuracy: 0.0000e+00

KeyboardInterrupt: 

In [None]:
model.save('next_word_lstm.h5')

In [21]:
import pickle
with open('tokeinzer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)