In [2]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
import re

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
#df = gutenberg.raw("shakespeare-hamlet.txt")
# Read the file content. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale encoding.
with open("data.txt", "r", encoding="utf-8") as file:
    df = file.read()

# Everything except ASCII letters, apostrophes and spaces is removed from a sentence.
_NON_WORD_CHARS = re.compile(r'[^a-zA-Z\' ]')


def chunk_sentences(text, max_words=8):
    """Split raw text into lower-cased word chunks of at most ``max_words`` words.

    Each non-blank line is split on ".", every piece is stripped of characters
    other than letters, apostrophes and spaces, lower-cased, and then cut into
    consecutive chunks of ``max_words`` words (the final chunk of a piece may
    be shorter). Pieces that contain no words produce no chunk.

    Args:
        text: The raw text to process.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of strings, each holding the words of one chunk joined by
        single spaces.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    chunks = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        for piece in line.split("."):
            words = _NON_WORD_CHARS.sub('', piece).lower().split()
            # Stepping by max_words yields full chunks followed by the remainder;
            # an empty word list yields nothing at all.
            for start in range(0, len(words), max_words):
                chunks.append(' '.join(words[start:start + max_words]))
    return chunks


# Process each line and sentence
sentences = chunk_sentences(df)

In [4]:
# Display the list of word chunks built from data.txt
sentences

["project gutenberg's the adventures of sherlock holmes by",
 'arthur conan doyle',
 'this ebook is for the use of anyone',
 'anywhere at no cost and with',
 'almost no restrictions whatsoever',
 'you may copy it give it away or',
 'reuse it under the terms of the project',
 'gutenberg license included',
 'with this ebook or online at www',
 'gutenberg',
 'net',
 'title the adventures of sherlock holmes',
 'author arthur conan doyle',
 'release date november ebook',
 'last updated may',
 'language english',
 'character set encoding utf',
 'start of this project gutenberg ebook the adventures',
 'of sherlock holmes',
 'produced by an anonymous project gutenberg volunteer and',
 'jose menendez',
 'cover',
 'the adventures of sherlock holmes',
 'by arthur conan doyle',
 'contents',
 'i',
 'a scandal in bohemia',
 'ii',
 'the redheaded league',
 'iii',
 'a case of identity',
 'iv',
 'the boscombe valley mystery',
 'v',
 'the five orange pips',
 'vi',
 'the man with the twisted lip',
 'vii'

In [5]:
# with open("hamlet.txt","w") as file:
#     file.write(df)

## Data Preprocessing

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

### Loading the Dataset

In [7]:
# with open("hamlet.txt","r") as file:
#     text = file.read().lower()

## Tokenize the text
# Fit a Keras tokenizer on the word chunks to build the word -> integer id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# One more than the number of distinct words in the mapping.
total_words = 1 + len(tokenizer.word_index)
print("vocabulary size: ", total_words)

# Show the learned word -> id mapping.
tokenizer.word_index

vocabulary size:  8341


{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'a': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'you': 10,
 'he': 11,
 'was': 12,
 'his': 13,
 'is': 14,
 'my': 15,
 'have': 16,
 'as': 17,
 'with': 18,
 'had': 19,
 'at': 20,
 'which': 21,
 'for': 22,
 'but': 23,
 'not': 24,
 'me': 25,
 'be': 26,
 'we': 27,
 'there': 28,
 'from': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'holmes': 33,
 'so': 34,
 'him': 35,
 'her': 36,
 'she': 37,
 'very': 38,
 'your': 39,
 'been': 40,
 'no': 41,
 'all': 42,
 'what': 43,
 'on': 44,
 'one': 45,
 'then': 46,
 'were': 47,
 'by': 48,
 'are': 49,
 'an': 50,
 'would': 51,
 'when': 52,
 'out': 53,
 'up': 54,
 'man': 55,
 'could': 56,
 'has': 57,
 'do': 58,
 'into': 59,
 'mr': 60,
 'who': 61,
 'little': 62,
 'will': 63,
 'if': 64,
 'some': 65,
 'now': 66,
 'down': 67,
 'see': 68,
 'should': 69,
 'our': 70,
 'may': 71,
 'or': 72,
 'they': 73,
 'well': 74,
 'am': 75,
 'us': 76,
 'over': 77,
 'more': 78,
 'think': 79,
 'know': 80,
 'shall': 81,
 'can': 82,
 'about':

### Creating Input Sequences

In [8]:
# input_sequences = []
# for line in text.split("\n"):
#     token_list = tokenizer.texts_to_sequences([line])[0]
#     for i in range(1,len(token_list)):
#         n_gram_sequences = token_list[:i+1]
#         input_sequences.append(n_gram_sequences)

In [9]:
# For every sentence, collect each prefix of its token ids that is at least
# three tokens long; these prefixes are the n-gram training sequences.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(sentences)
    for end in range(3, len(token_ids) + 1)
]


In [10]:
# Display the generated n-gram token-id sequences
input_sequences

[[2026, 4408, 1],
 [2026, 4408, 1, 1079],
 [2026, 4408, 1, 1079, 5],
 [2026, 4408, 1, 1079, 5, 124],
 [2026, 4408, 1, 1079, 5, 124, 33],
 [2026, 4408, 1, 1079, 5, 124, 33, 48],
 [526, 2419, 2420],
 [30, 2027, 14],
 [30, 2027, 14, 22],
 [30, 2027, 14, 22, 1],
 [30, 2027, 14, 22, 1, 321],
 [30, 2027, 14, 22, 1, 321, 5],
 [30, 2027, 14, 22, 1, 321, 5, 423],
 [2421, 20, 41],
 [2421, 20, 41, 2028],
 [2421, 20, 41, 2028, 2],
 [2421, 20, 41, 2028, 2, 18],
 [557, 41, 4409],
 [557, 41, 4409, 4410],
 [10, 71, 2422],
 [10, 71, 2422, 9],
 [10, 71, 2422, 9, 207],
 [10, 71, 2422, 9, 207, 9],
 [10, 71, 2422, 9, 207, 9, 118],
 [10, 71, 2422, 9, 207, 9, 118, 72],
 [4411, 9, 266],
 [4411, 9, 266, 1],
 [4411, 9, 266, 1, 2029],
 [4411, 9, 266, 1, 2029, 5],
 [4411, 9, 266, 1, 2029, 5, 1],
 [4411, 9, 266, 1, 2029, 5, 1, 2026],
 [2030, 2423, 3107],
 [18, 30, 2027],
 [18, 30, 2027, 72],
 [18, 30, 2027, 72, 4412],
 [18, 30, 2027, 72, 4412, 20],
 [18, 30, 2027, 72, 4412, 20, 4413],
 [2424, 1, 1079],
 [2424, 1, 

In [11]:
# input_sequences = []
# for line in text.split("\n"):
#     token_list = tokenizer.texts_to_sequences([line])[0]
#     for i in range(0,len(token_list) - 1):
#         for j in range(i+1, len(token_list)):
#             n_gram_sequences = token_list[i:j+1]
#             input_sequences.append(n_gram_sequences)


In [12]:
# Number of n-gram sequences generated
len(input_sequences)

67463

In [13]:
## Padding the sequences
# Length of the longest n-gram sequence.
max_sequence_len = max(map(len, input_sequences))

In [14]:
# Display the length of the longest sequence
max_sequence_len

8

In [15]:
# Pad at the front ("pre") so that every sequence has max_sequence_len entries,
# then keep the result as a NumPy array.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding="pre")
)
input_sequences

array([[   0,    0,    0, ..., 2026, 4408,    1],
       [   0,    0,    0, ..., 4408,    1, 1079],
       [   0,    0,    0, ...,    1, 1079,    5],
       ...,
       [   0, 8340,  107, ...,    8,   37,   57],
       [8340,  107,    3, ...,   37,   57,  290],
       [   0,    0,    0, ...,   18,  516, 1110]], dtype=int32)

In [16]:
## Create predictors and labels
import tensorflow as tf

# Predictors: every column except the last one.
x = input_sequences[:, :-1]
# Labels: the last column (the word to be predicted).
y = input_sequences[:, -1]
x, y

(array([[   0,    0,    0, ...,    0, 2026, 4408],
        [   0,    0,    0, ..., 2026, 4408,    1],
        [   0,    0,    0, ..., 4408,    1, 1079],
        ...,
        [   0, 8340,  107, ...,  316,    8,   37],
        [8340,  107,    3, ...,    8,   37,   57],
        [   0,    0,    0, ...,    0,   18,  516]], dtype=int32),
 array([   1, 1079,    5, ...,   57,  290, 1110], dtype=int32))

In [17]:
from tensorflow.keras.utils import to_categorical

## One-hot encode the label ids into vectors with total_words entries
y = to_categorical(y, num_classes=total_words)
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((53970, 7), (13493, 7), (53970, 8341), (13493, 8341))

In [19]:
## Training the LSTM RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input

## Defining the Model
model = Sequential()
# Declare the input shape with an explicit Input layer rather than the
# Embedding `input_length` argument, which newer Keras releases deprecate.
# Declaring the shape up front also builds the model, so summary() can report
# output shapes and parameter counts.
model.add(Input(shape=(max_sequence_len - 1,)))
# Map each token id to a 200-dimensional vector.
model.add(Embedding(total_words, 200))
# First recurrent layer returns the full sequence so a second LSTM can follow.
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(128))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
# One softmax output per vocabulary entry, matching the one-hot labels.
model.add(Dense(total_words, activation='softmax'))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()



In [20]:
from tensorflow.keras.callbacks import EarlyStopping

# Watch val_loss with a patience of 10 epochs and restore the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

In [None]:
## Training the Model
# The early-stopping callback must be passed to fit() to have any effect;
# without it all 100 epochs run and the best weights are never restored.
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/100
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 92ms/step - accuracy: 0.0558 - loss: 6.8399 - val_accuracy: 0.0591 - val_loss: 6.5453
Epoch 2/100
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 89ms/step - accuracy: 0.0547 - loss: 6.4650 - val_accuracy: 0.0591 - val_loss: 6.4473
Epoch 3/100
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 93ms/step - accuracy: 0.0560 - loss: 6.2809 - val_accuracy: 0.0740 - val_loss: 6.4118
Epoch 4/100
[1m1390/1687[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m25s[0m 86ms/step - accuracy: 0.0689 - loss: 6.1120

In [None]:
def predict(model, tokenizer, text, max_sequence_len):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input row, indexed by token id.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` (id -> word) mapping.
        text: The seed text to continue.
        max_sequence_len: Length of the padded training sequences; the model
            input holds ``max_sequence_len - 1`` tokens.

    Returns:
        The predicted next word, or ``None`` when ``text`` contains no word
        known to the tokenizer or when the predicted id has no word attached
        (for example the padding id 0).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    if not token_list:
        # Nothing to condition on: an all-padding input would give a meaningless guess.
        return None

    # The model input is one token shorter than the training sequences because
    # the last token of each sequence was used as the label.
    context_len = max_sequence_len - 1
    # Keep only the most recent tokens when the text is too long, and pad at
    # the front when it is too short.
    input_sequence = pad_sequences(
        [token_list],
        maxlen=context_len,
        padding="pre",
        truncating="pre",
    )
    pred = model.predict(input_sequence, verbose=0)
    # Reduce the one-row argmax result to a plain int for the dictionary lookup.
    predicted_index = int(np.argmax(pred, axis=1)[0])
    return tokenizer.index_word.get(predicted_index)

In [None]:
# Use a dedicated name instead of `input`, which would shadow the built-in
# input() function for the rest of the kernel session.
seed_text = "I am very"
predict(model, tokenizer, seed_text, max_sequence_len)