In [352]:
import numpy as np
import pandas as pd
# Load the whole corpus into memory as one string; later cells fit the
# tokenizer on it and split it on newlines to build training sequences.
with open('sherlock_holmes_small.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [353]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [354]:
# Keras tokenizer with default settings; it maps each word to an integer id.
tokenizer = Tokenizer()

In [355]:
# Learn the vocabulary from the corpus; the full text is passed as a single document.
tokenizer.fit_on_texts([text])

In [356]:
# Inspect the learned vocabulary: a dict mapping each word to its integer index (starting at 1).
tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'his': 4,
 'to': 5,
 'a': 6,
 'in': 7,
 'i': 8,
 'was': 9,
 'he': 10,
 'for': 11,
 'own': 12,
 'which': 13,
 'holmes': 14,
 'that': 15,
 'all': 16,
 'but': 17,
 'as': 18,
 'from': 19,
 'had': 20,
 'my': 21,
 'woman': 22,
 'one': 23,
 'were': 24,
 'with': 25,
 'up': 26,
 'sherlock': 27,
 'she': 28,
 'have': 29,
 'heard': 30,
 'him': 31,
 'her': 32,
 'any': 33,
 'other': 34,
 'whole': 35,
 'it': 36,
 'not': 37,
 'emotion': 38,
 'irene': 39,
 'adler': 40,
 'seen': 41,
 'would': 42,
 'himself': 43,
 'such': 44,
 'nature': 45,
 'little': 46,
 'who': 47,
 'week': 48,
 'by': 49,
 'those': 50,
 'clearing': 51,
 'time': 52,
 'adventures': 53,
 'is': 54,
 'always': 55,
 'seldom': 56,
 'mention': 57,
 'under': 58,
 'name': 59,
 'eyes': 60,
 'eclipses': 61,
 'predominates': 62,
 'sex': 63,
 'felt': 64,
 'akin': 65,
 'love': 66,
 'emotions': 67,
 'particularly': 68,
 'abhorrent': 69,
 'cold': 70,
 'precise': 71,
 'admirably': 72,
 'balanced': 73,
 'mind': 74,
 'take

In [357]:
# Number of distinct words in the learned vocabulary.
len(tokenizer.word_index)

245

In [358]:
# Build the training samples: for every line of the corpus, emit each prefix
# of its token-id sequence that has at least two tokens. The last token of a
# prefix is later used as the prediction target, the rest as the input.
input_sequence = [
    token_ids[:end]
    for line in text.split('\n')
    # texts_to_sequences returns a one-element list for a one-element input
    for token_ids in tokenizer.texts_to_sequences([line])
    for end in range(2, len(token_ids) + 1)
]

In [359]:
# Inspect the generated prefix sequences (each is a list of token ids).
input_sequence

[[1, 53],
 [1, 53, 2],
 [1, 53, 2, 27],
 [1, 53, 2, 27, 14],
 [5, 27],
 [5, 27, 14],
 [5, 27, 14, 28],
 [5, 27, 14, 28, 54],
 [5, 27, 14, 28, 54, 55],
 [5, 27, 14, 28, 54, 55, 1],
 [5, 27, 14, 28, 54, 55, 1, 22],
 [5, 27, 14, 28, 54, 55, 1, 22, 8],
 [5, 27, 14, 28, 54, 55, 1, 22, 8, 29],
 [5, 27, 14, 28, 54, 55, 1, 22, 8, 29, 56],
 [5, 27, 14, 28, 54, 55, 1, 22, 8, 29, 56, 30],
 [5, 27, 14, 28, 54, 55, 1, 22, 8, 29, 56, 30, 31],
 [57, 32],
 [57, 32, 58],
 [57, 32, 58, 33],
 [57, 32, 58, 33, 34],
 [57, 32, 58, 33, 34, 59],
 [57, 32, 58, 33, 34, 59, 7],
 [57, 32, 58, 33, 34, 59, 7, 4],
 [57, 32, 58, 33, 34, 59, 7, 4, 60],
 [57, 32, 58, 33, 34, 59, 7, 4, 60, 28],
 [57, 32, 58, 33, 34, 59, 7, 4, 60, 28, 61],
 [57, 32, 58, 33, 34, 59, 7, 4, 60, 28, 61, 3],
 [62, 1],
 [62, 1, 35],
 [62, 1, 35, 2],
 [62, 1, 35, 2, 32],
 [62, 1, 35, 2, 32, 63],
 [62, 1, 35, 2, 32, 63, 36],
 [62, 1, 35, 2, 32, 63, 36, 9],
 [62, 1, 35, 2, 32, 63, 36, 9, 37],
 [62, 1, 35, 2, 32, 63, 36, 9, 37, 15],
 [62, 1, 35, 2

The sequences do not all have the same length, so we apply zero padding to bring them to a common length.

In [360]:
# Length of the longest prefix sequence; every sequence is padded to this length.
max_len = max(map(len, input_sequence))

In [361]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Left-pad every sequence with zeros up to max_len so the real tokens
# (and in particular the target word) sit at the end of each row.
padded_input_sequences = pad_sequences(input_sequence,maxlen = max_len,padding='pre')

In [362]:
# Inspect the padded matrix; shorter sequences are filled with leading zeros.
padded_input_sequences

array([[  0,   0,   0, ...,   0,   1,  53],
       [  0,   0,   0, ...,   1,  53,   2],
       [  0,   0,   0, ...,  53,   2,  27],
       ...,
       [  0,  25,  16, ...,   2,  21, 243],
       [  0,   0,   0, ...,   0, 244,   3],
       [  0,   0,   0, ..., 244,   3, 245]], dtype=int32)

In [363]:
# Split each padded row into the model input (every token except the last)
# and the target (the final token, i.e. the word to be predicted).
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]

In [364]:
# Number of output classes: vocabulary size + 1, because word indices start
# at 1 and index 0 is the padding value.
total_words = len(tokenizer.word_index)+1


In [365]:
# One-hot encode the target word ids into a (num_samples, total_words) matrix.
# The labels are taken from the padded matrix rather than from `y` itself so
# that re-running this cell is idempotent (encoding an already one-hot `y`
# would silently produce a 3-D array). to_categorical already returns a NumPy
# array, so no extra np.array(...) wrap is needed.
y = tf.keras.utils.to_categorical(padded_input_sequences[:, -1], num_classes=total_words)

In [366]:
# Sanity check: expect (number of samples, total_words).
y.shape

(397, 246)

In [367]:
# Inspect the one-hot encoded targets.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [368]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [371]:
# Next-word prediction network: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    # Each sample is a row of max_len - 1 token ids (the columns of X).
    # Declaring the shape with an explicit Input layer replaces Embedding's
    # `input_length` argument, which is deprecated in Keras 3.
    tf.keras.Input(shape=(max_len - 1,)),
    # Learn a 100-dimensional vector for each of the total_words token ids.
    Embedding(total_words, 100),
    # Summarise the embedded sequence with a 150-unit LSTM.
    LSTM(150),
    # One probability per vocabulary index (index 0 is the padding slot).
    Dense(total_words, activation='softmax'),
])

In [373]:
# Targets are one-hot vectors and the output layer is a softmax, hence
# categorical cross-entropy; accuracy is tracked as a readable metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the recorded log below already shows ~96% accuracy in epoch 1,
# which suggests this cell was re-run on an already-trained model. Re-create
# the model before fitting to get a clean run - TODO confirm.
model.fit(X, y, epochs=50, verbose=1)

Epoch 1/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.9643 - loss: 0.6496
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9737 - loss: 0.5754 
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.9890 - loss: 0.5018 
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9829 - loss: 0.4669
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.9815 - loss: 0.4505
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.9822 - loss: 0.4041
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.9816 - loss: 0.3890 
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.9731 - loss: 0.3671 
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1de9bf48830>

In [374]:
# Print the model's layer-by-layer summary.
model.summary()

In [376]:
# Greedy text generation: repeatedly feed the growing text back into the model
# and append the single most probable next word.
ip = 'Mr. Holmes'
for i in range(10):
    # tokenize the current text
    token_text = tokenizer.texts_to_sequences([ip])[0]
    # pad to the input length the model was trained on (the width of X),
    # derived from max_len instead of a hard-coded number
    padded_token_text = pad_sequences([token_text], maxlen=max_len - 1, padding='pre')
    # predict probabilities for every vocabulary index
    pred_probs = model.predict(padded_token_text, verbose=0)[0]
    # choose next word using np.argmax (or sampling)
    next_index = int(np.argmax(pred_probs))

    # convert index to word via the tokenizer's reverse mapping (no linear scan)
    word = tokenizer.index_word.get(next_index)
    if word is None:
        # Index 0 is the padding slot and has no word. Appending nothing would
        # leave the input unchanged, so every remaining step would repeat the
        # same prediction - stop instead.
        break
    ip += " " + word
    print(ip)


Mr. Holmes placed
Mr. Holmes placed himself
Mr. Holmes placed himself in
Mr. Holmes placed himself in a
Mr. Holmes placed himself in a nature
Mr. Holmes placed himself in a nature such
Mr. Holmes placed himself in a nature such as
Mr. Holmes placed himself in a nature such as his
Mr. Holmes placed himself in a nature such as his activity
Mr. Holmes placed himself in a nature such as his activity however
