In [1]:
# Load the raw text and keep every non-blank line as one corpus entry.
# Note: entries are lines of the file, which are not necessarily whole sentences.
with open("data/Sherlock_Holmes.txt", "r", encoding="utf-8") as f:
    # str.strip returns "" (falsy) for whitespace-only lines, so filter() drops them.
    corpus = list(filter(str.strip, f.read().splitlines()))

In [2]:
import keras
from tensorflow.keras.preprocessing.text import Tokenizer # Importing the Tokenizer class from Keras

In [3]:
# Create a Keras Tokenizer with all-default settings (no arguments are passed).
# It holds no vocabulary yet; that is built when fit_on_texts is called on the corpus.
tokenizer = Tokenizer()

In [4]:
# Build the tokenizer's vocabulary from every line in `corpus`. This creates
# `tokenizer.word_index`, which later cells rely on both to encode text as
# integer ids and to compute the vocabulary size.
tokenizer.fit_on_texts(corpus)

In [5]:
# Build the next-word training sequences.
# For each corpus line encoded as token ids [t1, t2, ..., tn], emit every prefix
# of length >= 2: [t1, t2], [t1, t2, t3], ..., [t1, ..., tn]. The last id of each
# prefix later becomes the label and the ids before it become the features.
# Lines that encode to fewer than two tokens contribute nothing.
input_sequences = []

# Encode the whole corpus with one call rather than wrapping each line in a
# one-element list and indexing [0] on every iteration.
for tokenized_sentence in tokenizer.texts_to_sequences(corpus):
    input_sequences.extend(
        tokenized_sentence[:end] for end in range(2, len(tokenized_sentence) + 1)
    )

In [6]:
# Length of the longest n-gram sequence; every sequence is padded to this length.
# Raises ValueError if `input_sequences` is empty.
max_len = max(map(len, input_sequences))

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every sequence to `max_len` by padding on the left ("pre"), so the real
# tokens stay right-aligned and the final column always holds the last token
# of each sequence. The pad value is left at the library default.
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding="pre",
)

In [8]:
# Separate each padded sequence into model input and target:
#   X -> every column except the last (the context tokens)
#   y -> the last column only (the token to predict)
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]

In [9]:
from tensorflow.keras.utils import to_categorical

# Number of output classes: one per entry in the tokenizer's word index, plus
# one extra slot. NOTE(review): the +1 presumably accounts for id 0, which the
# Keras Tokenizer does not assign to any word and which padding uses - confirm
# against the Keras Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1

# One-hot encode the labels. They are taken directly from the last column of
# `padded_input_sequences` rather than from the existing `y`, so re-running this
# cell always starts from integer ids and never re-encodes an already one-hot `y`.
# NOTE(review): the result is a dense (num_samples, vocab_size) array; if memory
# becomes a problem, integer labels with a sparse categorical loss avoid it, but
# that needs a matching change where the model is compiled (not visible here).
y = to_categorical(padded_input_sequences[:, -1], num_classes=vocab_size)