In [None]:
import regex as re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
from tensorflow.keras.utils import pad_sequences, to_categorical

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def remove_unwanted_characters(txt: str) -> str:
  """Strip layout characters, curly quotes and Roman-numeral headings from text.

  Args:
    txt: Raw text to clean.

  Returns:
    The cleaned text, with:
      * every run of newlines, tabs and carriage returns collapsed to a single
        space (so words separated only by a tab are not fused together);
      * the curly quote characters “ ” ‘ ’ removed;
      * Roman-numeral headings ``I.`` to ``XII.`` removed.
  """
  whitespace_str = r"[\t\r\n]+"
  quote_str = r"[“”‘’]"
  # The trailing dot applies to *every* alternative and the numeral must start
  # at a word boundary, so capitals inside ordinary words ("IVORY", "SIX.")
  # are left alone and "IV." / "IX." headings lose their dot as well.
  # NOTE(review): a sentence that ends with the pronoun "I." still matches,
  # exactly as it did before - confirm whether that matters for this corpus.
  roman_numeral_str = r"\b(?:XI{0,2}|VI{0,3}|IV|IX|I{1,3})\."

  txt = re.sub(whitespace_str, " ", txt)
  txt = re.sub(quote_str, "", txt)
  txt = re.sub(roman_numeral_str, "", txt)

  return txt

In [None]:
def file_to_sentences(FILE_PATH, encoding="utf-8") -> list:
  """Read a text file, clean it and split it into sentences.

  Args:
    FILE_PATH: Path of the text file to read.
    encoding: Text encoding used to decode the file. Given explicitly because
      the cleaning step looks for non-ASCII characters (curly quotes), and
      the default encoding of ``open`` depends on the platform locale.

  Returns:
    A list of sentence strings produced by ``sent_tokenize`` from the text
    returned by ``remove_unwanted_characters``.
  """
  with open(FILE_PATH, "r", encoding=encoding) as file:
    raw_text = file.read()

  # The file handle is no longer needed once the text is in memory
  txt = remove_unwanted_characters(raw_text)
  return sent_tokenize(txt)


In [None]:
# Location of the corpus, given relative to the notebook's working directory
FILE_PATH = "../data/sherlock_holmes_text.txt"

# Read the corpus, clean it and split it into a list of sentence strings
sentences = file_to_sentences(FILE_PATH)

In [None]:
# The first four sentences are the preface, so they are dropped.
# Run this cell once only: `sentences` is rebound, so every extra run
# drops four more sentences.
PREFACE_SENTENCE_COUNT = 4
sentences = sentences[PREFACE_SENTENCE_COUNT:]

In [None]:
# Turn every sentence string into a list of word / punctuation tokens.
# Run once only: `sentences` is rebound from strings to token lists.
sentences = list(map(word_tokenize, sentences))
# Show one tokenised sentence as a sanity check
sentences[1]

['In',
 'his',
 'eyes',
 'she',
 'eclipses',
 'and',
 'predominates',
 'the',
 'whole',
 'of',
 'her',
 'sex',
 '.']

In [None]:
# Flatten the tokenised sentences into one long list of tokens
all_words = [word for sentence in sentences for word in sentence]
# Unique tokens in sorted order. Iterating a bare set of strings can give a
# different order on every interpreter run, so indices assigned by enumerating
# it would not line up with a model trained and saved in an earlier session.
# Sorting makes the order - and any word/index mapping built from it -
# reproducible.
vocabulary = sorted(set(all_words))

In [None]:
# Demonstration of enumerate: it pairs each item with a running counter,
# here starting at 1. These (idx, word) pairs are what the word:idx mapping
# is built from. Only the first ten pairs are shown.
list(enumerate(vocabulary, start=1))[:10]

[(1, 'greater'),
 (2, 'borders'),
 (3, 'averse'),
 (4, 'obtaining'),
 (5, 'even'),
 (6, 'exit'),
 (7, 'Maggie'),
 (8, 'servants—a'),
 (9, 'personate'),
 (10, 'aunt')]

In [None]:
# enumerate yields (idx, word) pairs, which dict() turns straight into the
# idx -> word lookup used to interpret the results of the model later.
# Counting starts at 1, not 0, because 0 is reserved for the padding token.
idx_to_word = dict(enumerate(vocabulary, start=1))
# Invert it to get the word -> idx mapping used to encode text
word_to_idx = {word: idx for idx, word in idx_to_word.items()}
# One extra slot on top of the real words, for the padding index 0
vocab_size = len(vocabulary) + 1

In [None]:
# Build the training sequences: for each sentence, every prefix of length 2
# up to the full sentence length, expressed as word indices.
input_sequences = []
for sentence in sentences:
  # Numerical form of the sentence via the word_to_idx mapping
  encoded = [word_to_idx[token] for token in sentence]
  # Prefixes encoded[:2], encoded[:3], ..., encoded[:len(encoded)]
  input_sequences.extend(
    encoded[:end] for end in range(2, len(encoded) + 1)
  )

In [None]:
# Peek at a few consecutive sequences: each one extends the previous by one index
input_sequences[5:10]

[[6289, 5601, 7877, 2611, 7934, 2018, 8284],
 [6289, 5601, 7877, 2611, 7934, 2018, 8284, 6925],
 [6289, 5601, 7877, 2611, 7934, 2018, 8284, 6925, 3097],
 [6289, 5601, 7877, 2611, 7934, 2018, 8284, 6925, 3097, 8863],
 [6289, 5601, 7877, 2611, 7934, 2018, 8284, 6925, 3097, 8863, 2940]]

In [None]:
# All sequences must share one length, so use the longest one as the target
max_sequence_len = max(map(len, input_sequences))
# Shorter sequences are filled with 0s on the left ('pre') up to
# max_sequence_len. Index 0 is free for this because the word indices start
# at 1. Left padding keeps the label (the final element) in the last position;
# padding on the right would move it.
input_sequences = pad_sequences(
  input_sequences,
  maxlen=max_sequence_len,
  padding='pre',
)

In [None]:
# input_sequences is the 2-D array returned by pad_sequences, so slice it by
# column. This keeps X and y as NumPy arrays; a Python list of row arrays
# would be read by Keras as a list of separate model inputs.
X = input_sequences[:, :-1]  # every position except the last: the context
y = input_sequences[:, -1]   # last position: the word to predict
# One-hot encode the labels to match the softmax output over the vocabulary.
# NOTE(review): this array has one row per sequence and vocab_size columns;
# if memory becomes a problem, integer labels with a sparse loss avoid it.
y = to_categorical(y, num_classes=vocab_size)

## Building the Model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, SpatialDropout1D, GaussianNoise
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

In [None]:
# Next-word model: embedding -> regularisation -> one LSTM -> softmax over
# the vocabulary. Layers are applied in the order listed.
model = Sequential([
    # Learn a 400-dimensional vector per vocabulary index; each input is a
    # sequence of max_sequence_len - 1 indices (the label column is excluded)
    Embedding(vocab_size, 400, input_length=max_sequence_len-1),
    # Regularise the embedded sequence
    SpatialDropout1D(0.25),
    GaussianNoise(0.1),

    # Single recurrent layer; only its final output is passed on
    LSTM(512, dropout=0.25, recurrent_dropout=0.25),
    BatchNormalization(),
    Dropout(0.3),

    # One probability per vocabulary index (index 0 is the padding slot)
    Dense(vocab_size, activation='softmax'),
])

In [None]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# or rejected by current Keras optimizers. clipnorm caps the gradient norm
# at 1.0.
# categorical_crossentropy matches the one-hot labels built with to_categorical.
model.compile(
    optimizer=Adam(learning_rate=0.01, clipnorm=1.0),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 122, 400)          3785600   
                                                                 
 spatial_dropout1d (Spatial  (None, 122, 400)          0         
 Dropout1D)                                                      
                                                                 
 gaussian_noise (GaussianNo  (None, 122, 400)          0         
 ise)                                                            
                                                                 
 lstm (LSTM)                 (None, 512)               1869824   
                                                                 
 batch_normalization (Batch  (None, 512)               2048      
 Normalization)                                                  
                                                        

In [None]:
# Stop training once the validation loss has stopped improving for 20 epochs,
# and put back the weights from the best epoch seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    verbose=1,
)
# OPTIONAL: reduce the learning rate when the model stops improving, which can
# help gradient descent get out of local minima. ReduceLROnPlateau must be
# imported from tensorflow.keras.callbacks before enabling this line.
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.001, verbose=1)

In [None]:
# Train the model, holding out 20% of the samples for validation.
# early_stop may end the run before the 200-epoch ceiling is reached.
# Adjust epochs and batch_size as necessary.
history = model.fit(
    X,
    y,
    validation_split=0.2,
    epochs=200,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Persist the trained model to disk so it can be reloaded without retraining
model.save('../exports/sherlock_model.h5')

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# Reload the saved model; this rebinds `model` to the copy read from disk
model = load_model('../exports/sherlock_model.h5')

In [None]:
# Print the layer summary of the reloaded model to compare with the one built above
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 122, 400)          3785600   
                                                                 
 spatial_dropout1d (Spatial  (None, 122, 400)          0         
 Dropout1D)                                                      
                                                                 
 gaussian_noise (GaussianNo  (None, 122, 400)          0         
 ise)                                                            
                                                                 
 lstm (LSTM)                 (None, 512)               1869824   
                                                                 
 batch_normalization (Batch  (None, 512)               2048      
 Normalization)                                                  
                                                        

In [None]:
import numpy as np

def predict_next_word(model, text, max_sequence_len, word_to_index, index_to_word):
    """
    Return the model's single most likely next word for ``text``.

    Args:
    - model (tf.keras.Model): Trained model for prediction.
    - text (str): Input string.
    - max_sequence_len (int): Maximum length of input sequences; the model
      input is padded to ``max_sequence_len - 1`` positions.
    - word_to_index (dict): Mapping from words to their respective indices.
    - index_to_word (dict): Mapping from indices to their respective words.

    Returns:
    - str: Predicted word, or '' when the predicted index has no entry in
      ``index_to_word``.
    """

    # Encode the prompt; tokens missing from the vocabulary are skipped
    known_tokens = (tok for tok in word_tokenize(text) if tok in word_to_index)
    encoded = [word_to_index[tok] for tok in known_tokens]

    # Build a one-row batch, left-padded to the input length
    model_input = pad_sequences([encoded], maxlen=max_sequence_len-1, padding='pre')

    # Score every vocabulary index and keep the highest-scoring one per row
    scores = model.predict(model_input)
    best_indices = np.argmax(scores, axis=-1)

    # Map the winning index of the only row back to its word
    return index_to_word.get(best_indices[0], '')

In [None]:
def predict_next_n_words(model, text, n, max_sequence_len, word_to_index, index_to_word):
    """
    Predict up to the next n words based on the input text.

    Each predicted word is appended to the text before the next prediction,
    so later predictions are conditioned on earlier ones.

    Args:
    - model (tf.keras.Model): Trained model for prediction.
    - text (str): Input string.
    - n (int): Maximum number of words to predict.
    - max_sequence_len (int): Maximum length of input sequences.
    - word_to_index (dict): Mapping from words to their respective indices.
    - index_to_word (dict): Mapping from indices to their respective words.

    Returns:
    - str: Predicted words joined by single spaces. Shorter than n words when
      a step yields no word; '' when n <= 0.
    """

    predicted_sequence = []

    for _ in range(n):
        # Single-step prediction lives in predict_next_word; reuse it rather
        # than keeping a second copy of the same logic here
        predicted_word = predict_next_word(
            model, text, max_sequence_len, word_to_index, index_to_word
        )

        if not predicted_word:
            # An empty prediction adds nothing to the context, so the next
            # step would see the same tokens and predict the same thing
            # again. Stop here instead of emitting blank entries.
            break

        # Append the predicted word to the sequence and to the text (for the next iteration)
        predicted_sequence.append(predicted_word)
        text += " " + predicted_word

    return ' '.join(predicted_sequence)

In [None]:
# Try the model on a short prompt and print the prompt with the predicted word appended
input_text = "Sherlock said the biggest problem"
prediction = predict_next_word(
    model, input_text, max_sequence_len, word_to_idx, idx_to_word
)
print(f"{input_text} {prediction}")

Sherlock said the biggest problem bequeathed
