In [2]:
# Import list

import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import ReduceLROnPlateau
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data 

In [3]:
# Different Editors paths
PATH_A = "data\sherlock-holm.es_stories_plain-text_advs.txt"
PATH_E = ""
PATH_G = "0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"
PATH_J = ""
PATH_M = "Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"

PATHS = [PATH_A, PATH_E, PATH_G, PATH_J, PATH_M]

text = ""

for path in PATHS:
    try:
        # Read the text file
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
            print(path)
    except:
        continue
    else:
        break



print(text[:200])

0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt




                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohem


## Data Preprocess

### Divide the set

In [4]:
# Divide with regular expressions
DIVIDERS_ORIGINAL = "\n"
DIVIDERS_ALL = "[,.!?:;\"]|\n\n|--| and | that | which "
DIVIDERS_MIN = "[.!]|\n\n"
DIVIDERS_BAL = "[,.!?]|\n\n|--"
CLEAR_COVER = False

text_try = text.lower()

if CLEAR_COVER:
    # Delete cover of book and extra information
    text_try = text[980:-550]
    

# Split following the dividers given
text_try = re.split(DIVIDERS_ORIGINAL, text_try)

# Delete all the new line comments 
text_try = [el.replace('\n', '') for el in text_try]

text_try[:10]

['',
 '',
 '',
 '',
 '                        the adventures of sherlock holmes',
 '',
 '                               arthur conan doyle',
 '',
 '',
 '']

### Tokenization

In [5]:
# Create Tokenizer object in python
CLEAR_COVER
tokens = word_tokenize(text)
vocabulary = set(tokens)
total_words = len(vocabulary) + 1

word_to_idx = {word:idx for idx, word in enumerate(vocabulary)}

print(f"total_words: {total_words}")
print(f"Índice de palabras: {word_to_idx}")

total_words: 8965


In [6]:
# Create input-output sequences
input_sequences = []
for line in text_try:
    line_list = line.rstrip(",.;:").split(' ')

    # Tokenize each sentence
    token_list = []
    for char in line_list:
        if char in word_to_idx.keys():
            token_list.append(word_to_idx[char])

    # Divide the different sentences in n-grams
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences

[[4100, 7925],
 [4100, 7925, 1579],
 [7799, 1579],
 [7799, 1579, 3124],
 [8877, 6979],
 [8877, 6979, 3390],
 [4100, 6341],
 [4100, 6341, 7767],
 [8877, 3572],
 [8877, 3572, 1579],
 [8877, 3572, 1579, 383],
 [4100, 5715],
 [4100, 8230],
 [4100, 8230, 1184],
 [4100, 8230, 1184, 714],
 [4100, 6232],
 [4100, 6232, 6434],
 [4100, 6232, 6434, 4100],
 [4100, 6232, 6434, 4100, 942],
 [4100, 6232, 6434, 4100, 942, 7232],
 [4100, 4537],
 [4100, 4537, 1579],
 [4100, 4537, 1579, 4100],
 [4100, 4537, 1579, 4100, 4236],
 [4100, 4537, 1579, 4100, 4236, 6645],
 [4100, 4537],
 [4100, 4537, 1579],
 [4100, 4537, 1579, 4100],
 [4100, 4537, 1579, 4100, 7656],
 [4100, 4537, 1579, 4100, 7656, 3934],
 [4100, 4537],
 [4100, 4537, 1579],
 [4100, 4537, 1579, 4100],
 [4100, 4537, 1579, 4100, 4798],
 [4100, 4537],
 [4100, 4537, 1579],
 [4100, 4537, 1579, 4100],
 [4100, 4537, 1579, 4100, 4359],
 [4100, 4537, 1579, 4100, 4359, 7806],
 [4100, 4537],
 [4100, 4537, 1579],
 [4100, 4537, 1579, 4100],
 [4100, 4537, 1579, 

### Padding

In [7]:
# Get the max value to add padding to other entries

average = 0
for seq in input_sequences:
    average += len(seq) 

max_sequence_len, value = max([(len(seq), seq) for seq in input_sequences])
input_seq_pad = np.array([np.pad(seq, (max_sequence_len - len(seq), 0), mode='constant') for seq in input_sequences])

print (f"average = {average / len(input_sequences)}")
print (f"Max seq length = {max_sequence_len}")
input_seq_pad

average = 6.213641593749164
Max seq length = 16


array([[   0,    0,    0, ...,    0, 4100, 7925],
       [   0,    0,    0, ..., 4100, 7925, 1579],
       [   0,    0,    0, ...,    0, 7799, 1579],
       ...,
       [   0,    0,    0, ..., 6553, 2428, 4100],
       [   0,    0,    0, ..., 2428, 4100, 8779],
       [   0,    0,    0, ..., 4100, 8779, 4116]])

## Model Train

### X and Y separation

In [8]:
# Split the sequences into input (X) and output (y)
X = input_seq_pad[:, :-1]
y = input_seq_pad[:, -1]

# Convert output to one-hot encoded vectors
y = np.array(torch.nn.functional.one_hot(torch.tensor(y), num_classes=total_words))

print (X)
y

[[   0    0    0 ...    0    0 4100]
 [   0    0    0 ...    0 4100 7925]
 [   0    0    0 ...    0    0 7799]
 ...
 [   0    0    0 ... 5995 6553 2428]
 [   0    0    0 ... 6553 2428 4100]
 [   0    0    0 ... 2428 4100 8779]]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float)
        

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

## MODELS

### Multiple LSTM layers

In [13]:
# Define the model
class NextWordPredictor(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.2, temperature=1):
    super(NextWordPredictor, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.dropout_embed = nn.Dropout(dropout)  # Add dropout after embedding
    self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, dropout=dropout, bidirectional=True, batch_first=True)  # Use Bidirectional LSTM with multiple layers
    self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Adjust output size for Bidirectional LSTM
    self.softmax = nn.Softmax(dim=1)  # Add a softmax layer
    self.T = temperature

  def forward(self, sequences):
    embedded = self.embedding(sequences)
    embedded = self.dropout_embed(embedded)
    lstm_out, _ = self.lstm(embedded)
    last_hidden = lstm_out[:, -1, :]  # Select last hidden state from the sequence
    logits = self.fc(last_hidden)
    logits = self.softmax(logits/self.T)  # Apply softmax
    return logits



### Simple LSTM

In [9]:
# Train the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, temperature=1):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)  # Add a softmax layer
        self.T = temperature

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        x = self.softmax(x/self.T)  # Apply softmax
        return x

### Model creation


In [14]:
model = NextWordPredictor(vocab_size = total_words, 
embed_dim = 100, hidden_dim = 150, output_dim = total_words, temperature=3)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


## TRAINING

In [15]:
SAVE_MODEL = True  #If set to True, saves a torch_model.pt 
SAVED_MODEL_NAME = '[name].pt'

epochs = 200
patience = 3  # Number of epochs to wait for improvement
current_patience = patience
best_loss = float('inf')  # Initialize best loss to a very high value
better_model = model
for epoch in range(epochs):
  # Training loop
  for i, (inputs, labels) in enumerate(dataloader):
    outputs = model(inputs)
    loss = criterion(outputs, labels.argmax(dim=1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Early stopping
  if loss.item() < best_loss:  # Compare current training loss with best loss
    best_loss = loss.item()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()} (Improved)')
    better_model = model
    current_patience = patience  # Restart patience
  else:
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')
    current_patience -= 1  # Decrement patience counter on no improvement

  # Stop training if patience is 0
  if current_patience == 0:
    print('Early stopping triggered!')
    break



#Saving model to .pt
if SAVE_MODEL == True:
  torch.save(better_model, SAVED_MODEL_NAME)



Epoch 1/200, Loss: 8.990187644958496 (Improved)
Epoch 2/200, Loss: 9.027204513549805


### Load model

In [23]:
# If model loaded do not run training
MODEL_PATH = 'best_model.pt'

better_model = torch.load(MODEL_PATH) 



## PREDICTION

In [28]:
# Initial text to predict
seed_text = "I am"
next_words = 10

# Index to word
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Generate the n next words
better_model.eval()  # Set the model to evaluation
for _ in range(next_words):
    tokens = word_tokenize(seed_text)
    token_list = [word_to_idx[word] for word in tokens if word in word_to_idx]
    token_list = np.pad(token_list, (max_sequence_len - len(token_list), 0), mode='constant')
    token_list = torch.tensor(token_list[-max_sequence_len:], dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        predicted = better_model(token_list).argmax(dim=1).item()

    output_word = idx_to_word[predicted]
    seed_text += " " + output_word


print(seed_text)

I am runing with the so that it may have not been but the
