### Pip install

In [None]:
pip install torchtext

In [None]:
pip install torch torchvision

In [None]:
pip install nltk

In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

### Import

In [1]:
# Import list

import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
# from sklearn.preprocessing import LabelBinarizer
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import ReduceLROnPlateau
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data 

In [27]:
# Different Editors paths
PATH_A = "data\sherlock-holm.es_stories_plain-text_advs.txt"
PATH_E = ""
PATH_G = "0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"
PATH_J = ""
PATH_M = "Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt"

PATHS = [PATH_A, PATH_E, PATH_G, PATH_J, PATH_M]

text = ""

for path in PATHS:
    try:
        # Read the text file
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
            print(path)
    except:
        continue
    else:
        break



print(text[:200])

data\sherlock-holm.es_stories_plain-text_advs.txt




                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohem


## Data Preprocess

### Divide the set

In [100]:
# Divide with regular expressions
DIVIDERS_ORIGINAL = "\n"
DIVIDERS_ALL = "[,.!?:;\"]|\n\n|--| and | that | which "
DIVIDERS_MIN = "[.!]|\n\n"
DIVIDERS_BAL = "[,.!?]|\n\n|--"
CLEAR_COVER = False

text_try = text.lower()

if CLEAR_COVER:
    # Delete cover of book and extra information
    text_try = text[980:-550]
    

# Split following the dividers given
text_try = re.split(DIVIDERS_MIN, text_try)

# Delete all the new line comments 
text_try = [el.replace('\n', '') for el in text_try]

text_try[:10]

['',
 '',
 '                        the adventures of sherlock holmes',
 '                               arthur conan doyle',
 '',
 '                                table of contents',
 "               a scandal in bohemia               the red-headed league               a case of identity               the boscombe valley mystery               the five orange pips               the man with the twisted lip               the adventure of the blue carbuncle               the adventure of the speckled band               the adventure of the engineer's thumb               the adventure of the noble bachelor               the adventure of the beryl coronet               the adventure of the copper beeches",
 '',
 '',
 '']

### Tokenization

In [101]:
# Create Tokenizer object in python
CLEAR_COVER
tokens = word_tokenize(text)
vocabulary = set(tokens)
total_words = len(vocabulary) + 1

word_to_idx = {word:idx for idx, word in enumerate(vocabulary)}

print(f"total_words: {total_words}")
print(f"Índice de palabras: {word_to_idx}")

total_words: 8965


In [103]:
# Create input-output sequences
input_sequences = []
for line in text_try:
    line_list = line.rstrip(",.;:").split(' ')

    # Tokenize each sentence
    token_list = []
    for char in line_list:
        if char in word_to_idx.keys():
            token_list.append(word_to_idx[char])

    # Divide the different sentences in n-grams
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences


[[1693, 8438],
 [1693, 8438, 4756],
 [6394, 4756],
 [6394, 4756, 2461],
 [1568, 7284],
 [1568, 7284, 8280],
 [1568, 7284, 8280, 1693],
 [1568, 7284, 8280, 1693, 4619],
 [1568, 7284, 8280, 1693, 4619, 5130],
 [1568, 7284, 8280, 1693, 4619, 5130, 1568],
 [1568, 7284, 8280, 1693, 4619, 5130, 1568, 5788],
 [1568, 7284, 8280, 1693, 4619, 5130, 1568, 5788, 4756],
 [1568, 7284, 8280, 1693, 4619, 5130, 1568, 5788, 4756, 3102],
 [1568, 7284, 8280, 1693, 4619, 5130, 1568, 5788, 4756, 3102, 1693],
 [1568, 7284, 8280, 1693, 4619, 5130, 1568, 5788, 4756, 3102, 1693, 7989],
 [1568,
  7284,
  8280,
  1693,
  4619,
  5130,
  1568,
  5788,
  4756,
  3102,
  1693,
  7989,
  1693],
 [1568,
  7284,
  8280,
  1693,
  4619,
  5130,
  1568,
  5788,
  4756,
  3102,
  1693,
  7989,
  1693,
  5523],
 [1568,
  7284,
  8280,
  1693,
  4619,
  5130,
  1568,
  5788,
  4756,
  3102,
  1693,
  7989,
  1693,
  5523,
  4081],
 [1568,
  7284,
  8280,
  1693,
  4619,
  5130,
  1568,
  5788,
  4756,
  3102,
  1693,
  7989

In [6]:
import torch
torch.cuda.is_available()



True

### Padding

In [104]:
# Get the max value to add padding to other entries

average = 0
for seq in input_sequences:
    average += len(seq) 

max_sequence_len, value = max([(len(seq), seq) for seq in input_sequences])
input_seq_pad = np.array([np.pad(seq, (max_sequence_len - len(seq), 0), mode='constant') for seq in input_sequences])

print (f"average = {average / len(input_sequences)}")
print (f"Max seq length = {max_sequence_len}")
input_seq_pad

average = 12.365409113179814
Max seq length = 89


array([[   0,    0,    0, ...,    0, 1693, 8438],
       [   0,    0,    0, ..., 1693, 8438, 4756],
       [   0,    0,    0, ...,    0, 6394, 4756],
       ...,
       [   0,    0,    0, ..., 1829, 2174, 1693],
       [   0,    0,    0, ..., 2174, 1693, 2364],
       [   0,    0,    0, ..., 1693, 2364, 7944]])

## Model Train

### X and Y separation

In [105]:
# Split the sequences into input (X) and output (y)
X = input_seq_pad[:, :-1]
y = input_seq_pad[:, -1]

y_tensor = torch.tensor(y, dtype=torch.int64)
y = F.one_hot(y_tensor, num_classes=total_words)


In [106]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

  self.y = torch.tensor(y, dtype=torch.long)


## MODELS

### Multiple LSTM layers

In [107]:
# Define the model
class NextWordPredictor(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.2):
    super(NextWordPredictor, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.dropout_embed = nn.Dropout(dropout)  # Add dropout after embedding
    self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, dropout=dropout, bidirectional=True, batch_first=True)  # Use Bidirectional LSTM with multiple layers
    self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Adjust output size for Bidirectional LSTM

  def forward(self, sequences):
    embedded = self.embedding(sequences)
    embedded = self.dropout_embed(embedded)
    lstm_out, _ = self.lstm(embedded)
    last_hidden = lstm_out[:, -1, :]  # Select last hidden state from the sequence
    logits = self.fc(last_hidden)
    return logits



### Simple LSTM

In [92]:
# Train the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,dropout=0.2):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout_embed = nn.Dropout(dropout)  # Add dropout after embedding
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout_embed(embedded)
        x, _ = self.lstm(embedded)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

In [112]:
model = NextWordPredictor(vocab_size = total_words, 
embed_dim = 200, hidden_dim = 256, output_dim = total_words).cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



### Training

In [113]:
# Coda setting
import torch
print(f"Is coda avialable: {torch.cuda.is_available()}")

a=torch.FloatTensor([1.0,2.0]).cuda()
a.device
torch.manual_seed(32)


next(model.parameters()).is_cuda
gpumodel = model.cuda()

Is coda avialable: True


In [114]:
SAVE_MODEL = True  #If set to True, saves a torch_model.pt 

epochs = 200
patience = 10  # Number of epochs to wait for improvement


current_patience = patience
best_loss = float('inf')  # Initialize best loss to a very high value
better_model = model
for epoch in range(epochs):
  # Training loop
  for i, (X_batch, y_batch) in enumerate(dataloader):
    X_batch, y_batch = X_batch.cuda(), y_batch.cuda()  # Mover datos a la GPU
    outputs = model(X_batch)
    loss = criterion(outputs, y_batch.argmax(dim=1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Early stopping
  if loss.item() < best_loss:  # Compare current training loss with best loss
    best_loss = loss.item()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()} (Improved)')
    better_model = model
    current_patience = patience  # Restart patience
  else:
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')
    current_patience -= 1  # Decrement patience counter on no improvement

  # Stop training if patience is 0
  if current_patience == 0:
    print('Early stopping triggered!')
    break

#Saving model to .pt
if SAVE_MODEL == True:
  torch.save(better_model.state_dict(), 'best_model_simple_LSTM.pt')

Epoch 1/200, Loss: 5.561131477355957 (Improved)
Epoch 2/200, Loss: 4.805379390716553 (Improved)
Epoch 3/200, Loss: 5.481302738189697
Epoch 4/200, Loss: 4.258865833282471 (Improved)
Epoch 5/200, Loss: 3.5373542308807373 (Improved)
Epoch 6/200, Loss: 3.8798840045928955
Epoch 7/200, Loss: 3.9263815879821777
Epoch 8/200, Loss: 3.9889678955078125
Epoch 9/200, Loss: 3.2904815673828125 (Improved)
Epoch 10/200, Loss: 2.624607801437378 (Improved)
Epoch 11/200, Loss: 2.605278730392456 (Improved)
Epoch 12/200, Loss: 2.7201340198516846
Epoch 13/200, Loss: 2.1881327629089355 (Improved)
Epoch 14/200, Loss: 2.1261088848114014 (Improved)
Epoch 15/200, Loss: 2.354800224304199
Epoch 16/200, Loss: 2.1975035667419434
Epoch 17/200, Loss: 2.5219204425811768
Epoch 18/200, Loss: 1.7705278396606445 (Improved)
Epoch 19/200, Loss: 1.7709909677505493
Epoch 20/200, Loss: 1.7272584438323975 (Improved)
Epoch 21/200, Loss: 1.7711299657821655
Epoch 22/200, Loss: 1.7164121866226196 (Improved)
Epoch 23/200, Loss: 1.2957

## Prediction

In [115]:
# Initial text to predict
seed_text = "I am"
next_words = 15

# Index to word
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

device = torch.device('cpu')
better_model.to(device)  # Asegurarse de que el modelo esté en el dispositivo correcto
# Generate the n next words
better_model.eval()  # Set the model to evaluation
for _ in range(next_words):
    tokens = word_tokenize(seed_text)
    token_list = [word_to_idx[word] for word in tokens if word in word_to_idx]
    token_list = np.pad(token_list, (max_sequence_len - len(token_list), 0), mode='constant')
    token_list = torch.tensor(token_list[-max_sequence_len:], dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        predicted = better_model(token_list).argmax(dim=1).item()

    output_word = idx_to_word[predicted]
    seed_text += " " + output_word


print(seed_text)

I am left save the small estate of hereditary kings of your though presume that my companion
