In [1]:
# Import list

import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import ReduceLROnPlateau
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import functions as func
import constants as const
import controlVariables as convar
from NextWordPredictorModel import NextWordPredictor

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data 

In [2]:
# Load the raw corpus from the configured source paths
text = func.read_text_list(const.PATHS)

# A result of -1 is treated as a read failure; otherwise preview the first 200 characters
if text == -1:
    print (f"{const.bcolors.FAIL}Error while reading{const.bcolors.ENDC}")
else:
    print (text[:200])

0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt




                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohem


## Data Preprocessing

### Divide the set

In [3]:
# Lower-case the corpus, then delete the cover and divide the text.
# NOTE: `text` is rebound in place, so this cell expects `text` from the read cell.
text = text.lower()

# Cover removal uses the convar.clean_cover setting; the split uses convar.dividers
no_cover_text = func.clean_cover_text_str(text, convar.clean_cover)
text_divided = func.divide_text_str(no_cover_text, convar.dividers)

# Preview the first ten pieces of the divided text
first_pieces = text_divided[:10]
print(first_pieces)

['', '', '', '', '                        the adventures of sherlock holmes', '', '                               arthur conan doyle', '', '', '']


### Tokenization

In [4]:
# Build the vocabulary and the word -> index lookup from the cleaned, cover-less text
cleaned_text = func.clean_text(no_cover_text, const.CLEAN_CHARS)
vocabulary, word_to_idx = func.tokenize_text(cleaned_text)

# One slot more than the vocabulary size (presumably reserving index 0 for padding — TODO confirm)
total_words = 1 + len(vocabulary)

print(f"total_words: {total_words}")
print(f"Índice de palabras: {word_to_idx}")

total_words: 8299


In [5]:
# Turn the divided text into n-gram sequences of word indices
input_sequences = func.n_gram_separation(
    text_divided,
    const.CLEAN_CHARS,
    word_to_idx,
)

# Display the first 20 sequences as the cell output
input_sequences[:20]

[[2303, 7778],
 [2303, 7778, 2772],
 [2303, 7778, 2772, 1567],
 [2303, 7778, 2772, 1567, 8209],
 [1286, 7029],
 [1286, 7029, 324],
 [217, 2772],
 [217, 2772, 23],
 [2559, 1498],
 [2559, 1498, 5469],
 [2559, 1498, 5469, 3403],
 [2303, 1992],
 [2303, 1992, 1279],
 [2559, 7154],
 [2559, 7154, 2772],
 [2559, 7154, 2772, 3244],
 [2303, 1717],
 [2303, 1717, 3420],
 [2303, 1717, 3420, 7357],
 [2303, 8032]]

### Padding

In [6]:
# Pad the n-gram sequences so they can be stacked into a single array
input_seq_pad = func.pad_sequences(input_sequences)

# Sequence-length metrics, computed from the unpadded sequences
seq_lengths = [len(tokens) for tokens in input_sequences]
average = sum(seq_lengths)  # total token count; divided by the number of sequences when printed
max_sequence_len = max(seq_lengths)

print (f"Length average = {average / len(input_sequences)}")
print (f"Max seq length = {max_sequence_len}")

# Show the padded array as the cell output
input_seq_pad

Length average = 7.159356725146199
Max seq length = 18


array([[   0,    0,    0, ...,    0, 2303, 7778],
       [   0,    0,    0, ..., 2303, 7778, 2772],
       [   0,    0,    0, ..., 7778, 2772, 1567],
       ...,
       [   0,    0,    0, ..., 6801, 2303, 4143],
       [   0,    0,    0, ..., 2303, 4143, 2067],
       [   0,    0,    0, ..., 4143, 2067, 2623]])

## Model Training

### X and Y separation

In [7]:
# Separate the padded sequences into model inputs (X) and targets (y)
X, y = func.split_xy(
    input_seq_pad,
    total_words,
    convar.gpu_running,
)

# Print the inputs and show the targets as the cell output
print (X)
y

[[   0    0    0 ...    0    0 2303]
 [   0    0    0 ...    0 2303 7778]
 [   0    0    0 ... 2303 7778 2772]
 ...
 [   0    0    0 ... 7094 6801 2303]
 [   0    0    0 ... 6801 2303 4143]
 [   0    0    0 ... 2303 4143 2067]]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:

# Wrap the (X, y) pair in the project's Dataset and serve it in shuffled batches of 64
dataset = const.TextDataset(X, y)
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
)

### Model creation


In [9]:
# Build the model once. The CPU and GPU paths previously duplicated the whole
# constructor call and differed only in the trailing `.cuda()`.
model = NextWordPredictor(
    vocab_size=total_words,
    embed_dim=convar.embed_dim,
    hidden_dim=convar.hidden_dim,
    output_dim=total_words,
    lstm_layers=convar.lstm_layers,
    dropout=convar.dropout,
    temperature=convar.temperature,
)
if convar.gpu_running:
    # Move the parameters to the GPU before the optimizer is built from them
    model = model.cuda()

# The loss function comes from the control-variables module; Adam updates every model parameter
criterion = convar.criterion
optimizer = optim.Adam(model.parameters(), lr=convar.lr)



### TRAINING

In [10]:
# Report CUDA availability when the run is configured for the GPU
if convar.gpu_running:
    print(f"Is CUDA available: {torch.cuda.is_available()}")

# Train the model; the returned object is what gets saved and later used for prediction
better_model = func.training_model(
    model,
    dataloader,
    criterion,
    optimizer,
    convar.epochs,
    convar.patience,
    convar.gpu_running,
)

# Saving model to .pt
# NOTE(review): this serializes the whole model object, not a state_dict, so the
# load cell needs the model class to be importable when it reads the file back.
if convar.save_model:
    torch.save(better_model, const.MODEL_NAME)

Epoch 1/5, Loss: 9.023683547973633 (Improved)
Epoch 2/5, Loss: 8.91305923461914 (Improved)
Epoch 3/5, Loss: 8.941709518432617
Epoch 4/5, Loss: 8.857274055480957 (Improved)
Epoch 5/5, Loss: 8.827615737915039 (Improved)


## PREDICTION

### Load model

In [11]:
# Reload the serialized model onto the CPU. If a saved model is being loaded here,
# the training cell does not need to be run.
# NOTE(review): torch.load unpickles the file, so only load model files you trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which rejects
# whole-model pickles — confirm against the installed version.
better_model = torch.load(
    const.MODEL_NAME,
    map_location=torch.device('cpu'),
)



In [12]:
# Predict text
if convar.gpu_running:
    device = torch.device('cpu')
    better_model.to(device)  # Asegurarse de que el modelo esté en el dispositivo correcto

seed_text = func.predict_model (better_model, convar.next_words, convar.seed_text, word_to_idx, max_sequence_len)

print(seed_text)

I am not the door and i was the door and i
