In [1]:
# Import list

import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import ReduceLROnPlateau
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import functions as func
import constants as const
import controlVariables as convar
from NextWordPredictorModel import NextWordPredictor

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data 

In [2]:
# Load the raw corpus from the paths configured in constants
text = func.read_text_list(const.PATHS)

# -1 is the value this cell treats as the reader's failure marker;
# otherwise preview the first 200 characters of the loaded text
if text == -1:
    print(f"{const.bcolors.FAIL}Error while reading{const.bcolors.ENDC}")
else:
    print(text[:200])

0. Projects/3/Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt




                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohem


## Data Preprocess

### Remove the cover and divide the text

In [3]:
# Normalise the corpus to lower case, strip the cover section, then split the remainder

text = text.lower()
no_cover_text = func.clean_cover_text_str(text, convar.clean_cover)
text_divided = func.divide_text_str(no_cover_text, convar.dividers)

# Show only the first ten pieces as a sanity check
preview = text_divided[:10]
print(preview)

['', '', '', '', '                        the adventures of sherlock holmes', '', '                               arthur conan doyle', '', '', '']


### Tokenization

In [4]:
# Build the vocabulary and the word -> index mapping from the cleaned, cover-free text

cleaned_text = func.clean_text(no_cover_text, const.CLEAN_CHARS)
vocabulary, word_to_idx = func.tokenize_text(cleaned_text)

# One more than the vocabulary size; 0 appears as the fill value in the padded
# sequences, so the extra slot is presumably reserved for padding — TODO confirm
total_words = 1 + len(vocabulary)

print(f"total_words: {total_words}")
print(f"Índice de palabras: {word_to_idx}")

total_words: 8299


In [5]:
# Create the different n-gram sequences
# Each entry is a list of word indices; in the recorded output the sequences grow
# one index at a time (e.g. [234, 5522] -> [234, 5522, 3408]).
input_sequences = func.n_gram_separation(text_divided, const.CLEAN_CHARS, word_to_idx)

# Preview the first 20 sequences
input_sequences[:20]

[[234, 5522],
 [234, 5522, 3408],
 [234, 5522, 3408, 53],
 [234, 5522, 3408, 53, 3348],
 [7600, 5043],
 [7600, 5043, 6808],
 [1387, 3408],
 [1387, 3408, 4402],
 [1681, 4396],
 [1681, 4396, 190],
 [1681, 4396, 190, 1864],
 [234, 2702],
 [234, 2702, 3127],
 [1681, 987],
 [1681, 987, 3408],
 [1681, 987, 3408, 5304],
 [234, 6809],
 [234, 6809, 5857],
 [234, 6809, 5857, 2933],
 [234, 3596]]

### Padding

In [6]:
# Measure every sequence once; both metrics below are derived from these lengths
seq_lengths = [len(seq) for seq in input_sequences]

# Fail with a clear message instead of an opaque max()/division error on empty input
assert seq_lengths, "input_sequences is empty; check the n-gram separation step"

# Pad all sequences to a common length
# (the recorded output shows zeros filled in on the left)
input_seq_pad = func.pad_sequences(input_sequences)

# Metrics: `average` now holds the actual mean length rather than the running total
average = sum(seq_lengths) / len(seq_lengths)
# Kept as a notebook-level name: the prediction cell passes it to func.predict_model
max_sequence_len = max(seq_lengths)

print(f"Length average = {average}")
print(f"Max seq length = {max_sequence_len}")
input_seq_pad

Length average = 7.159356725146199
Max seq length = 18


array([[   0,    0,    0, ...,    0,  234, 5522],
       [   0,    0,    0, ...,  234, 5522, 3408],
       [   0,    0,    0, ..., 5522, 3408,   53],
       ...,
       [   0,    0,    0, ..., 7893,  234, 6357],
       [   0,    0,    0, ...,  234, 6357,   62],
       [   0,    0,    0, ..., 6357,   62, 7826]])

## Model Training

### X and Y separation

In [7]:
# Split the sequences into input (X) and output (y)
# Going by the recorded output, each X row is the padded sequence without its last
# word index; y is presumably a one-hot target sized by total_words — TODO confirm in func.split_xy
X, y = func.split_xy(input_seq_pad, total_words)

# Show both halves of the split as a sanity check
print (X)
y

[[   0    0    0 ...    0    0  234]
 [   0    0    0 ...    0  234 5522]
 [   0    0    0 ...  234 5522 3408]
 ...
 [   0    0    0 ... 3412 7893  234]
 [   0    0    0 ... 7893  234 6357]
 [   0    0    0 ...  234 6357   62]]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:

# Wrap the (X, y) arrays in the project's TextDataset and iterate over them
# in shuffled mini-batches of 64 samples
dataset = const.TextDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

### Model creation


In [9]:
# Hyperparameters come from controlVariables; the vocabulary size is used for
# both the input (vocab_size) and the output (output_dim) dimension
model_kwargs = dict(
    vocab_size=total_words,
    embed_dim=convar.embed_dim,
    hidden_dim=convar.hidden_dim,
    output_dim=total_words,
    lstm_layers=convar.lstm_layers,
    dropout=convar.dropout,
    temperature=convar.temperature,
)
model = NextWordPredictor(**model_kwargs)

# Adam optimises every model parameter; the loss function is configured in controlVariables
optimizer = optim.Adam(model.parameters(), lr=convar.lr)
criterion = convar.criterion



### TRAINING

In [10]:
# Train the model and keep whatever func.training_model returns as `better_model`
# (presumably the best-performing epoch given the patience setting — TODO confirm).
# NOTE(review): in the recorded run the loss hovers around 8.9-9.0, close to
# ln(8299) ~= 9.02, i.e. uniform guessing over the vocabulary — worth checking the training setup.
better_model = func.training_model(model, dataloader, criterion, optimizer, convar.epochs, convar.patience)

# Save the model to a .pt file when saving is enabled.
# torch.save on the model object pickles the whole module (not only a state_dict);
# the load cell reads it back the same way.
if convar.save_model:
    torch.save(better_model, const.MODEL_NAME)

Epoch 1/5, Loss: 8.969016075134277 (Improved)
Epoch 2/5, Loss: 8.932520866394043 (Improved)
Epoch 3/5, Loss: 8.908854484558105 (Improved)
Epoch 4/5, Loss: 8.996313095092773
Epoch 5/5, Loss: 8.966053009033203


## PREDICTION

### Load model

In [13]:
# Load a previously saved model onto the CPU. When a saved model is loaded here,
# the training cell does not need to be run: this assignment replaces whatever
# `better_model` the training cell produced.
# NOTE(review): torch.load unpickles the file, so only load model files from a trusted source.

better_model = torch.load(const.MODEL_NAME, map_location=torch.device('cpu')) 



In [14]:
# Generate text from the configured seed; presumably convar.next_words is the
# number of words to append — TODO confirm in func.predict_model
seed_text = func.predict_model(
    better_model,
    convar.next_words,
    convar.seed_text,
    word_to_idx,
    max_sequence_len,
)

print(seed_text)

I am not to the very of the very of the very
