In [1]:
# Import list

import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import ReduceLROnPlateau
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import functions as func
import constants as const
import controlVariables as convar
from NextWordPredictorModel import NextWordPredictor

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data 

In [2]:
# Read text from file
text = func.read_text_list(const.PATHS)

print (f"{const.bcolors.FAIL}Error while reading{const.bcolors.ENDC}" if text == -1 else text[:200])

Project-III/data/sherlock-holm.es_stories_plain-text_advs.txt




                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohem


## Data Preprocess

### Divide the set

In [3]:
# Delete cover and divide text

text = text.lower()

no_cover_text = func.clean_cover_text_str(text, convar.clean_cover)
text_divided = func.divide_text_str(no_cover_text, convar.dividers)

print(text_divided[:10])

['',
 '',
 '',
 '',
 '                        the adventures of sherlock holmes',
 '',
 '                               arthur conan doyle',
 '',
 '',
 '']

### Tokenization

In [4]:
# Create Tokenizer object in python

vocabulary, word_to_idx = func.tokenize_text(func.clean_text(no_cover_text, const.CLEAN_CHARS))
total_words = len(vocabulary) + 1

print(f"total_words: {total_words}")
print(f"Índice de palabras: {word_to_idx}")

total_words: 8965


In [5]:
# Create the different n-gram sequences
input_sequences = func.n_gram_separation(text_divided, const.CLEAN_CHARS, word_to_idx)

input_sequences[:20]

[[4289, 3048],
 [4289, 3048, 7797],
 [2791, 7797],
 [2791, 7797, 6411],
 [5930, 6397],
 [5930, 6397, 5616],
 [4289, 1],
 [4289, 1, 8276],
 [5930, 7497],
 [5930, 7497, 7797],
 [5930, 7497, 7797, 8500],
 [4289, 8963],
 [4289, 180],
 [4289, 180, 429],
 [4289, 180, 429, 2794],
 [4289, 7809],
 [4289, 7809, 3069],
 [4289, 7809, 3069, 4289],
 [4289, 7809, 3069, 4289, 8780],
 [4289, 7809, 3069, 4289, 8780, 4125],
 [4289, 1386],
 [4289, 1386, 7797],
 [4289, 1386, 7797, 4289],
 [4289, 1386, 7797, 4289, 3706],
 [4289, 1386, 7797, 4289, 3706, 880],
 [4289, 1386],
 [4289, 1386, 7797],
 [4289, 1386, 7797, 4289],
 [4289, 1386, 7797, 4289, 597],
 [4289, 1386, 7797, 4289, 597, 8767],
 [4289, 1386],
 [4289, 1386, 7797],
 [4289, 1386, 7797, 4289],
 [4289, 1386, 7797, 4289, 6929],
 [4289, 1386],
 [4289, 1386, 7797],
 [4289, 1386, 7797, 4289],
 [4289, 1386, 7797, 4289, 8566],
 [4289, 1386, 7797, 4289, 8566, 6753],
 [4289, 1386],
 [4289, 1386, 7797],
 [4289, 1386, 7797, 4289],
 [4289, 1386, 7797, 4289, 3372

### Padding

In [6]:
# Get the max value to add padding to other entries
input_seq_pad = func.pad_sequences(input_sequences)

# Metrics printing
average = 0
for seq in input_sequences:
    average += len(seq)
max_sequence_len = max([len(seq) for seq in input_sequences])

print (f"Length average = {average / len(input_sequences)}")
print (f"Max seq length = {max_sequence_len}")
input_seq_pad

Length average = 7.159356725146199
Max seq length = 18


array([[   0,    0,    0, ...,    0, 4289, 3048],
       [   0,    0,    0, ..., 4289, 3048, 7797],
       [   0,    0,    0, ...,    0, 2791, 7797],
       ...,
       [   0,    0,    0, ..., 7933, 4474, 4289],
       [   0,    0,    0, ..., 4474, 4289, 8446],
       [   0,    0,    0, ..., 4289, 8446, 8354]])

## Model Train

### X and Y separation

In [7]:
# Split the sequences into input (X) and output (y)
X, y = func.split_xy(input_seq_pad, total_words)

print (X)
y

[[   0    0    0 ...    0    0 4289]
 [   0    0    0 ...    0 4289 3048]
 [   0    0    0 ...    0    0 2791]
 ...
 [   0    0    0 ... 2421 7933 4474]
 [   0    0    0 ... 7933 4474 4289]
 [   0    0    0 ... 4474 4289 8446]]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:

dataset = const.TextDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

## MODELS

### Multiple LSTM layers

In [9]:
# Define the model
class NextWordPredictor(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.2, temperature=1):
    super(NextWordPredictor, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.dropout_embed = nn.Dropout(dropout)  # Add dropout after embedding
    self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, dropout=dropout, bidirectional=True, batch_first=True)  # Use Bidirectional LSTM with multiple layers
    self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Adjust output size for Bidirectional LSTM
    self.softmax = nn.Softmax(dim=1)  # Add a softmax layer
    self.T = temperature

  def forward(self, sequences):
    embedded = self.embedding(sequences)
    embedded = self.dropout_embed(embedded)
    lstm_out, _ = self.lstm(embedded)
    last_hidden = lstm_out[:, -1, :]  # Select last hidden state from the sequence
    logits = self.fc(last_hidden)
    logits = self.softmax(logits/self.T)  # Apply softmax
    return logits



In [None]:
#Grid Search


### Simple LSTM

In [10]:
# Train the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, temperature=1):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)  # Add a softmax layer
        self.T = temperature

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        x = self.softmax(x/self.T)  # Apply softmax
        return x

### Model creation


In [11]:
model = NextWordPredictor(
    vocab_size = total_words, 
    embed_dim = convar.embed_dim, 
    hidden_dim = convar.hidden_dim, 
    output_dim = total_words, 
    lstm_layers=convar.lstm_layers, 
    dropout=convar.dropout,
    temperature=convar.temperature
    )

criterion = convar.criterion
optimizer = optim.Adam(model.parameters(), lr=convar.lr)



### TRAINING

In [10]:
better_model = func.training_model(model, dataloader, criterion, optimizer, convar.epochs, convar.patience)
#Saving model to .pt
if convar.save_model == True:
  torch.save(better_model, const.MODEL_NAME)

Epoch 1/5, Loss: 5.836637496948242 (Improved)
Epoch 2/5, Loss: 5.737040042877197 (Improved)
Epoch 3/5, Loss: 5.2909698486328125 (Improved)
Epoch 4/5, Loss: 4.072874069213867 (Improved)
Epoch 5/5, Loss: 4.171646595001221


## PREDICTION

### Load model

In [12]:
# If model loaded do not run training
MODEL_PATH = '/teamspace/studios/this_studio/Project-III/model/MD1_MIN-Y_200-0.2-300-2_CEL-Adam_0.001.pt'

better_model = torch.load(MODEL_PATH, map_location=torch.device('cpu')) 



In [13]:
# Initial text to predict
seed_text = func.predict_model (better_model, convar.next_words, convar.seed_text, word_to_idx, max_sequence_len)

print(seed_text)

AttributeError: 'collections.OrderedDict' object has no attribute 'eval'