In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import re

PATH = "data/sherlock-holm.es_stories_plain-text_advs.txt"

# Read the text file
with open(PATH, 'r', encoding='utf-8') as file:
    text = file.read()


In [2]:
# Tokenize the text
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    return words

tokens = tokenize(text)
word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_index = {word: index + 1 for index, word in enumerate(vocab)}
index_to_word = {index + 1: word for index, word in enumerate(vocab)}
total_words = len(word_to_index) + 1

In [3]:
# Create input-output pairs
input_sequences = []
for line in text.split('\n'):
    token_list = [word_to_index[word] for word in tokenize(line) if word in word_to_index]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [4]:
# Pad the sequences
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array([np.pad(seq, (max_sequence_len - len(seq), 0), mode='constant') for seq in input_sequences])

In [5]:
# Split the sequences into input (X) and output (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

y = torch.tensor(y, dtype=torch.long)
# Convert output to one-hot encoded vectors
y_one_hot = torch.nn.functional.one_hot(y, num_classes=total_words).numpy()

In [6]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define the model
class NextWordPredictor(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.2):
    super(NextWordPredictor, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.dropout_embed = nn.Dropout(dropout)  # Add dropout after embedding
    self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, dropout=dropout, bidirectional=True, batch_first=True)  # Use Bidirectional LSTM with multiple layers
    self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Adjust output size for Bidirectional LSTM

  def forward(self, sequences):
    embedded = self.embedding(sequences)
    embedded = self.dropout_embed(embedded)
    lstm_out, _ = self.lstm(embedded)
    last_hidden = lstm_out[:, -1, :]  # Select last hidden state from the sequence
    logits = self.fc(last_hidden)
    return logits

model = NextWordPredictor(total_words, 100, 150, total_words).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

  self.y = torch.tensor(y, dtype=torch.long)


In [7]:
import torch
torch.cuda.is_available()

a=torch.FloatTensor([1.0,2.0]).cuda()

a.device

torch.manual_seed(32)


next(model.parameters()).is_cuda

True

In [10]:
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import ReduceLROnPlateau

epochs = 200
patience = 5  # Number of epochs to wait for improvement
current_patience = patience
best_loss = float('inf')  # Initialize best loss to a very high value
better_model = model
for epoch in range(epochs):
  # Training loop
  for i, (inputs, labels) in enumerate(dataloader):
    inputs = inputs.cuda()#Para GPU
    labels = labels.cuda() #Para GPU
    
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Early stopping
  if loss.item() < best_loss:  # Compare current training loss with best loss
    best_loss = loss.item()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()} (Improved)')
    better_model = model
    current_patience = patience  # Restart patience
  else:
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')
    current_patience -= 1  # Decrement patience counter on no improvement

  # Stop training if patience is 0
  if current_patience == 0:
    print('Early stopping triggered!')
    break

'''
# Optional: Save the best model based on training loss
if best_loss != float('inf'):
  torch.save(model.state_dict(), 'best_model.pt')
'''

Epoch 1/200, Loss: 3.5391814708709717 (Improved)
Epoch 2/200, Loss: 4.0403056144714355
Epoch 3/200, Loss: 3.5400102138519287
Epoch 4/200, Loss: 3.561836004257202
Epoch 5/200, Loss: 3.20314884185791 (Improved)
Epoch 6/200, Loss: 3.497293710708618
Epoch 7/200, Loss: 3.7733376026153564
Epoch 8/200, Loss: 3.0223875045776367 (Improved)
Epoch 9/200, Loss: 2.8497135639190674 (Improved)
Epoch 10/200, Loss: 2.5111589431762695 (Improved)
Epoch 11/200, Loss: 3.3672659397125244
Epoch 12/200, Loss: 2.4599266052246094 (Improved)
Epoch 13/200, Loss: 2.955993413925171
Epoch 14/200, Loss: 2.0853307247161865 (Improved)
Epoch 15/200, Loss: 2.790433645248413
Epoch 16/200, Loss: 2.7190167903900146
Epoch 17/200, Loss: 2.5774054527282715
Epoch 18/200, Loss: 2.854377508163452
Epoch 19/200, Loss: 2.1823272705078125
Early stopping triggered!


"\n# Optional: Save the best model based on training loss\nif best_loss != float('inf'):\n  torch.save(model.state_dict(), 'best_model.pt')\n"

In [16]:
# Generate predictions
def predict_next_words(model, tokenizer, seed_text, next_words):
    model.eval()  # Set the model to evaluation mode
    device = next(model.parameters()).device  # Get the device the model is on
    for _ in range(next_words):
        token_list = [word_to_index[word] for word in tokenize(seed_text)]
        token_list = np.pad(token_list, (max_sequence_len - len(token_list), 0), mode='constant')
        token_list = torch.tensor(token_list[-max_sequence_len+1:], dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            predicted = model(token_list).argmax(dim=1).item()

        output_word = index_to_word[predicted]
        seed_text += " " + output_word

    return seed_text

seed_text = "i am"
next_words = 16
print(predict_next_words(better_model, word_to_index, seed_text, next_words))

i am sure that i may have deduced a little more closely i shall be busy to an
