In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
import re

PATH = "data\sherlock-holm.es_stories_plain-text_advs.txt"

# Read the text file
with open(PATH, 'r', encoding='utf-8') as file:
    text = file.read()

In [2]:
# Tokenize the text
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    return words

tokens = tokenize(text)
word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_index = {word: index + 1 for index, word in enumerate(vocab)}
index_to_word = {index + 1: word for index, word in enumerate(vocab)}
total_words = len(word_to_index) + 1

In [3]:
# Create input-output pairs
input_sequences = []
for line in text.split('\n'):
    token_list = [word_to_index[word] for word in tokenize(line) if word in word_to_index]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [4]:
# Pad the sequences
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array([np.pad(seq, (max_sequence_len - len(seq), 0), mode='constant') for seq in input_sequences])

In [5]:
# Split the sequences into input (X) and output (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Convert output to one-hot encoded vectors
y = np.array(torch.nn.functional.one_hot(torch.tensor(y, dtype=torch.long), num_classes=total_words))

In [6]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)

# Split dataset into training and validation sets (90% training, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

model = NextWordPredictor(total_words, 200, 256, total_words).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PyTorchModel(BaseEstimator, ClassifierMixin):
    def __init__(self, vocab_size, embed_dim=100, hidden_dim=150, output_dim=None, lr=0.001, epochs=10):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lr = lr
        self.epochs = epochs
        self.model = NextWordPredictor(vocab_size, embed_dim, hidden_dim, output_dim).to(device)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def fit(self, X, y):
        dataset = TextDataset(X, y)
        train_size = int(0.9 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        self.model.train()
        for epoch in range(self.epochs):
            for inputs, labels in train_dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels.argmax(dim=1))
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        return self

    def predict(self, X):
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(torch.tensor(X, dtype=torch.long).to(device))
            return outputs.argmax(dim=1).cpu().numpy()

    def score(self, X, y):
        self.model.eval()
        correct_predictions = 0
        total_predictions = 0
        with torch.no_grad():
            outputs = self.model(torch.tensor(X, dtype=torch.long).to(device))
            predicted = outputs.argmax(dim=1).cpu()
            actual = torch.tensor(y, dtype=torch.long).cpu()
            correct_predictions = (predicted == actual).sum().item()
            total_predictions = len(y)
        return correct_predictions / total_predictions


# Define the parameter grid
param_grid = {
    'embed_dim': [50, 100, 200],
    'hidden_dim': [128, 256, 512],
    'lr': [0.001, 0.0001]
}

# Perform grid search
grid_search = GridSearchCV(PyTorchModel(vocab_size=total_words, output_dim=total_words), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# After finding the best parameters, you can train the final model with them
#best_params = grid_search.best_params_
#best_model = PyTorchModel(vocab_size=total_words, output_dim=total_words, **best_params)
#best_model.fit(X, y)

Traceback (most recent call last):
  File "s:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "s:\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "s:\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "s:\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "s:\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "s:\a

Best parameters found:  {'embed_dim': 50, 'hidden_dim': 128, 'lr': 0.001}
Best score:  nan


In [1]:
import torch
torch.cuda.is_available()

True

In [28]:
# Coda setting
import torch
print(f"Is coda avialable: {torch.cuda.is_available()}")

a=torch.FloatTensor([1.0,2.0]).cuda()
a.device
torch.manual_seed(32)


next(model.parameters()).is_cuda
gpumodel = model.cuda()

Is coda avialable: True


In [29]:
import torch

# Verifica si hay una GPU disponible y, si no, usa la CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mueve el modelo al dispositivo
model.to(device)

# Suponiendo que el criterio y el optimizador se hayan definido previamente
criterion = criterion.to(device)

epochs = 200
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_dataloader:
        # Mueve los datos de entrada y las etiquetas al dispositivo
        inputs, labels = inputs.to(device), labels.to(device)

        # Realiza las predicciones
        outputs = model(inputs)

        # Calcula la pérdida
        loss = criterion(outputs, labels.argmax(dim=1))

        # Optimiza el modelo
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# Guarda el estado del modelo
torch.save(model.state_dict(), "model_state.pth")




Epoch 1/200, Loss: 6.5344085693359375
Epoch 2/200, Loss: 4.913583278656006
Epoch 3/200, Loss: 5.6329474449157715
Epoch 4/200, Loss: 4.490845680236816
Epoch 5/200, Loss: 2.9339778423309326
Epoch 6/200, Loss: 2.1444764137268066
Epoch 7/200, Loss: 2.306044101715088
Epoch 8/200, Loss: 2.8308041095733643
Epoch 9/200, Loss: 1.9185158014297485
Epoch 10/200, Loss: 1.5864721536636353
Epoch 11/200, Loss: 1.256124496459961
Epoch 12/200, Loss: 0.9370671510696411
Epoch 13/200, Loss: 1.331300973892212
Epoch 14/200, Loss: 0.9904576539993286
Epoch 15/200, Loss: 1.1658934354782104
Epoch 16/200, Loss: 0.8523066639900208
Epoch 17/200, Loss: 0.6752649545669556
Epoch 18/200, Loss: 0.4826428294181824
Epoch 19/200, Loss: 0.3789665400981903
Epoch 20/200, Loss: 0.5691863298416138
Epoch 21/200, Loss: 0.5297027230262756
Epoch 22/200, Loss: 0.22712139785289764
Epoch 23/200, Loss: 0.5868541598320007
Epoch 24/200, Loss: 0.36076849699020386
Epoch 25/200, Loss: 1.021597146987915
Epoch 26/200, Loss: 0.3002721667289734

In [31]:
# Evaluate the model on the validation set
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for inputs, labels in val_dataloader:
        # Mueve los datos de entrada y las etiquetas al dispositivo
        inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        actual = labels.argmax(dim=1)
        correct_predictions += (predicted == actual).sum().item()
        total_predictions += labels.size(0)

accuracy = correct_predictions / total_predictions
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.0921


In [33]:
import time

# Evaluate the model on the validation set
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for inputs, labels in val_dataloader:
        # Mueve los datos de entrada y las etiquetas al dispositivo
        inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        actual = labels.argmax(dim=1)

        # Display results
        for i in range(len(inputs)):
            input_sentence = " ".join([index_to_word[idx.item()] for idx in inputs[i] if idx.item() != 0])
            predicted_word = index_to_word[predicted[i].item()]
            actual_word = index_to_word[actual[i].item()]
            correct = predicted[i].item() == actual[i].item()
            print(f"Sentence: {input_sentence}")
            print(f"Predicted: {predicted_word}")
            print(f"Actual: {actual_word}")
            print(f"Correct: {correct}")
            print()

            if correct:
                correct_predictions += 1
            total_predictions += 1

            time.sleep(3)  # Wait for 1 second between predictions

accuracy = correct_predictions / total_predictions
print(f'Validation Accuracy: {accuracy:.4f}')

Sentence: me to bring him
Predicted: and
Actual: out
Correct: False

Sentence: this account of you we
Predicted: are
Actual: have
Correct: False

Sentence: is all that district and there were
Predicted: six
Actual: marks
Correct: False

Sentence: no
Predicted: i
Actual: great
Correct: False

Sentence: peterson
Predicted: the
Actual: run
Correct: False

Sentence: for
Predicted: the
Actual: three
Correct: False

Sentence: grounds a stableboy
Predicted: acquaintance
Actual: had
Correct: False

Sentence: sherlock holmes looked deeply chagrined he drew
Predicted: out
Actual: a
Correct: False

Sentence: our friends premises
Predicted: by
Actual: and
Correct: False

Sentence: it as the front of a
Predicted: man
Actual: picture
Correct: False

Sentence: the garden behind into the stable lane so
Predicted: that
Actual: long
Correct: False

Sentence: that it helps us much i think myself that it is
Predicted: not
Actual: a
Correct: False

Sentence: as a trivial example of
Predicted: his
Actual: o

KeyboardInterrupt: 

In [35]:
# Generate predictions
def predict_next_words(model, tokenizer, seed_text, next_words):
    for _ in range(next_words):
        token_list = [word_to_index[word] for word in tokenize(seed_text)]
        token_list = np.pad(token_list, (max_sequence_len - len(token_list), 0), mode='constant')
        token_list = torch.tensor(token_list[-max_sequence_len+1:], dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            predicted = model(token_list).argmax(dim=1).item()

        output_word = index_to_word[predicted]
        seed_text += " " + output_word

    return seed_text

# Verifica si hay una GPU disponible y, si no, usa la CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mueve el modelo al dispositivo
model.to(device)

seed_text = "i do not know if i have been a"
next_words = 10
print(predict_next_words(model, word_to_index, seed_text, next_words))

i do not know if i have been a little too much responded his friend i have carry my
