In [None]:
pip install tensorflow

In [None]:
pip install torchtext


In [None]:
pip install torch torchvision

In [None]:
pip install nltk

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer
import nltk
from nltk.tokenize import word_tokenize
# Load the whole corpus into memory, normalised to lower case.
with open('./data/sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

In [None]:
# Show only a short preview: printing the entire corpus floods the notebook
# output and bloats the saved file.
print(text[:1000])

In [None]:
# Tokenise the lower-cased corpus and collect its unique tokens.
tokens = word_tokenize(text)
vocabulary = set(tokens)
# One extra slot on top of the vocabulary: index 0 is the value used to
# left-pad the n-gram sequences, so it must not belong to any real word.
total_words = len(vocabulary) + 1

# Indices start at 1 so that no word shares index 0 with the padding value.
# The vocabulary is sorted first because the iteration order of a set of
# strings is not stable between interpreter sessions; sorting makes the
# mapping reproducible.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocabulary), start=1)}

print(f"total_words: {total_words}")
print("Índice de palabras:", word_to_idx)

In [None]:
# Build the training sequences: for every line of the corpus, emit each prefix
# of its token-index list that contains at least two tokens.
input_sequences = []
for line in text.split('\n'):
    # Tokenise the line with word_tokenize, the same tokenizer that produced
    # the vocabulary. Splitting on spaces would leave punctuation attached to
    # words ("holmes," instead of "holmes" and ","), and those strings would be
    # silently discarded because they are not vocabulary entries.
    token_list = [
        word_to_idx[token]
        for token in word_tokenize(line)
        if token in word_to_idx
    ]

    # Prefixes of length 2 .. len(token_list): the last element of each prefix
    # is later used as the prediction target.
    for end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:end])

In [None]:
# Preview the first five n-gram sequences.
preview = input_sequences[:5]
print(f"Secuencias de n-gramas: {preview}")

In [None]:
from torch.nn.utils.rnn import pad_sequence


In [None]:
# Length of the longest n-gram sequence; every sequence is padded up to it.
max_sequence_len = max(map(len, input_sequences))

# Left-pad each sequence with zeros so that all of them share the same length.
padded_sequences = [
    [0] * (max_sequence_len - len(sequence)) + sequence
    for sequence in input_sequences
]

# Stack the equal-length sequences into a single tensor.
tensor_sequences = torch.tensor(padded_sequences)



In [None]:
# Report the padded length and preview the first five padded sequences.
print(
    f"max_sequence_len: {max_sequence_len}",
    f"Secuencias de n-gramas: {tensor_sequences[:5]}",
    sep="\n",
)

In [None]:
# Inputs are every column but the last; the target is the last column.
X, y = tensor_sequences[:, :-1], tensor_sequences[:, -1]

In [None]:
# Display the inputs followed by the targets, one per line.
print(X, y, sep="\n")

In [None]:
import torch.nn.functional as F

In [None]:
# One-hot encode the targets. The number of classes is derived from the
# largest index that actually occurs in y.
num_classes = int(y.max()) + 1
y = F.one_hot(y, num_classes)

In [None]:
# Display the padded inputs and the one-hot targets under their headings.
print(
    "Secuencias con padding (X):",
    X,
    "Etiquetas (y) en one-hot encoding:",
    y,
    sep="\n",
)

In [None]:
class TextDataset(Dataset):
    """Dataset pairing each input sequence with its target.

    Args:
        X: Input token-index sequences, either a tensor or a nested list.
            A ``torch.long`` copy is stored in ``self.X``.
        y: Targets, stored unchanged in ``self.y``; must support integer
            indexing.
    """

    def __init__(self, X, y):
        if isinstance(X, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning, so
            # copy explicitly and convert the dtype instead.
            self.X = X.detach().clone().to(torch.long)
        else:
            self.X = torch.tensor(X, dtype=torch.long)
        self.y = y

    def __len__(self):
        """Return the number of samples (rows of ``self.X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
# Wrap the tensors in a Dataset and iterate over it in shuffled batches of 32.
dataset = TextDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [None]:
class LSTMModel(nn.Module):
    """Next-word classifier: embedding -> LSTM -> linear layer.

    Args:
        total_words: Number of rows of the embedding table (largest token
            index + 1).
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes.

    ``forward`` returns raw, unnormalised scores (logits). This is the input
    ``nn.CrossEntropyLoss`` expects, because that loss applies log-softmax
    itself; applying softmax in the model as well squashes the gradients.
    ``argmax`` over the logits selects the same class as ``argmax`` over the
    probabilities. Use ``self.softmax(logits)`` when probabilities are needed.
    """

    def __init__(self, total_words, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(total_words, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Not applied in forward(); kept as an attribute for callers that want
        # probabilities. It has no parameters, so saved state dicts still load.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits for the token that follows each input sequence.

        Args:
            x: Integer tensor of token indices; it is indexed as
                (batch, sequence) because the LSTM uses ``batch_first=True``.

        Returns:
            Tensor of logits with one row per batch element and
            ``output_dim`` columns.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Only the output of the last time step feeds the classifier.
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step)

In [None]:
# Model with 100-dimensional embeddings and a 150-unit LSTM; the output layer
# has one unit per vocabulary slot.
model = LSTMModel(
    total_words=total_words,
    embedding_dim=100,
    hidden_dim=150,
    output_dim=total_words,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
def train_model(model, dataloader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Args:
        model: Module called as ``model(X_batch)``.
        dataloader: Iterable yielding ``(X_batch, y_batch)`` pairs. ``y_batch``
            may be one-hot encoded (2-D or more; the class is taken with
            ``argmax`` over dim 1) or already hold class indices (1-D).
        criterion: Loss called as ``criterion(outputs, class_indices)``.
        optimizer: Optimizer that updates the model parameters.
        epochs: Number of passes over ``dataloader``.

    After each epoch the mean loss over that epoch's batches is printed. An
    epoch that yields no batches is reported as such instead of raising.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for X_batch, y_batch in dataloader:
            # Convert one-hot targets to class indices; pass indices through.
            targets = y_batch.argmax(dim=1) if y_batch.dim() > 1 else y_batch
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            # No batch ran, so there is no loss value to report.
            print(f'Epoch {epoch+1}/{epochs}, Loss: n/a (no batches)')
            continue
        # Mean over the epoch rather than the last batch's loss, which is noisy.
        print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss / num_batches}')

In [None]:
# Train for 100 epochs over the shuffled batches.
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=100,
)

In [None]:
# Switch to evaluation mode, then persist the trained weights.
model.eval()
trained_state = model.state_dict()
torch.save(trained_state, "model.pth")

# Persist the word -> index mapping next to the weights.
import pickle
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(word_to_idx, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the weights saved in model.pth into the existing model instance.
saved_state = torch.load('model.pth')
model.load_state_dict(saved_state)

<All keys matched successfully>

In [None]:
# Seed text and number of words to generate.
seed_text = "I will leave if they"
next_words = 3

# Reverse mapping used to turn predicted indices back into words.
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# The vocabulary was built from lower-cased text, so lower-case the seed before
# tokenising; otherwise capitalised words such as "I" are silently dropped.
# A dedicated name is used so the corpus-level `tokens` list is not overwritten.
seed_tokens = word_tokenize(seed_text.lower())
input_sequence = [word_to_idx[word] for word in seed_tokens if word in word_to_idx]
if not input_sequence:
    # An empty list would become a float tensor and fail inside the embedding
    # layer with an obscure error; fail early with a clear message instead.
    raise ValueError("seed_text contains no words from the training vocabulary")

# Generate the next words greedily, feeding each prediction back as input.
model.eval()
for _ in range(next_words):
    # Shape (1, sequence_length): a batch holding the single running sequence.
    input_tensor = torch.tensor(input_sequence, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)

    # Greedy decoding: take the highest-scoring class.
    predicted_idx = torch.argmax(output, dim=1).item()

    # The output layer has one more unit than there are words, so the predicted
    # index may have no word attached; stop instead of raising KeyError.
    word = idx_to_word.get(predicted_idx)
    if word is None:
        break

    input_sequence.append(predicted_idx)
    seed_text += " " + word

print(seed_text)

I will leave if they the the the
