In [4]:
import faiss

def prepare_faiss_index(model, tokenizer, file_path, device, batch_size=4):
    # Cargar datos
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Tokenizar el texto
    tokenized_text = tokenizer.encode(text, return_tensors='pt').to(device)

    # Crear el índice FAISS
    d = model.config.n_embd  # Dimensión del embedding
    index = faiss.IndexFlatL2(d)

    # Generar embeddings y añadir al índice
    with torch.no_grad():
        for i in range(0, tokenized_text.size(1), batch_size):
            inputs = tokenized_text[:, i:i + batch_size].to(device)
            outputs = model.transformer(inputs).last_hidden_state.mean(dim=1).cpu().numpy()
            index.add(outputs)

    return index

index = prepare_faiss_index(model, tokenizer, "train_data.txt", device)


Token indices sequence length is longer than the specified maximum sequence length for this model (8492 > 1024). Running this sequence through the model will result in indexing errors


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.optim.lr_scheduler import StepLR
import json
import numpy as np
import time

# Cargar y preparar datos
def prepare_data():
    with open('data.json', 'r', encoding='utf-8') as f:
        faqs = json.load(f)
    with open('train_data.txt', 'w', encoding='utf-8') as f:
        for faq in faqs:
            f.write(f"Pregunta: {faq['pregunta']}\nRespuesta: {faq['respuesta']}\n\n")

prepare_data()

# Dataset personalizado
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=128):
        self.examples = []
        with open(file_path, encoding='utf-8') as f:
            text = f.read()
        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))
    def __len__(self):
        return len(self.examples)
    def __getitem__(self, item):
        return torch.tensor(self.examples[item])

# Inicialización de componentes del modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
dataset = TextDataset("train_data.txt", tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Optimizador y scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

# Funciones de entrenamiento y evaluación
def train(model, dataloader, optimizer, device, num_epochs=1):
    model.train()
    total_loss = 0
    for epoch in range(num_epochs):
        for batch in dataloader:
            inputs, labels = batch.to(device), batch.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()
        print(f"Epoch {epoch+1} completed, Average Loss: {total_loss / len(dataloader)}")

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch.to(device), batch.to(device)
            outputs = model(inputs, labels=labels)
            total_loss += outputs.loss.item()
    print(f"Validation Loss: {total_loss / len(dataloader)}")
    return total_loss / len(dataloader)

# Ejecutar entrenamiento y evaluación
for epoch in range(5):  # Aumentar según sea necesario
    train(model, train_dataloader, optimizer, device)
    val_loss = evaluate(model, val_dataloader, device)

def generate_response(question, model, tokenizer, max_length=100, device='cuda'):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer.encode(question, return_tensors='pt').to(device)
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response




Epoch 1 completed, Average Loss: 4.0974261944110575
Validation Loss: 3.714701771736145
Epoch 1 completed, Average Loss: 3.547596582999596
Validation Loss: 3.4991186261177063
Epoch 1 completed, Average Loss: 3.2630369113041806
Validation Loss: 3.3871349096298218
Epoch 1 completed, Average Loss: 3.0413368298457217
Validation Loss: 3.295798420906067
Epoch 1 completed, Average Loss: 2.8610922189859243
Validation Loss: 3.2620521187782288


In [10]:
# Ejemplo de uso
question = "Donde queda la espe inca"
response = generate_response(question, model, tokenizer, device=device)
print(response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Donde queda la espe incañol de la vida de los más.

"I'm sorry, but I'm not going to be able to go to the hospital. I don't want to see you. You're not here to help me. It's not my fault. If you want me to, I'll go and see the doctor. But I can't go. My body is broken. And I want you to come with me."
..
