In [17]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments



In [18]:
# Verificar si hay GPU disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")


Usando dispositivo: cuda


In [19]:
# Cargar los datos procesados
df = pd.read_csv('data/Finaltrain.csv')

In [20]:
# Preparar las etiquetas (content y wording)
df['labels'] = list(zip(df['content'], df['wording']))

In [21]:
# Dividir los datos en entrenamiento y validación
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['labels'].tolist(), test_size=0.2, random_state=42
)

In [22]:
# Inicializar el tokenizer de BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
# Tokenizar los textos
def tokenize_texts(texts):
    return tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")


In [24]:
train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)

In [25]:
# Mover las etiquetas y encodings a GPU
train_labels = torch.tensor(train_labels, dtype=torch.float32).to(device)
val_labels = torch.tensor(val_labels, dtype=torch.float32).to(device)
train_encodings = {key: tensor.to(device) for key, tensor in train_encodings.items()}
val_encodings = {key: tensor.to(device) for key, tensor in val_encodings.items()}


In [26]:
# Dataset personalizado sin mover los tensores a GPU manualmente
class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor
        return item

In [27]:
# Crear los datasets de entrenamiento y validación
train_dataset = SummaryDataset(train_encodings, train_labels)
val_dataset = SummaryDataset(val_encodings, val_labels)

In [28]:
# Cargar el modelo y moverlo a GPU si está disponible
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Configuración del entrenamiento
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    fp16=True,  # Activar precisión mixta
    dataloader_pin_memory=False if torch.cuda.is_available() else True
)


In [33]:
# Inicializar el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [34]:
# Entrenar el modelo
print("Iniciando entrenamiento...")
trainer.train()

  5%|▍         | 52/1077 [07:42<2:31:52,  8.89s/it]


Iniciando entrenamiento...


  item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.19 GiB is allocated by PyTorch, and 440.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Evaluar el modelo en el conjunto de validación
print("Evaluando el modelo...")
val_predictions = trainer.predict(val_dataset).predictions

In [None]:
# Calcular el MSE para content y wording
mse_content = mean_squared_error([label[0] for label in val_labels], [pred[0] for pred in val_predictions])
mse_wording = mean_squared_error([label[1] for label in val_labels], [pred[1] for pred in val_predictions])


In [None]:
print(f"MSE Content: {mse_content}, MSE Wording: {mse_wording}")