In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Verificar si hay GPU disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")


Usando dispositivo: cuda


In [3]:
# Cargar los datos procesados
df = pd.read_csv('data/Finaltrain.csv')

In [4]:
# Preparar las etiquetas (content y wording)
df['labels'] = list(zip(df['content'], df['wording']))

In [5]:
# Dividir los datos en entrenamiento y validación
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['labels'].tolist(), test_size=0.2, random_state=42
)

In [6]:
# Inicializar el tokenizer de BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
# Tokenizar los textos
def tokenize_texts(texts):
    return tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")


In [8]:
train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)

In [9]:
# Mover las etiquetas y encodings a GPU
train_labels = torch.tensor(train_labels, dtype=torch.float32).to(device)
val_labels = torch.tensor(val_labels, dtype=torch.float32).to(device)
train_encodings = {key: tensor.to(device) for key, tensor in train_encodings.items()}
val_encodings = {key: tensor.to(device) for key, tensor in val_encodings.items()}


In [10]:
# Dataset personalizado sin mover los tensores a GPU manualmente
class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor
        return item

In [11]:
# Crear los datasets de entrenamiento y validación
train_dataset = SummaryDataset(train_encodings, train_labels)
val_dataset = SummaryDataset(val_encodings, val_labels)

In [12]:
# Cargar el modelo y moverlo a GPU si está disponible
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Configuración del entrenamiento
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    fp16=True,  # Activar precisión mixta
    dataloader_pin_memory=False if torch.cuda.is_available() else True
)




In [14]:
# Inicializar el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [15]:
# Entrenar el modelo
print("Iniciando entrenamiento...")
trainer.train()

Iniciando entrenamiento...


  item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor
  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                  
 33%|███▎      | 180/540 [08:43<08:28,  1.41s/it]

{'eval_loss': -2.988182544708252, 'eval_runtime': 8.3165, 'eval_samples_per_second': 172.307, 'eval_steps_per_second': 5.411, 'epoch': 1.0}


                                                 
 67%|██████▋   | 360/540 [14:53<04:13,  1.41s/it]

{'eval_loss': -3.849613904953003, 'eval_runtime': 8.273, 'eval_samples_per_second': 173.214, 'eval_steps_per_second': 5.439, 'epoch': 2.0}


 93%|█████████▎| 500/540 [19:34<01:21,  2.04s/it]

{'loss': -3.0796, 'grad_norm': 9.378430366516113, 'learning_rate': 1.62962962962963e-06, 'epoch': 2.78}


  item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor
  item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor
                                                 
100%|██████████| 540/540 [21:30<00:00,  2.39s/it]

{'eval_loss': -4.108611583709717, 'eval_runtime': 8.591, 'eval_samples_per_second': 166.802, 'eval_steps_per_second': 5.238, 'epoch': 3.0}
{'train_runtime': 1290.3812, 'train_samples_per_second': 13.326, 'train_steps_per_second': 0.418, 'train_loss': -3.1690847043637875, 'epoch': 3.0}





TrainOutput(global_step=540, training_loss=-3.1690847043637875, metrics={'train_runtime': 1290.3812, 'train_samples_per_second': 13.326, 'train_steps_per_second': 0.418, 'total_flos': 4524457707970560.0, 'train_loss': -3.1690847043637875, 'epoch': 3.0})

In [16]:
# Evaluar el modelo en el conjunto de validación
print("Evaluando el modelo...")
val_predictions = trainer.predict(val_dataset).predictions

  item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)  # Etiquetas en tensor


Evaluando el modelo...


100%|██████████| 45/45 [00:10<00:00,  4.18it/s]


In [18]:
trainer.save_model("models/bert_model")  # Guardar el modelo

In [22]:
# Calcular el MSE para content y wording
val_content = [label[0].item() for label in val_labels]  # Mover a CPU solo si es necesario
val_wording = [label[1].item() for label in val_labels]
pred_content = [pred[0] for pred in val_predictions]  # No necesitas .cpu().item() aquí
pred_wording = [pred[1] for pred in val_predictions]

mse_content = mean_squared_error(val_content, pred_content)
mse_wording = mean_squared_error(val_wording, pred_wording)


In [23]:
print(f"MSE Content: {mse_content}, MSE Wording: {mse_wording}")

MSE Content: 70.91881304097141, MSE Wording: 51.50081649176773


In [24]:
from sklearn.metrics import r2_score
r2_content = r2_score(val_content, pred_content)
r2_wording = r2_score(val_wording, pred_wording)

In [25]:
print(f"R2 Content: {r2_content}, R2 Wording: {r2_wording}")

R2 Content: -64.49001631198864, R2 Wording: -49.608163149615116
