# Cargar y preparar el dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import numpy as np
import random
from transformers import set_seed

def fix_all_seeds(seed=42):
    # Semilla para librerías estándar
    random.seed(seed)
    np.random.seed(seed)

    # Semilla para PyTorch (CPU y GPU)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Semilla específica de Transformers (afecta a inicialización de pesos y dropout)
    set_seed(seed)

    # Garantizar determinismo en algoritmos de la GPU (puede ralentizar un poco)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print(f"Semilla {seed} fijada globalmente.")

fix_all_seeds(42)

Semilla 42 fijada globalmente.


In [10]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import GroupShuffleSplit

def load_and_split_grouped_dataset(path, test_size=0.1, val_size=0.1, seed=42):
    # 1. Cargar el archivo JSONL a un DataFrame de Pandas
    with open(path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    df = pd.DataFrame(data)

    # 2. Configurar el divisor por grupos (group_id)
    # Esto garantiza que las parejas nunca se separen entre train y test
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)

    # Obtenemos los índices para la división
    train_idx, test_idx = next(gss.split(df, groups=df['group_id']))

    df_train_full = df.iloc[train_idx]
    df_test = df.iloc[test_idx]

    print(f"[*] Total de noticias (pares): {len(df) // 2}")
    print(f"[*] Pares en Entrenamiento: {len(df_train_full) // 2}")
    print(f"[*] Pares en Test: {len(df_test) // 2}")

    # Segunda división: Del Train total, sacamos un pedazo para Validación
    gss_val = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
    train_idx_final, val_idx = next(gss_val.split(df_train_full, groups=df_train_full['group_id']))

    df_train_final = df_train_full.sample(frac=1, random_state=seed).reset_index(drop=True)
    df_test = df_test.sample(frac=1, random_state=seed).reset_index(drop=True)

    return DatasetDict({
        'train': Dataset.from_pandas(df_train_full.iloc[train_idx_final]),
        'validation': Dataset.from_pandas(df_train_full.iloc[val_idx]),
        'test': Dataset.from_pandas(df_test)
    })

# Ruta a tu dataset en Drive
path_dataset = '/content/drive/MyDrive/TFG/multimodal_dataset.jsonl'
dataset = load_and_split_grouped_dataset(path_dataset)

[*] Total de noticias (pares): 1050
[*] Pares en Entrenamiento: 945
[*] Pares en Test: 105


In [11]:
# Comptobacion
train_ids = set(dataset['train']['group_id'])
test_ids = set(dataset['test']['group_id'])

# La intersección debe ser un conjunto vacío
overlap = train_ids.intersection(test_ids)
print(f"Coincidencias de IDs entre train y test: {len(overlap)}") # Debe dar 0

Coincidencias de IDs entre train y test: 0


In [12]:
from transformers import AutoTokenizer

model_checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    # Combinamos titular y contenido para dar más contexto
    texts = [f"{t} [SEP] {c}" for t, c in zip(examples['title'], examples['content'])]
    tokenized_inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=512)
    tokenized_inputs["labels"] = examples["is_real"] # Añadimos las etiquetas aquí
    return tokenized_inputs
'''
def preprocess_function(examples):
    # Opcional: Limpiar artefactos para forzar aprendizaje semántico
    clean_contents = [re.sub(r'\n\n|\n|\s{2,}', ' ', c) for c in examples['content']]

    # Combinamos titular y contenido limpio
    texts = [f"{t} [SEP] {c}" for t, c in zip(examples['title'], clean_contents)]
    tokenized_inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=512)
    tokenized_inputs["labels"] = examples["is_real"]
    return tokenized_inputs
'''
tokenized_dataset = dataset.map(preprocess_function, batched=True)

  clean_contents = [re.sub(r'\n\n|\n|\s{2,}', ' ', c) for c in examples['content']]


Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

In [15]:
print(tokenized_dataset['validation']['labels'][:100])

[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]


# Cargar modelo y ejecutar entrenamiento


## Definir funciones de metricas

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import numpy as np
import evaluate

# Cargamos la métrica estándar de clasificación
metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]

    return {"accuracy": acc, "f1": f1}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Para la optimización de hiperparámetros se implementó un Weights & Biases Sweep empleando Optimización Bayesiana. Este enfoque permite modelar la función de pérdida mediante un proceso Gaussiano, optimizando la búsqueda en el espacio de parámetros (Search Space) y convergiendo hacia el mínimo global de forma más eficiente que un muestreo aleatorio (Random Search).

In [None]:
import wandb
import re
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# 1. Definir la configuración del Sweep
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'eval_f1', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {
            'min': 1e-6,
            'max': 5e-5
        },
        'num_train_epochs': {
            'values': [3, 5]
        },
        'per_device_train_batch_size': {
            'values': [8, 16]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project="tfg-deberta-tuning")

def train_iteration():
    # Iniciar un nuevo "run" de W&B
    with wandb.init():
        config = wandb.config

        # --- CRÍTICO: Cargar modelo limpio en cada iteración ---
        # Esto evita el "leaking" de aprendizaje entre pruebas
        model = AutoModelForSequenceClassification.from_pretrained(
            "microsoft/deberta-v3-base",
            num_labels=2
        )

        args = TrainingArguments(
            output_dir="./temp_checkpoints",
            report_to="wandb",
            learning_rate=config.learning_rate,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            eval_strategy="epoch",
            save_strategy="epoch", # No guardamos basura en disco durante el sweep
            load_best_model_at_end=True, # Necesario para EarlyStopping
            metric_for_best_model="f1",
            fp16=False
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["test"],
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()

# Lanzar el agente
wandb.agent(sweep_id, function=train_iteration, count=10) # 'count' es el número de experimentos

Create sweep with ID: 7gahe6ox
Sweep URL: https://wandb.ai/javierprior04-universidad-de-murcia/tfg-deberta-tuning/sweeps/7gahe6ox


[34m[1mwandb[0m: Agent Starting Run: ugs7uvlv with config:
[34m[1mwandb[0m: 	learning_rate: 1.0187594352450212e-05
[34m[1mwandb[0m: 	num_train_epochs: 5
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Loading weights:   0%|          | 0/198 [00:00<?, ?it/s]

DebertaV2ForSequenceClassification LOAD REPORT from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
mask_predictions.dense.bias             | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
pooler.dense.weight                     | MISSING    | 
classifier.bias                         | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias        

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,,0.5,0.0
2,No log,,0.5,0.0
3,No log,,0.5,0.0


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.LayerNorm.weight', 'deberta.encoder.layer.2.output.LayerNorm.bias', 'deberta.encoder.layer.3.attention.output.LayerNorm.weight', 'deberta.encoder.layer.3.attention.output.LayerNorm.bias', 'deberta.encoder.layer.3.output.LayerNorm.weight', 'deberta.encoder.layer.3.output.Laye

0,1
eval/accuracy,▁▁▁
eval/f1,▁▁▁
eval/runtime,█▁█
eval/samples_per_second,▁█▁
eval/steps_per_second,▁█▁
train/epoch,▁▅██
train/global_step,▁▄██
+1,...

0,1
eval/accuracy,0.5
eval/f1,0
eval/loss,
eval/runtime,4.0103
eval/samples_per_second,52.365
eval/steps_per_second,6.733
total_flos,1341890447155200.0
train/epoch,3
train/global_step,321
train_loss,10.44453


[34m[1mwandb[0m: Agent Starting Run: xj3lwsv0 with config:
[34m[1mwandb[0m: 	learning_rate: 1.7879565851186087e-05
[34m[1mwandb[0m: 	num_train_epochs: 5
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Loading weights:   0%|          | 0/198 [00:00<?, ?it/s]

DebertaV2ForSequenceClassification LOAD REPORT from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
mask_predictions.dense.bias             | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
pooler.dense.weight                     | MISSING    | 
classifier.bias                         | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias        

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,,0.5,0.0
2,No log,,0.5,0.0
3,0.723306,,0.5,0.0


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.LayerNorm.weight', 'deberta.encoder.layer.2.output.LayerNorm.bias', 'deberta.encoder.layer.3.attention.output.LayerNorm.weight', 'deberta.encoder.layer.3.attention.output.LayerNorm.bias', 'deberta.encoder.layer.3.output.LayerNorm.weight', 'deberta.encoder.layer.3.output.Laye

0,1
eval/accuracy,▁▁▁
eval/f1,▁▁▁
eval/runtime,▁▄█
eval/samples_per_second,█▅▁
eval/steps_per_second,█▅▁
train/epoch,▁▅▆██
train/global_step,▁▅▆██
train/learning_rate,▁
train/loss,▁
+2,...

0,1
eval/accuracy,0.5
eval/f1,0
eval/loss,
eval/runtime,3.9122
eval/samples_per_second,53.678
eval/steps_per_second,6.901
total_flos,1341890447155200.0
train/epoch,3
train/global_step,639
train/grad_norm,


[34m[1mwandb[0m: Agent Starting Run: 20akwudz with config:
[34m[1mwandb[0m: 	learning_rate: 1.4342496442037695e-05
[34m[1mwandb[0m: 	num_train_epochs: 3
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Loading weights:   0%|          | 0/198 [00:00<?, ?it/s]

DebertaV2ForSequenceClassification LOAD REPORT from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
mask_predictions.dense.bias             | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
pooler.dense.weight                     | MISSING    | 
classifier.bias                         | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias        

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,,0.5,0.0


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


## Entrenar con los mejores hyperparametros

In [None]:
# --- CELDA DE ENTRENAMIENTO FINAL (FUERA DEL SWEEP) ---

# 1. Configura aquí los mejores parámetros encontrados por el Sweep
best_config = {
    "learning_rate": 2e-5,
    "num_train_epochs": 3,
    "batch_size": 8
}

model_final = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=2
)

final_args = TrainingArguments(
    seed=42,
    data_seed=42,
    full_determinism=True,
    output_dir="./resultados_finales",
    learning_rate=best_config["learning_rate"],
    num_train_epochs=best_config["num_train_epochs"],
    per_device_train_batch_size=best_config["batch_size"],
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True
)

trainer = Trainer(
    model=model_final,
    args=final_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"], # Usamos validation aquí
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# Ahora sí puedes ejecutar estas líneas:
eval_test = trainer.evaluate(tokenized_dataset["test"]) # Evaluación final en TEST
print(f"Resultados en el conjunto de TEST: {eval_test}")

# Guardado dinámico
import os
# Descomenta la carpeta según el experimento que estés corriendo
save_path = "/content/drive/MyDrive/TFG/modelos/deberta_v1_sin_limpieza"
# save_path = "/content/drive/MyDrive/TFG/modelos/deberta_v1_con_limpieza"

trainer.save_model(save_path)