# Cargar y preparar el Dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset file can be read from this Colab session
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from datasets import Dataset, DatasetDict

# Carga tu archivo desde la ruta de Drive
def load_custom_dataset(path):
    """Load a JSON Lines file into a `datasets.Dataset`.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row of the returned dataset.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A `Dataset` built from the parsed rows via `Dataset.from_list`.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped; json.loads would otherwise raise on an empty string.
        data = [json.loads(line) for line in f if line.strip()]
    return Dataset.from_list(data)

# Change this path to wherever the file actually lives in your Drive
raw_dataset = load_custom_dataset('/content/drive/MyDrive/ColabNotebooks/multimodal_dataset.jsonl')

# 90/10 train/test split.
# - A fixed seed keeps the split (and therefore the reported metrics) reproducible.
# - Splitting from `raw_dataset` instead of overwriting `dataset` with itself keeps
#   this cell re-runnable: a DatasetDict has no `train_test_split` method.
# NOTE(review): rows carry a `group_id` column; if rows sharing a group are related,
# a random row-level split can leak information between train and test. Confirm
# whether a group-wise split is needed.
dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from transformers import AutoTokenizer

# Hugging Face checkpoint identifier; the tokenizer is loaded from this same name
model_checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize a batch of rows as (title, content) pairs and attach labels.

    Args:
        examples: A batch from `Dataset.map(..., batched=True)`: a mapping from
            column name to a list of values. Reads the 'title', 'content' and
            'is_real' columns.

    Returns:
        The tokenizer output for the batch with an extra "labels" entry holding
        the integer value of 'is_real' for each row.
    """
    # Missing fields become empty strings instead of the literal text "None".
    titles = ["" if t is None else str(t) for t in examples['title']]
    contents = ["" if c is None else str(c) for c in examples['content']]

    # Headline and body are passed as a text pair so the tokenizer inserts its
    # own separator tokens; this avoids hard-coding one checkpoint's "[SEP]"
    # spelling into the input text.
    tokenized_inputs = tokenizer(
        titles,
        contents,
        truncation=True,
        padding='max_length',
        max_length=512,
    )

    # Coerce to int so boolean flags also yield valid class indices.
    tokenized_inputs["labels"] = [int(flag) for flag in examples["is_real"]]
    return tokenized_inputs

# Apply the preprocessing to every split; batched=True passes lists of rows to preprocess_function
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

In [None]:
# Show the available splits with their columns and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['group_id', 'is_real', 'title', 'content', 'image_path', 'model'],
        num_rows: 55
    })
    test: Dataset({
        features: ['group_id', 'is_real', 'title', 'content', 'image_path', 'model'],
        num_rows: 7
    })
})


# Ejecutar entrenamiento
## Definir función de métricas


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import numpy as np
import evaluate

# Load the accuracy and F1 metric objects used by compute_metrics
metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn an evaluation prediction into accuracy and F1 scores.

    Args:
        eval_pred: An unpackable pair ``(logits, labels)``.

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)

    accuracy_result = metric.compute(predictions=predicted, references=gold)
    f1_result = f1_metric.compute(predictions=predicted, references=gold)

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## Iniciamos el modelo

In [None]:
from transformers import AutoModelForSequenceClassification

# Reuse `model_checkpoint` (the name the tokenizer was loaded from) instead of
# repeating the string, so the tokenizer and the model cannot drift apart.
# num_labels=2 requests a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/198 [00:00<?, ?it/s]

DebertaV2ForSequenceClassification LOAD REPORT from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
classifier.bias                         | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 
pooler.dense.weight      

## Configurar los TrainingArguments

In [None]:
!pip install transformers



In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./resultados_tfg",
    eval_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="epoch",           # checkpoint every epoch, in step with evaluation
    learning_rate=2e-5,              # fine-tuning learning rate
    per_device_train_batch_size=8,   # lower to 4 if you hit out-of-memory errors
    num_train_epochs=3,              # three passes over the training split
    weight_decay=0.01,               # regularisation against overfitting
    load_best_model_at_end=True,     # reload the best checkpoint once training ends
    # Without this, "best" is decided by evaluation loss, not by F1.
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
    greater_is_better=True,          # a higher F1 is better
    logging_steps=10,                # log training loss every 10 steps
)

## Crear el trainer y lanzar

In [None]:
from transformers import Trainer

# Wire the model, its training configuration, both tokenized splits, the
# tokenizer and the metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)

# Start fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.622583,0.714286,0.833333
2,0.804448,1.011834,0.714286,0.833333
3,1.222742,0.608645,0.714286,0.833333


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.LayerNorm.weight', 'deberta.encoder.layer.2.output.LayerNorm.bias', 'deberta.encoder.layer.3.attention.output.LayerNorm.weight', 'deberta.encoder.layer.3.attention.output.LayerNorm.bias', 'deberta.encoder.layer.3.output.LayerNorm.weight', 'deberta.encoder.layer.3.output.Laye

TrainOutput(global_step=21, training_loss=1.014496195883978, metrics={'train_runtime': 51.9857, 'train_samples_per_second': 3.174, 'train_steps_per_second': 0.404, 'total_flos': 43414102702080.0, 'train_loss': 1.014496195883978, 'epoch': 3.0})

### Mostrar los resultados de la evaluación

El objeto `trainer` almacena un historial de todas las métricas de entrenamiento y evaluación. Podemos acceder a ellas a través de `trainer.state.log_history`.

In [None]:
# Keep only the log entries that carry evaluation metrics (those with an 'eval_loss' key)
eval_results = []
for entry in trainer.state.log_history:
    if 'eval_loss' in entry:
        eval_results.append(entry)

# Print one short report per evaluation entry
for epoch_metrics in eval_results:
    print(f"Epoch {epoch_metrics['epoch']:.0f}:")
    print(f"  Loss: {epoch_metrics.get('eval_loss'):.4f}")
    print(f"  Accuracy: {epoch_metrics.get('eval_accuracy'):.4f}")
    print(f"  F1 Score: {epoch_metrics.get('eval_f1'):.4f}")
    print("---------------------------------")

Epoch 1:
  Loss: 0.6226
  Accuracy: 0.7143
  F1 Score: 0.8333
---------------------------------
Epoch 2:
  Loss: 1.0118
  Accuracy: 0.7143
  F1 Score: 0.8333
---------------------------------
Epoch 3:
  Loss: 0.6086
  Accuracy: 0.7143
  F1 Score: 0.8333
---------------------------------


In [None]:
# Run a final evaluation with the Trainer (no dataset is passed here, so it is
# presumably run on the eval_dataset given at construction, i.e. the "test" split).
# NOTE: this rebinds `eval_results`, which the previous cell used for a list of
# log entries, to the single metrics dict returned by evaluate().
eval_results = trainer.evaluate()
print(f"Resultados finales: {eval_results}")

# Save the trained model (disabled; uncomment to persist it to Drive)
#trainer.save_model("/content/drive/MyDrive/TFG/modelos/deberta_v1")

Resultados finales: {'eval_loss': 0.6086452603340149, 'eval_accuracy': 0.7142857142857143, 'eval_f1': 0.8333333333333334, 'eval_runtime': 0.3072, 'eval_samples_per_second': 22.784, 'eval_steps_per_second': 3.255, 'epoch': 3.0}
