<a href="https://colab.research.google.com/github/JuanDiaz77/Proyecto-colab/blob/main/Actividad_pr%C3%A1ctica_Fine_tuning_de_modelos_Transformers_con_HuggingFace_en_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===========================================
# 1. Library installation
# ===========================================
import os

# Switch Weights & Biases off through its environment variables before the
# training libraries are imported further down.
os.environ.update(
    WANDB_DISABLED="true",
    WANDB_MODE="disabled",
    WANDB_SILENT="true",
)

# Notebook shell escape: quietly (-q) install transformers, datasets and torchvision.
!pip install -q transformers datasets torchvision

import torch
from datasets import load_dataset
from transformers import (
    CLIPProcessor,
    CLIPModel,
    TrainingArguments,
    Trainer
)
from torch import nn

# ===========================================
# 2. Load the dataset
# ===========================================
# CIFAR-10 is fetched by name; the "train" split is picked out and given text
# captions further down to turn it into image-text pairs.
dataset = load_dataset(path="cifar10")

# Attach a descriptive caption to every image.
# CIFAR-10 stores ``label`` as an integer class id, so the id is translated to
# its class name first; interpolating the raw id would produce captions such
# as "A photo of a 3.".
_CIFAR10_CLASS_NAMES = (
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)


def transform_examples(example):
    """Add a ``text`` caption describing the example's class.

    Args:
        example: Mapping with a ``label`` entry. It is modified in place.

    Returns:
        The same mapping with ``text`` set. Integer labels in ``0..9`` are
        rendered as the CIFAR-10 class name with a matching article
        ("A photo of an airplane."). Any other label value is interpolated
        unchanged as "A photo of a <label>.".
    """
    label = example["label"]
    is_class_id = (
        isinstance(label, int)
        and not isinstance(label, bool)  # bool is an int subclass; not a class id
        and 0 <= label < len(_CIFAR10_CLASS_NAMES)
    )
    if is_class_id:
        name = _CIFAR10_CLASS_NAMES[label]
        article = "an" if name[0] in "aeiou" else "a"
        example["text"] = f"A photo of {article} {name}."
    else:
        # Unknown label type or id outside the class list: keep the original format.
        example["text"] = f"A photo of a {label}."
    return example

# Keep only the training split, captioned one example at a time; from here on
# ``dataset`` refers to that split rather than to the full dataset dict.
dataset = dataset["train"].map(function=transform_examples)

# Show the first captioned example as a sanity check.
print("Ejemplo del dataset:", dataset[0], sep="\n")

# ===========================================
# 3. Prepare the pretrained CLIP model
# ===========================================
model_name = "openai/clip-vit-base-patch32"

# The model weights and the matching processor are loaded from the same
# checkpoint name so that preprocessing agrees with the model.
base_model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# ===========================================
# 4. Custom CLIP wrapper compatible with Trainer
# ===========================================
class CLIPForCustomTraining(nn.Module):
    """Wrap a CLIP-style model and compute a symmetric in-batch contrastive loss.

    The wrapped model is called with ``input_ids``, ``attention_mask``,
    ``pixel_values`` and ``return_loss=False`` and must return an object
    exposing ``image_embeds`` and ``text_embeds``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, clip_model):
        """Store the wrapped model and create the learnable similarity scale.

        Args:
            clip_model: Model producing ``image_embeds`` and ``text_embeds``.
        """
        super().__init__()
        self.clip = clip_model
        # Learnable multiplier applied to the cosine-similarity matrix; starts at 1.0.
        self.temperature = nn.Parameter(torch.tensor(1.0))

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        """Embed both modalities and score every text against every image.

        Args:
            pixel_values: Image inputs forwarded to the wrapped model.
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Text attention mask forwarded to the wrapped model.
            labels: Only its presence matters: when not ``None`` the loss is
                computed. Its values are never read.

        Returns:
            Dict with ``"logits"`` (text-by-image similarity matrix scaled by
            ``temperature``) and ``"loss"`` (``None`` when ``labels`` is
            ``None``, otherwise the mean of the text-to-image and
            image-to-text cross-entropy losses).
        """
        outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            return_loss=False,
        )

        # L2-normalise so the dot products below are cosine similarities.
        # normalize() divides by max(norm, eps), so an all-zero embedding maps
        # to zeros instead of NaN (a plain ``x / x.norm()`` would give 0/0).
        image_embeds = nn.functional.normalize(outputs.image_embeds, dim=-1)
        text_embeds = nn.functional.normalize(outputs.text_embeds, dim=-1)

        # Row i holds text i scored against every image in the batch.
        logits = torch.matmul(text_embeds, image_embeds.t()) * self.temperature

        loss = None
        if labels is not None:
            # The i-th text is paired with the i-th image, so the targets are
            # simply the positions within the batch.
            target = torch.arange(logits.size(0), device=logits.device)
            loss_fn = nn.CrossEntropyLoss()
            # Average both directions: text->image (rows) and image->text (columns).
            loss = (loss_fn(logits, target) + loss_fn(logits.t(), target)) / 2

        return {"loss": loss, "logits": logits}

# Wrap the pretrained CLIP model so Trainer receives a module that returns a loss.
model = CLIPForCustomTraining(clip_model=base_model)

# ===========================================
# 5. Dataset preprocessing
# ===========================================
def preprocess_function(examples):
    """Tokenize captions and preprocess images for a batch of examples.

    Args:
        examples: Batch mapping with ``"text"`` (captions) and ``"img"``
            (images) entries.

    Returns:
        The output of the module-level ``processor`` for the batch.
    """
    return processor(
        text=examples["text"],
        images=examples["img"],
        # Pad to the tokenizer's fixed maximum length rather than to the
        # longest caption in this batch: ``padding=True`` can give rows from
        # different map batches different lengths, which cannot be stacked
        # into one tensor when training batches are built.
        padding="max_length",
        truncation=True,
    )

# Keep only the first 1000 examples for a quick trial run (optional).
# The subset is selected *before* mapping so that only those rows are
# preprocessed, instead of processing the whole split and discarding most of it.
processed_dataset = dataset.select(range(1000)).map(
    preprocess_function,
    batched=True,
    remove_columns=["img", "text"],
)

# ===========================================
# 6. Training configuration
# ===========================================
# One epoch with small per-device batches, logging every 10 steps, and no
# external experiment-tracking integration.
training_args = TrainingArguments(
    report_to="none",
    output_dir="./clip-finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    logging_steps=10,
)

# Bind the wrapped model, the training configuration and the preprocessed
# examples together; no evaluation dataset is supplied.
trainer = Trainer(
    train_dataset=processed_dataset,
    args=training_args,
    model=model,
)

# ===========================================
# 7. Run training
# ===========================================
# Start the training run configured on ``trainer``.
trainer.train()

# Reached only if train() returned without raising.
print("✅ Entrenamiento finalizado correctamente.")
