In [142]:
!pip install torchvision
!pip install timm



In [176]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import timm
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [178]:
# 2. Paths, hyper-parameters and label table
IMG_DIR = '../data/images'
labels_path = '../artifacts/clean_labels.csv'

MODEL_NAME = 'vit_base_patch16_224'
BATCH_SIZE = 16
EPOCHS = 3

# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Label table; later cells read its 'Category' and 'filename' columns.
df = pd.read_csv(labels_path)

In [180]:
# Encode each category as an integer class id, stored in the 'label' column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['Category'])

# Hold out 20% of the rows for validation, stratified on the encoded label;
# the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Preprocessing applied to every image: resize to 224x224, convert to a
# tensor, then subtract 0.5 and divide by 0.5 on each of the three channels.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [182]:
# Custom dataset
class ImageCSVData(Dataset):
    """Image-classification dataset backed by a DataFrame of file names and labels.

    Each row is expected to provide a ``'filename'`` column (joined onto
    ``img_dir``) and a ``'label'`` column (converted to a ``torch.long`` tensor).

    Args:
        df: Table with one row per image. The dataset keeps a copy whose
            index is reset to 0..n-1.
        img_dir: Directory that the file names are relative to.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, df, img_dir, transform=None):
        # Drop the original (possibly shuffled) index left over from the split.
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return ``(image, label)``.

        Returns:
            A tuple of the (optionally transformed) RGB image and the label
            as a scalar ``torch.long`` tensor.
        """
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['filename'])
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of leaving it to garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        label = row['label']

        # Explicit None check so the decision does not depend on the
        # truthiness of the transform object.
        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)

In [184]:
# Datasets and DataLoaders: both splits share the same preprocessing pipeline.
train_dataset = ImageCSVData(df=train_df, img_dir=IMG_DIR, transform=transform)
val_dataset = ImageCSVData(df=val_df, img_dir=IMG_DIR, transform=transform)

# Shuffling is requested only for the training loader.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

# Model: pretrained backbone with a classification head sized to the number
# of classes the label encoder found.
model = timm.create_model(
    MODEL_NAME,
    pretrained=True,
    num_classes=len(le.classes_),
)
# Left as the cell's last expression so the notebook displays the model summary.
model.to(DEVICE)


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [185]:
# Training: optimise all model parameters with Adam on a cross-entropy loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0  # sum of the per-batch losses for this epoch (not a mean)
    for batch in train_loader:
        # Move the image and label tensors of the batch to the target device.
        imgs, labels = (tensor.to(DEVICE) for tensor in batch)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss = criterion(model(imgs), labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss: {epoch_loss:.4f}")

[Epoch 1] Loss: 23.8776
[Epoch 2] Loss: 21.3326
[Epoch 3] Loss: 21.1455


In [186]:
import mlflow
import mlflow.pytorch

# Record the training configuration and the fine-tuned model in MLflow.
# Parameter values are read from the objects used for training so the
# logged run cannot drift from what was actually executed.
# The `with` block ends the run on exit, so no explicit end_run() is needed.
with mlflow.start_run():
    mlflow.log_param("modelo", MODEL_NAME)
    mlflow.log_param("épocas", EPOCHS)
    mlflow.log_param("tasa_aprendizaje", optimizer.param_groups[0]["lr"])
    mlflow.pytorch.log_model(model, "modelo_vit")

