In [1]:
# === Imports & config ===
import os, json, math, time, random
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from sklearn.metrics import classification_report, confusion_matrix

# Dispositivo y seeds
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device, "| CUDA:", torch.cuda.is_available())
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision('high')

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# Rutas (ajusta si usas otro nombre)
DATA_DIR = Path(r"dataset_dividido_v1")     # contiene train/ val/ test/
CLASSES_TXT = DATA_DIR / "classes.txt"      # opcional, solo informativo
CLASS_IDX_JSON = DATA_DIR / "class_indices.json"  # opcional

# Hiperparámetros
IMG_SIZE   = 224
PATCH_SIZE = 16
EMBED_DIM  = 256
DEPTH      = 6
HEADS      = 8
MLP_RATIO  = 4
DROPOUT    = 0.1

BATCH_SIZE = 64
EPOCHS     = 30
LR         = 3e-4
WD         = 0.05
USE_AMP    = (device.type == "cuda")

# Cargar nombres de clases (si existen)
if CLASS_IDX_JSON.exists():
    with open(CLASS_IDX_JSON, "r", encoding="utf-8") as f:
        class_map = json.load(f)
    idx2name = [class_map[str(i)] for i in range(len(class_map))]
else:
    idx2name = None
print("Idx->Name:", idx2name)


Device: cuda | CUDA: True
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
Idx->Name: ['Condones', 'Cajas de condones', 'Esponjas', 'Botellas de Cloro', 'Placas madre']


In [2]:
# === Data transforms ===
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

train_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomApply([transforms.ColorJitter(0.1,0.1,0.1,0.05)], p=0.3),
    transforms.RandomHorizontalFlip(p=0.2),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

eval_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

ds = {
    "train": datasets.ImageFolder(DATA_DIR / "train", transform=train_tfms),
    "val":   datasets.ImageFolder(DATA_DIR / "val",   transform=eval_tfms),
    "test":  datasets.ImageFolder(DATA_DIR / "test",  transform=eval_tfms),
}
NUM_CLASSES = len(ds["train"].classes)
print("Clases detectadas:", ds["train"].classes, "| NUM_CLASSES:", NUM_CLASSES)

dl = {
    "train": DataLoader(ds["train"], batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True),
    "val":   DataLoader(ds["val"],   batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2, pin_memory=True),
    "test":  DataLoader(ds["test"],  batch_size=BATCH_SIZE*2, shuffle=False, num_workers=2, pin_memory=True),
}


Clases detectadas: ['Botellas de Cloro', 'Cajas de condones', 'Condones', 'Esponjas', 'Placas madre'] | NUM_CLASSES: 5


In [None]:
# === Tiny Vision Transformer===
class PatchEmbed(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_ch=3, embed_dim=256):
        super().__init__()
        self.img_size   = img_size
        self.patch_size = patch_size
        self.grid = (img_size // patch_size, img_size // patch_size)
        self.num_patches = self.grid[0] * self.grid[1]
        # Conv2d actúa como proyector de parches
        self.proj = nn.Conv2d(in_ch, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):  # x: [B,3,H,W]
        x = self.proj(x)   # [B,embed_dim, H/P, W/P]
        x = x.flatten(2).transpose(1,2)  # [B, N, embed_dim]
        return x

class TinyViT(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_ch=3,
                 embed_dim=256, depth=6, heads=8, mlp_ratio=4, num_classes=5, drop=0.1):
        super().__init__()
        self.patch = PatchEmbed(img_size, patch_size, in_ch, embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1,1,embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + self.patch.num_patches, embed_dim))
        self.pos_drop  = nn.Dropout(drop)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=heads,
            dim_feedforward=int(embed_dim*mlp_ratio),
            dropout=drop, activation="gelu", batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        nn.init.trunc_normal_(self.head.weight, std=0.02)
        nn.init.constant_(self.head.bias, 0)

    def forward(self, x):  # x: [B,3,224,224]
        B = x.size(0)
        x = self.patch(x)                          # [B, N, D]
        cls = self.cls_token.expand(B, -1, -1)     # [B, 1, D]
        x = torch.cat([cls, x], dim=1)             # [B, 1+N, D]
        x = x + self.pos_embed[:, :x.size(1), :]   # PE
        x = self.pos_drop(x)
        x = self.encoder(x)                        # [B, 1+N, D]
        x = self.norm(x[:, 0])                     # CLS
        return self.head(x)                        # [B, C]

model = TinyViT(
    img_size=IMG_SIZE, patch_size=PATCH_SIZE,
    embed_dim=EMBED_DIM, depth=DEPTH, heads=HEADS,
    mlp_ratio=MLP_RATIO, num_classes=NUM_CLASSES, drop=DROPOUT
).to(device)

sum_params = sum(p.numel() for p in model.parameters() if p.requires_grad)


TinyViT(
  (patch): PatchEmbed(
    (proj): Conv2d(3, 256, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.1, inplace=False)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=256, out_features=5, bias=True)
)
Parámetros entrenabl



In [None]:
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD, betas=(0.9, 0.999))
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=LR*0.05)

# Nota: en tu torch 2.8, usa la versión CUDA para evitar el argumento device_type
from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler(enabled=USE_AMP)

BEST_PATH = Path("best_tinyvit_residuos.pt")
best_val_loss = float("inf"); patience = 7; bad = 0

def run_epoch(model, loader, train=True):
    model.train(train)
    tot_loss = 0.0; tot_corr = 0; tot = 0
    for xb, yb in loader:
        xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)

        with autocast(enabled=USE_AMP):
            logits = model(xb)
            loss = criterion(logits, yb)

        if train:
            optimizer.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()

        bs = yb.size(0)
        tot += bs
        tot_loss += loss.item()*bs
        tot_corr += (logits.argmax(1) == yb).sum().item()
    return tot_loss/tot, tot_corr/tot

print("== Entrenamiento TinyViT ==")
for ep in range(1, EPOCHS+1):
    tr_loss, tr_acc = run_epoch(model, dl["train"], True)
    va_loss, va_acc = run_epoch(model, dl["val"],   False)
    scheduler.step()
    print(f"[{ep:02d}/{EPOCHS}] train_loss={tr_loss:.4f} acc={tr_acc:.4f} | val_loss={va_loss:.4f} acc={va_acc:.4f}")

    if va_loss < best_val_loss - 1e-4:
        best_val_loss = va_loss; bad = 0
        torch.save(model.state_dict(), BEST_PATH)
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping activao pa'"); break

print("✓ Mejor modelo guardado en:", BEST_PATH)


  scaler = GradScaler(enabled=USE_AMP)


== Entrenamiento TinyViT ==


  with autocast(enabled=USE_AMP):


[01/30] train_loss=1.2510 acc=0.5031 | val_loss=1.1241 acc=0.5831
[02/30] train_loss=1.1051 acc=0.5942 | val_loss=1.1298 acc=0.5455
[03/30] train_loss=1.0346 acc=0.6249 | val_loss=1.0351 acc=0.6144
[04/30] train_loss=0.9806 acc=0.6564 | val_loss=1.0132 acc=0.6332
[05/30] train_loss=0.9342 acc=0.6746 | val_loss=0.9381 acc=0.6834
[06/30] train_loss=0.8961 acc=0.6853 | val_loss=0.9056 acc=0.6928
[07/30] train_loss=0.8506 acc=0.7163 | val_loss=0.9259 acc=0.6865
[08/30] train_loss=0.8180 acc=0.7362 | val_loss=0.8582 acc=0.7116
[09/30] train_loss=0.7888 acc=0.7503 | val_loss=0.8567 acc=0.7179
[10/30] train_loss=0.7559 acc=0.7589 | val_loss=0.9232 acc=0.6803
[11/30] train_loss=0.7372 acc=0.7763 | val_loss=0.7920 acc=0.7524
[12/30] train_loss=0.7012 acc=0.7882 | val_loss=0.8462 acc=0.7179
[13/30] train_loss=0.6744 acc=0.7996 | val_loss=0.8289 acc=0.7461
[14/30] train_loss=0.6425 acc=0.8150 | val_loss=0.8535 acc=0.7524
[15/30] train_loss=0.6067 acc=0.8291 | val_loss=0.8223 acc=0.7712
[16/30] tr

In [6]:
# Cargar mejor estado
state = torch.load(BEST_PATH, map_location=device)
model.load_state_dict(state)
model.eval()

all_true, all_pred = [], []
with torch.no_grad():
    for xb, yb in dl["test"]:
        xb = xb.to(device)
        logits = model(xb)
        all_pred.append(logits.argmax(1).cpu().numpy())
        all_true.append(yb.numpy())

y_true = np.concatenate(all_true)
y_pred = np.concatenate(all_pred)

print("TEST accuracy:", (y_true==y_pred).mean())
names = idx2name if idx2name else [str(c) for c in range(NUM_CLASSES)]
print("\n== Classification report ==")
print(classification_report(y_true, y_pred, target_names=names, digits=4))

cm = confusion_matrix(y_true, y_pred, labels=list(range(NUM_CLASSES)))
print("Confusion matrix (rows true, cols pred):\n", cm)


TEST accuracy: 0.7292110874200426

== Classification report ==
                   precision    recall  f1-score   support

         Condones     0.6959    0.7744    0.7330       195
Cajas de condones     0.7870    0.6351    0.7029       285
         Esponjas     0.6343    0.6894    0.6607       161
Botellas de Cloro     0.7358    0.7800    0.7573       150
     Placas madre     0.7898    0.8435    0.8158       147

         accuracy                         0.7292       938
        macro avg     0.7286    0.7445    0.7339       938
     weighted avg     0.7341    0.7292    0.7283       938

Confusion matrix (rows true, cols pred):
 [[151  20  10  11   3]
 [ 39 181  35  12  18]
 [ 13  15 111  14   8]
 [ 10   7  12 117   4]
 [  4   7   7   5 124]]
