# Entrenamiento Multilabel con PASCAL VOC 2007

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score, accuracy_score

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU disponible: {len(tf.config.list_physical_devices('GPU')) > 0}")

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

TensorFlow version: 2.20.0
GPU disponible: False


In [2]:
PROJECT_ROOT = Path(os.getcwd()).parent
DATA_DIR = PROJECT_ROOT / 'data' / 'voc2007'
MODELS_DIR = PROJECT_ROOT / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

IMG_SIZE = (224, 224)
BATCH_SIZE = 16
INITIAL_EPOCHS = 30
FINETUNING_EPOCHS = 40
LEARNING_RATE_INITIAL = 0.0005
LEARNING_RATE_FINETUNING = 0.00005

print(f"Configuracion:")
print(f"  Tamaño imagen: {IMG_SIZE}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Epocas inicial: {INITIAL_EPOCHS}")
print(f"  Epocas fine-tuning: {FINETUNING_EPOCHS}")

Configuracion:
  Tamaño imagen: (224, 224)
  Batch size: 16
  Epocas inicial: 30
  Epocas fine-tuning: 40


In [3]:
print(f"Cargando desde: {DATA_DIR}")

with open(DATA_DIR / 'classes.json', 'r') as f:
    classes = json.load(f)

NUM_CLASSES = len(classes)

print(f"Clases cargadas: {NUM_CLASSES}")
print(f"Primeras 10 clases: {classes[:10]}")

Cargando desde: c:\Users\mlata\Documents\iajordy2\data\voc2007
Clases cargadas: 20
Primeras 10 clases: ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow']


In [4]:
print("Cargando dataset PASCAL VOC 2007 desde NPZ...")

# Cargar NPZ
npz_file = DATA_DIR / 'voc2007_multilabel.npz'
if not npz_file.exists():
    raise FileNotFoundError(f"No se encuentra {npz_file}. Ejecuta primero 01_data_analysis.ipynb")

data = np.load(npz_file)
images = data['images']
labels = data['labels']

print(f"Imagenes cargadas: {images.shape}")
print(f"Labels cargados: {labels.shape}")
print(f"Clases por imagen (promedio): {labels.sum(axis=1).mean():.2f}")

# Normalizar imagenes a [0, 1]
images = images.astype(np.float32) / 255.0

print(f"Imagenes normalizadas a rango [0, 1]")

Cargando dataset PASCAL VOC 2007 desde NPZ...
Imagenes cargadas: (2501, 224, 224, 3)
Labels cargados: (2501, 20)
Clases por imagen (promedio): 1.61
Imagenes normalizadas a rango [0, 1]


In [5]:
X_train, X_temp, y_train, y_temp = train_test_split(
    images, labels, test_size=0.3, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED
)

print(f"Train: {len(X_train)} imágenes")
print(f"Val: {len(X_val)} imágenes")
print(f"Test: {len(X_test)} imágenes")

print(f"Train labels: {y_train.sum(axis=1).mean():.2f} categorías/imagen")
print(f"Test labels: {y_test.sum(axis=1).mean():.2f} categorías/imagen")
print(f"Val labels: {y_val.sum(axis=1).mean():.2f} categorías/imagen")

Train: 1750 imágenes
Val: 375 imágenes
Test: 376 imágenes
Train labels: 1.62 categorías/imagen
Test labels: 1.59 categorías/imagen
Val labels: 1.57 categorías/imagen


In [None]:
# Calcular pesos por clase para combatir desbalance
pos_counts = y_train.sum(axis=0)
neg_counts = y_train.shape[0] - pos_counts

# Peso positivo = negativos / positivos (LIMITADO a max 10)
pos_weight = (neg_counts + 1e-6) / (pos_counts + 1e-6)
pos_weight = np.clip(pos_weight, 1.0, 10.0)  # Max 10 en vez de 50

class_weights = tf.constant(pos_weight, dtype=tf.float32)

print("Pesos por clase calculados (limitados a max 10)")
print(f"  Min: {pos_weight.min():.2f}")
print(f"  Max: {pos_weight.max():.2f}")
print(f"  Media: {pos_weight.mean():.2f}")

# Focal Loss con class weights (mejor para desbalance)
def focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25):
    """
    Focal Loss para multilabel con class weights.
    gamma: factor de enfoque (mayor = más peso a ejemplos difíciles)
    alpha: balance positivo/negativo base
    """
    y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
    
    # Focal Loss componentes
    bce = -(y_true * tf.math.log(y_pred) + (1 - y_true) * tf.math.log(1 - y_pred))
    
    # Modulación focal: (1 - p_t)^gamma
    p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
    focal_weight = tf.pow(1 - p_t, gamma)
    
    # Aplicar peso focal y class weights
    focal_bce = focal_weight * bce
    weighted_focal = focal_bce * (y_true * class_weights + (1 - y_true) * 1.0)
    
    return tf.reduce_mean(weighted_focal)

print("Focal Loss con class weights definida (gamma=2.0)")

Pesos por clase calculados
  Min: 1.30
  Max: 46.30
  Media: 17.92
Loss ponderada definida


In [7]:
# Data augmentation para training
train_datagen = ImageDataGenerator(
    rotation_range=25,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# Sin augmentation para val/test (ya están normalizadas)
val_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

# Fit datagen en datos de train
train_datagen.fit(X_train)

print("Generadores de datos creados")
print(f"  Train samples: {len(X_train)}")
print(f"  Val samples: {len(X_val)}")
print(f"  Test samples: {len(X_test)}")
print(f"  Batch size: {BATCH_SIZE}")

Generadores de datos creados
  Train samples: 1750
  Val samples: 375
  Test samples: 376
  Batch size: 16


In [8]:
def create_multilabel_model(num_classes, img_size=(224, 224)):
    inputs = layers.Input(shape=(*img_size, 3))
    base_model = EfficientNetB0(include_top=False, weights='imagenet', input_tensor=inputs)
    base_model.trainable = False
    x = base_model.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(num_classes, activation='sigmoid')(x)
    model = models.Model(inputs=inputs, outputs=outputs)
    return model, base_model

model, base_model = create_multilabel_model(NUM_CLASSES, IMG_SIZE)
print(f"Modelo creado")
print(f"Total parametros: {model.count_params():,}")

Modelo creado
Total parametros: 4,841,911


In [None]:
model.compile(
    optimizer=optimizers.Adam(learning_rate=LEARNING_RATE_INITIAL),
    loss=focal_loss,
    metrics=[
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc', multi_label=True)
    ]
)

callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True, verbose=1),
    keras.callbacks.ModelCheckpoint(filepath=str(MODELS_DIR / 'model_phase1_best.h5'), monitor='val_loss', save_best_only=True, verbose=1),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-8, verbose=1)
]

print(f"Modelo compilado - FASE 1: Training inicial con Focal Loss")

history_phase1 = model.fit(
    train_datagen.flow(X_train, y_train, batch_size=BATCH_SIZE),
    epochs=INITIAL_EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1
)

print(f"Fase 1 completada")

Modelo compilado - FASE 1: Training inicial
Epoch 1/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - auc: 0.4832 - loss: 1.3156 - precision: 0.0799 - recall: 0.5050
Epoch 1: val_loss improved from None to 1.27778, saving model to c:\Users\mlata\Documents\iajordy2\models\model_phase1_best.h5




[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 232ms/step - auc: 0.4993 - loss: 1.3008 - precision: 0.0851 - recall: 0.5094 - val_auc: 0.4982 - val_loss: 1.2778 - val_precision: 0.0962 - val_recall: 0.6740 - learning_rate: 5.0000e-04
Epoch 2/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step - auc: 0.4935 - loss: 1.2929 - precision: 0.0836 - recall: 0.4946
Epoch 2: val_loss improved from 1.27778 to 1.26505, saving model to c:\Users\mlata\Documents\iajordy2\models\model_phase1_best.h5




[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 201ms/step - auc: 0.4949 - loss: 1.2837 - precision: 0.0808 - recall: 0.4652 - val_auc: 0.4996 - val_loss: 1.2650 - val_precision: 0.1070 - val_recall: 0.6129 - learning_rate: 5.0000e-04
Epoch 3/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step - auc: 0.4844 - loss: 1.2788 - precision: 0.0866 - recall: 0.4849
Epoch 3: val_loss improved from 1.26505 to 1.26443, saving model to c:\Users\mlata\Documents\iajordy2\models\model_phase1_best.h5




[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 208ms/step - auc: 0.4954 - loss: 1.2774 - precision: 0.0862 - recall: 0.4917 - val_auc: 0.5000 - val_loss: 1.2644 - val_precision: 0.0756 - val_recall: 0.2886 - learning_rate: 5.0000e-04
Epoch 4/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - auc: 0.5024 - loss: 1.2781 - precision: 0.0792 - recall: 0.4939
Epoch 4: val_loss did not improve from 1.26443
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 208ms/step - auc: 0.4956 - loss: 1.2749 - precision: 0.0761 - recall: 0.4899 - val_auc: 0.5000 - val_loss: 1.2650 - val_precision: 0.0549 - val_recall: 0.3497 - learning_rate: 5.0000e-04
Epoch 5/30
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - auc: 0.4818 - loss: 1.2662 - precision: 0.0607 - recall: 0.4021
Epoch 5: val_loss did not improve from 1.26443
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 217ms/step - auc: 0.489

In [None]:
base_model.trainable = True
fine_tune_at = len(base_model.layers) - 40
for layer in base_model.layers[:fine_tune_at]:
    layer.trainable = False

model.compile(
    optimizer=optimizers.Adam(learning_rate=LEARNING_RATE_FINETUNING),
    loss=focal_loss,
    metrics=[
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc', multi_label=True)
    ]
)

train_datagen_ft = ImageDataGenerator(
    rotation_range=25,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
    vertical_flip=False
)

print(f"FASE 2: Fine-tuning con ultimas {len(base_model.layers) - fine_tune_at} capas descongeladas")

history_phase2 = model.fit(
    train_datagen_ft.flow(X_train, y_train, batch_size=BATCH_SIZE),
    epochs=FINETUNING_EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1
)

print(f"Fase 2 completada")

FASE 2: Fine-tuning con ultimas 40 capas descongeladas
Epoch 1/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - auc: 0.4972 - loss: 1.2751 - precision: 0.0885 - recall: 0.5031
Epoch 1: val_loss did not improve from 1.26443
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 233ms/step - auc: 0.5129 - loss: 1.2819 - precision: 0.0882 - recall: 0.5016 - val_auc: 0.5067 - val_loss: 1.2647 - val_precision: 0.1157 - val_recall: 0.5891 - learning_rate: 5.0000e-05
Epoch 2/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step - auc: 0.5013 - loss: 1.2757 - precision: 0.0808 - recall: 0.4591
Epoch 2: val_loss did not improve from 1.26443
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 227ms/step - auc: 0.5045 - loss: 1.2798 - precision: 0.0812 - recall: 0.4603 - val_auc: 0.4911 - val_loss: 1.2646 - val_precision: 0.1114 - val_recall: 0.5739 - learning_rate: 5.0000e-05
Epoch 3/40
[1m110/110[0m [32m━━━



[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 222ms/step - auc: 0.5046 - loss: 1.2751 - precision: 0.0781 - recall: 0.4097 - val_auc: 0.4968 - val_loss: 1.2642 - val_precision: 0.0601 - val_recall: 0.3735 - learning_rate: 2.5000e-05
Epoch 11/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - auc: 0.4886 - loss: 1.2616 - precision: 0.0764 - recall: 0.3893
Epoch 11: val_loss did not improve from 1.26417
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 221ms/step - auc: 0.4921 - loss: 1.2765 - precision: 0.0762 - recall: 0.3871 - val_auc: 0.5014 - val_loss: 1.2647 - val_precision: 0.0692 - val_recall: 0.3939 - learning_rate: 2.5000e-05
Epoch 12/40
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - auc: 0.4779 - loss: 1.2647 - precision: 0.0737 - recall: 0.3932
Epoch 12: val_loss did not improve from 1.26417
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 224ms/step - auc: 0

In [None]:
y_val_pred = model.predict(X_val, verbose=1)

# Buscar umbral optimo global por F1-micro (rango ajustado 0.2-0.8)
thresholds = np.arange(0.2, 0.85, 0.05)
f1_scores = []
for thresh in thresholds:
    y_val_pred_binary = (y_val_pred >= thresh).astype(int)
    f1_scores.append(f1_score(y_val, y_val_pred_binary, average='micro'))

best_idx = int(np.argmax(f1_scores))
best_threshold = float(thresholds[best_idx])
print(f"Threshold optimo (global): {best_threshold:.2f}")
print(f"F1-micro max (global): {f1_scores[best_idx]:.4f}")

# Umbral optimo por clase
best_thresholds = []
for c in range(NUM_CLASSES):
    f1_c = []
    for thresh in thresholds:
        pred_c = (y_val_pred[:, c] >= thresh).astype(int)
        f1_c.append(f1_score(y_val[:, c], pred_c, average='binary', zero_division=0))
    best_thresholds.append(float(thresholds[int(np.argmax(f1_c))]))

best_thresholds = np.array(best_thresholds)
print(f"Thresholds por clase (promedio): {best_thresholds.mean():.2f}")
print(f"Thresholds por clase (min-max): {best_thresholds.min():.2f} - {best_thresholds.max():.2f}")

# Metricas finales con thresholds por clase
y_val_pred_binary = (y_val_pred >= best_thresholds).astype(int)
positive_rate = y_val_pred_binary.mean()
metrics_phase2 = {
    'hamming_loss': hamming_loss(y_val, y_val_pred_binary),
    'subset_accuracy': accuracy_score(y_val, y_val_pred_binary),
    'f1_micro': f1_score(y_val, y_val_pred_binary, average='micro'),
    'f1_macro': f1_score(y_val, y_val_pred_binary, average='macro'),
    'precision_micro': precision_score(y_val, y_val_pred_binary, average='micro'),
    'recall_micro': recall_score(y_val, y_val_pred_binary, average='micro'),
}

print("\nMETRICAS FINALES EN VALIDACION")
for metric, value in metrics_phase2.items():
    print(f"{metric}: {value:.4f}")
print(f"Tasa de positivos predichos: {positive_rate:.4f}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 427ms/step
Threshold optimo (global): 0.05
F1-micro max (global): 0.1456
Thresholds por clase (promedio): 0.07
METRICAS FINALES EN VALIDACION
hamming_loss: 0.9211
subset_accuracy: 0.0000
f1_micro: 0.1457
f1_macro: 0.1366
precision_micro: 0.0786
recall_micro: 1.0000
Tasa de positivos predichos: 0.9996


In [12]:
model.save(MODELS_DIR / 'voc_multilabel_final.h5')
model.save(MODELS_DIR / 'voc_multilabel_final.keras')
print(f"Modelo guardado")

with open(MODELS_DIR / 'training_results.json', 'w') as f:
    json.dump({
        'metrics': metrics_phase2,
        'config': {
            'initial_epochs': INITIAL_EPOCHS,
            'finetuning_epochs': FINETUNING_EPOCHS,
            'batch_size': BATCH_SIZE,
            'img_size': IMG_SIZE,
            'learning_rate_initial': LEARNING_RATE_INITIAL,
            'learning_rate_finetuning': LEARNING_RATE_FINETUNING
        },
        'thresholds': best_thresholds.tolist()
    }, f, indent=2)
print(f"Resultados guardados")



Modelo guardado
Resultados guardados
