# Creación del modelo

In [None]:
# Instalación de dependencias
!pip install -q kagglehub tabulate matplotlib scikit-learn seaborn

## Recolección de las imágenes

In [None]:
import kagglehub, os, random, shutil
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.applications import MobileNetV2, ResNet50
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Download the dataset and locate the directory that holds one sub-folder per age.
path = kagglehub.dataset_download("frabbisw/facial-age")
original_dir = os.path.join(path, "face_age")
if not os.path.isdir(original_dir):
    original_dir = path

random.seed(42)
data_dir = "data"
LABELS = {'<18': range(1,18), '18+': range(18,200)}
MAX_PER_CLASS = 10000

# Collect (image_path, label) pairs.
# os.listdir returns entries in arbitrary order, so both listings are sorted:
# otherwise the seeded sampling and splitting below would not be reproducible.
all_imgs = []
for sub in sorted(os.listdir(original_dir)):
    folder = os.path.join(original_dir, sub)
    if not os.path.isdir(folder):
        continue  # stray files next to the age folders
    try:
        # Folder names are parsed as integer ages; a trailing '+' is tolerated.
        age = int(sub.rstrip('+'))
    except ValueError:
        continue  # not an age folder
    label = '<18' if age < 18 else '18+'
    for fn in sorted(os.listdir(folder)):
        all_imgs.append((os.path.join(folder, fn), label))

# Group by label and cap every class at MAX_PER_CLASS images.
# Note: this only limits the larger class, it does not equalise class sizes.
by_label = {'<18': [], '18+': []}
for path_img, lab in all_imgs:
    by_label[lab].append(path_img)

sampled = []
for lab, paths in by_label.items():
    n = min(len(paths), MAX_PER_CLASS)
    sampled += [(p, lab) for p in random.sample(paths, n)]

# Stratified 70 / 15 / 15 split into train / val / test.
labels = [lab for _, lab in sampled]
train, temp = train_test_split(sampled, test_size=0.30, stratify=labels, random_state=42)
labels_temp = [lab for _, lab in temp]
val, test = train_test_split(temp, test_size=0.50, stratify=labels_temp, random_state=42)

# Materialise the splits on disk as <data_dir>/<split>/<label>/<file>.
for split, dataset in (('train', train), ('val', val), ('test', test)):
    split_dir = os.path.join(data_dir, split)
    # Rebuild the split from scratch: files left over from a previous run
    # (different seed, cap or listing order) would otherwise stay in place and
    # could put the same image in more than one split.
    shutil.rmtree(split_dir, ignore_errors=True)
    for label in ['<18', '18+']:
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)
    for src, lab in dataset:
        # Prefix with the age-folder name so equally named files coming from
        # different age folders cannot overwrite (or silently skip) each other.
        age_folder = os.path.basename(os.path.dirname(src))
        dst = os.path.join(split_dir, lab, f"{age_folder}_{os.path.basename(src)}")
        shutil.copy(src, dst)


Downloading from https://www.kaggle.com/api/v1/datasets/download/frabbisw/facial-age?dataset_version_number=1...


100%|██████████| 840M/840M [00:08<00:00, 102MB/s]

Extracting files...





## Configuración y preparación de datos

In [None]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 30
SEED = 42

TRAIN_DIR = 'data/train'
VAL_DIR = 'data/val'
TEST_DIR = 'data/test'

# Seed Python, NumPy and TensorFlow so weight initialisation, shuffling and
# augmentation are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(SEED)

# Training images are augmented on the fly; validation/test images are only rescaled.
train_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.7, 1.3],
    zoom_range=0.25,
    shear_range=0.18,
    fill_mode='nearest'
)
val_test_gen = ImageDataGenerator(rescale=1./255)

train_photos = train_gen.flow_from_directory(
    TRAIN_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    seed=SEED,
)

# shuffle=False keeps the batch order aligned with `.classes`, which the
# evaluation code relies on when comparing predictions against labels.
val_photos = val_test_gen.flow_from_directory(
    VAL_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False,
)

test_photos = val_test_gen.flow_from_directory(
    TEST_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False,
)

# Class weights inversely proportional to class frequency in the training split.
counter = Counter(train_photos.classes)
total = sum(counter.values())
class_weight = {int(i): total/v for i, v in counter.items()}
print(f"Pesos de clase: {class_weight}")

# Factory used to build one binary classifier per backbone architecture.
def build_cnn_model(base_model_class=MobileNetV2, num_dense=64, dropout1=0.5, dropout2=0.3,
                    trainable_layers=20, learning_rate=1e-4):
    """Build and compile a binary classifier on top of an ImageNet-pretrained backbone.

    Args:
        base_model_class: Keras application constructor (e.g. ``MobileNetV2``,
            ``ResNet50``); called with ``include_top=False`` and ImageNet weights.
        num_dense: Units of the hidden dense layer in the classification head.
        dropout1: Dropout rate applied after global average pooling.
        dropout2: Dropout rate applied after the hidden dense layer.
        trainable_layers: Number of trailing backbone layers left trainable for
            fine-tuning (0 freezes the whole backbone). Defaults to 20.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` with a single sigmoid output, trained with binary
        cross-entropy and reporting accuracy and AUC (metric name ``auc``).
    """
    backbone = base_model_class(input_shape=(*IMG_SIZE, 3), include_top=False, weights='imagenet')

    # Fine-tune only the last `trainable_layers` layers of the backbone.
    # The frozen count is computed explicitly because a `[:-0]` slice would be
    # empty and therefore freeze nothing.
    backbone.trainable = True
    n_frozen = max(len(backbone.layers) - trainable_layers, 0)
    for frozen_layer in backbone.layers[:n_frozen]:
        frozen_layer.trainable = False

    # NOTE(review): the data generators in this notebook scale pixels with
    # rescale=1/255; Keras applications document a model-specific
    # `preprocess_input` - confirm the [0, 1] range is intended for each backbone.

    # Classification head: pool -> BN -> dropout -> dense -> BN -> dropout -> sigmoid.
    features = layers.GlobalAveragePooling2D()(backbone.output)
    features = layers.BatchNormalization()(features)
    features = layers.Dropout(dropout1)(features)

    hidden = layers.Dense(num_dense, activation='relu')(features)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(dropout2)(hidden)

    output = layers.Dense(1, activation='sigmoid')(hidden)

    model = models.Model(backbone.input, output)

    model.compile(optimizer=optimizers.Adam(learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    return model

# Backbones to compare: (constructor, display name).
modelos = [
    (MobileNetV2,     "MobileNetV2"),
    (ResNet50,        "ResNet50")
]

# Output folder for checkpoints, thresholds and figures.
m_path = '/content/modelos'
os.makedirs(m_path, exist_ok=True)

# Per-model artefacts collected for the comparison plots.
resultados = []
histories = {}
thresholds = {}
confusions = {}

for model_class, model_name in modelos:
    print("\n//////////////////////////////////////")
    print(f"\n Entrenando modelo: {model_name}")

    # Stop when validation AUC stops improving and roll back to the best weights.
    es = callbacks.EarlyStopping(monitor='val_auc', patience=8, restore_best_weights=True, mode='max')

    # Persist the best model (by validation AUC) to disk.
    mc = callbacks.ModelCheckpoint(
        filepath=f"{m_path}/{model_name}.keras",
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1
    )

    model = build_cnn_model(base_model_class=model_class, num_dense=64, dropout1=0.5, dropout2=0.3)

    history = model.fit(
        train_photos,
        epochs=EPOCHS,
        validation_data=val_photos,
        callbacks=[es, mc],
        class_weight=class_weight,
        verbose=1
    )
    histories[model_name] = history

    # Choose the decision threshold on the VALIDATION split by maximising
    # Youden's J (TPR - FPR). Tuning it on the test split would leak test
    # information into the reported metrics.
    y_val = val_photos.classes
    val_proba = model.predict(val_photos).ravel()
    fpr, tpr, lista_umbral = roc_curve(y_val, val_proba)
    optimal_idx = np.argmax(tpr - fpr)
    umbral_optimo = lista_umbral[optimal_idx]
    thresholds[model_name] = umbral_optimo

    # Save the threshold next to the model checkpoint.
    threshold_path = f"{m_path}/{model_name}_threshold.txt"
    with open(threshold_path, "w") as f:
        f.write(str(umbral_optimo))
    print(f"Umbral guardado en: {threshold_path}")

    # Final metrics on the held-out TEST split. The labels must come from the
    # same iterator that produced the predictions (test_photos is not shuffled,
    # so `.classes` lines up with the prediction order).
    y_true = test_photos.classes
    y_pred_proba = model.predict(test_photos).ravel()
    y_pred = (y_pred_proba >= umbral_optimo).astype(int)
    acc = (y_pred == y_true).mean()
    auc = roc_auc_score(y_true, y_pred_proba)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never seen.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp) if tp+fp > 0 else 0
    recall = tp / (tp + fn) if tp+fn > 0 else 0
    resultados.append({
        "Modelo": model_name,
        "Accuracy": round(acc,4),
        "AUC": round(auc,4),
        "Precision": round(precision,4),
        "Recall": round(recall,4),
        "Threshold": round(umbral_optimo, 3)
    })
    confusions[model_name] = cm

Found 6844 images belonging to 2 classes.
Found 1467 images belonging to 2 classes.
Found 1467 images belonging to 2 classes.
Pesos de clase: {0: 1.6704906028801563, 1: 2.4914452129595923}

//////////////////////////////////////

 Entrenando modelo: MobileNetV2
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.7423 - auc: 0.8218 - loss: 1.1006
Epoch 1: val_auc improved from -inf to 0.93579, saving model to /content/modelos/MobileNetV2.keras
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m531s[0m 2s/step - accuracy: 0.7426 - auc: 0.8221 - loss: 1.0996 - val_accuracy: 0.8439 - val_auc: 0.9358 - val_loss: 0.3693
Epoch 2/30
[1m158/214[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m1:51[0m 2s/step - accuracy: 0.8611 - auc: 0.9350 - loss: 0.6727

## Gráficas

In [None]:
results_df = pd.DataFrame(resultados)

# Bar chart comparing Accuracy, AUC, Precision and Recall across models.
metrics = ["Accuracy", "AUC", "Precision", "Recall"]
fig, ax = plt.subplots(figsize=(10,6))
results_df.set_index("Modelo")[metrics].plot(kind='bar', ax=ax)
ax.set_title("Comparativa de Métricas por Modelo")
ax.set_ylabel("Valor")
ax.set_ylim(0, 1)
ax.tick_params(axis='x', rotation=0)
ax.legend(loc='lower right')
fig.tight_layout()
fig.savefig(f"{m_path}/grafica_barras_metricas.png")
plt.show()

# Line chart of validation accuracy per epoch for every trained model.
fig, ax = plt.subplots(figsize=(10,6))
for model_name, history in histories.items():
    ax.plot(history.history['val_accuracy'], label=f'{model_name} (val)')
ax.set_title('Evolución de la Accuracy en Validación')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
fig.tight_layout()
plt.show()

# Confusion matrix of the model with the highest AUC.
mejor_modelo = results_df.sort_values('AUC', ascending=False)['Modelo'].iloc[0]
cm = confusions[mejor_modelo]
# Tick labels must follow the label indices assigned by flow_from_directory
# (alphanumeric folder order, so '18+' -> 0 and '<18' -> 1). Derive them from
# class_indices instead of hard-coding an order.
class_names = sorted(test_photos.class_indices, key=test_photos.class_indices.get)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title(f"Matriz de Confusión - {mejor_modelo}")
ax.set_ylabel('Real')
ax.set_xlabel('Predicho')
fig.tight_layout()
plt.show()