In [1]:
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, 
                                     BatchNormalization, GlobalAveragePooling2D)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import datetime


In [2]:
df = pd.read_csv("../data/raw/HAM10000_metadata.csv")
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
df.dx.value_counts()

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

### Clasificación de lesiones de piel por tipo (`dx`)

En conjuntos de datos dermatológicos como HAM10000, las lesiones se clasifican en siete tipos según el campo `dx`. A continuación se indica cuáles se consideran benignas y cuáles malignas:

#### Lesiones benignas
- **`nv` (Nevus melanocítico)**: Lunares comunes, generalmente benignos.
- **`bkl` (Queratosis benigna)**: Incluye lentigos solares y queratosis seborreicas.
- **`df` (Dermatofibroma)**: Tumores benignos de la piel, firmes y pequeños.
- **`vasc` (Lesiones vasculares)**: Incluye angiomas y otras lesiones vasculares, consideradas benignas.

#### Lesiones malignas
- **`mel` (Melanoma)**: Cáncer de piel agresivo y potencialmente mortal.
- **`bcc` (Carcinoma basocelular)**: Cáncer de piel de crecimiento lento pero invasivo.
- **`akiec` (Queratosis actínica / carcinoma intraepidérmico)**: Lesión precancerosa o carcinoma in situ.

Esta clasificación permite agrupar las clases en dos categorías para tareas de clasificación binaria: benignas vs malignas.


In [4]:
data_dir = "../data/processed/train"

# Augmentación para entrenamiento
datagen = ImageDataGenerator(
    rescale=1./255, 
    rotation_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    brightness_range=[0.8,1.2],
    shear_range=0.2,
    horizontal_flip=True, # Contempla manchas simétricas
    validation_split=0.15,  # Separación interna
)

# Generador de entrenamiento
train_generator = datagen.flow_from_directory(
    data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='training',
    shuffle=True
)

val_generator = datagen.flow_from_directory(
    data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='validation',
    shuffle=False
)

print(pd.Series(val_generator.classes).value_counts())
print(pd.Series(train_generator.classes).value_counts())

Found 7662 images belonging to 2 classes.
Found 1351 images belonging to 2 classes.
0    1069
1     282
Name: count, dtype: int64
0    6058
1    1604
Name: count, dtype: int64


In [5]:
labels = train_generator.classes

# Calculamos los pesos
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(labels),
    y=labels
)

# Lo convertimos en diccionario para Keras
class_weights = dict(enumerate(class_weights))
print(class_weights)


{0: np.float64(0.6323869263783427), 1: np.float64(2.388403990024938)}


## Modelo

In [6]:
def clasificador_binario(input_shape=(224,224,3), lr=1e-3):
    entrada = Input(shape=input_shape, name='entrada_imagen')

    # Bloque 1
    x = Conv2D(32, (3,3), activation='relu', padding='same', kernel_regularizer=l2(1e-4))(entrada)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2,2))(x)

    # Bloque 2
    x = Conv2D(64, (3,3), activation='relu', padding='same', kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2,2))(x)

    # Bloque 3
    x = Conv2D(128, (3,3), activation='relu', padding='same', kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2,2))(x)

    # Bloque 4 (extra para más capacidad)
    x = Conv2D(256, (3,3), activation='relu', padding='same', kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2,2))(x)

    # Global pooling en lugar de Flatten (reduce parámetros)
    x = GlobalAveragePooling2D()(x)

    # Capa densa
    x = Dense(128, activation='relu', kernel_regularizer=l2(1e-4))(x)
    x = Dropout(0.5)(x)

    salida = Dense(1, activation='sigmoid', name='salida_binaria')(x)

    modelo = Model(inputs=entrada, outputs=salida, name='cnn_mejorada')
    modelo.compile(optimizer=Adam(learning_rate=lr),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])
    return modelo



In [7]:
modelo = clasificador_binario()  

history = modelo.fit(
    train_generator,
    validation_data=val_generator,
    epochs=20,
    class_weight=class_weights
)


  self._warn_if_super_not_called()


Epoch 1/20
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 1s/step - accuracy: 0.5993 - loss: 0.7007 - val_accuracy: 0.2095 - val_loss: 1.2901
Epoch 2/20
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 2s/step - accuracy: 0.6175 - loss: 0.6145 - val_accuracy: 0.7750 - val_loss: 0.4907
Epoch 3/20
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 1s/step - accuracy: 0.6510 - loss: 0.5844 - val_accuracy: 0.5722 - val_loss: 0.7098
Epoch 4/20
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 1s/step - accuracy: 0.6587 - loss: 0.5631 - val_accuracy: 0.7572 - val_loss: 0.4661
Epoch 5/20
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 1s/step - accuracy: 0.6661 - loss: 0.5697 - val_accuracy: 0.7950 - val_loss: 0.5313
Epoch 6/20
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 2s/step - accuracy: 0.6774 - loss: 0.5558 - val_accuracy: 0.6950 - val_loss: 0.5521
Epoch 7/20
[1m240/240

In [20]:
# Current timestamp
timestamp = datetime.datetime.now().strftime("%m_%d_h%H_%M")

# Carpeta donde guardar
save_dir = "../models/classifier"
os.makedirs(save_dir, exist_ok=True)

modelo.save(f"../models/classifier/new_model_{timestamp}.keras")

In [15]:
modelo.summary()