In [1]:
import os
import numpy as np
import plotly.express as px

from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.keras.applications import VGG16, InceptionV3, ResNet50
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, GlobalAveragePooling2D, BatchNormalization, Add, ReLU, Lambda
from tensorflow.keras.models import Model

# Global hyperparameters shared by the cells below.
AUTOTUNE = tf.data.AUTOTUNE  # tf.data tuning sentinel; not referenced in the cells visible here
IMG_SIZE = (128, 128)  # default target_size for the image generators and encoder input
EMBED_DIM = 128  # width of the contrastive projection (embedding) vector
BATCH_SIZE = 100  # default batch size for the image generators
EPOCHS = 150  # NOTE(review): not referenced by the training calls visible in this notebook
TEMPERATURE = 0.05  # temperature passed to SupConLoss when the encoder is trained


2025-11-19 18:51:24.566674: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763578284.589489    1018 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763578284.596399    1018 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
import os

def contar_archivos_en_carpetas(directorio):
    """Print how many regular files each immediate subfolder of ``directorio`` holds.

    Entries of ``directorio`` that are not directories are skipped, and nested
    subfolders inside a counted folder are not included in its count.

    Args:
        directorio: path of the directory whose subfolders are inspected.
    """
    for nombre in os.listdir(directorio):
        subdirectorio = os.path.join(directorio, nombre)
        if not os.path.isdir(subdirectorio):
            continue
        # Count regular files only; subfolders do not add to the total.
        total = sum(
            1
            for entrada in os.listdir(subdirectorio)
            if os.path.isfile(os.path.join(subdirectorio, entrada))
        )
        print(f"Carpeta: {nombre} -> {total} archivos")

# Example usage: per-class file counts for the malignant-classes training split.
directorio_base = "/kaggle/input/hampreprocessed/malignas_classes/train"  # Change this to your own path
contar_archivos_en_carpetas(directorio_base)


Carpeta: mel -> 1002 archivos
Carpeta: akiec -> 296 archivos
Carpeta: bcc -> 461 archivos


In [3]:
import os

def contar_archivos_por_clase(directorio_base):
    """Print per-class file counts for the train and test splits plus combined totals.

    For each of the ``train`` and ``test`` subfolders of ``directorio_base`` the
    number of regular files in every class folder is printed. A split whose
    folder does not exist is reported and skipped. Finally the counts are summed
    per class name across the splits that were found.

    Args:
        directorio_base: directory expected to contain ``train`` and ``test``.
    """
    totales = {}  # class name -> files accumulated over the splits

    for particion in ("train", "test"):
        ruta_particion = os.path.join(directorio_base, particion)
        if not os.path.exists(ruta_particion):
            print(f"No existe la carpeta: {ruta_particion}")
            continue

        print(f"\nConjunto: {particion}")
        for nombre in os.listdir(ruta_particion):
            ruta_clase = os.path.join(ruta_particion, nombre)
            if not os.path.isdir(ruta_clase):
                continue
            # Regular files only; nested folders are ignored.
            n_archivos = sum(
                1
                for entrada in os.listdir(ruta_clase)
                if os.path.isfile(os.path.join(ruta_clase, entrada))
            )
            print(f"  Carpeta: {nombre} -> {n_archivos} archivos")
            totales[nombre] = totales.get(nombre, 0) + n_archivos

    # Combined totals, in the order the classes were first encountered.
    print("\nSuma total por clase (train + test):")
    for nombre in totales:
        print(f"  {nombre} -> {totales[nombre]} archivos")

# Example usage: benign/malignant file counts over the train and test splits.
directorio_base = "/kaggle/input/hampreprocessed/processed"  # Base path that contains train and test
contar_archivos_por_clase(directorio_base)



Conjunto: train
  Carpeta: benignas -> 7254 archivos
  Carpeta: malignas -> 1759 archivos

Conjunto: test
  Carpeta: benignas -> 807 archivos
  Carpeta: malignas -> 195 archivos

Suma total por clase (train + test):
  benignas -> 8061 archivos
  malignas -> 1954 archivos


## Importación de datos

In [4]:
# Root folder (one subfolder per class) from which the train/validation generators are built.
data_dir = "/kaggle/input/hampreprocessed/malignas_classes/train"

def get_generators(data_dir, preprocess_fn, target_size=IMG_SIZE, batch_size=BATCH_SIZE, validation_split=0.15):
    """Build the training and validation image iterators for a class-per-folder dataset.

    Only the training subset is augmented. The validation subset goes through
    ``preprocess_fn`` alone, so validation metrics are computed on the
    unmodified images and are repeatable between runs.

    Args:
        data_dir: directory containing one subfolder per class.
        preprocess_fn: function applied to every image (e.g. pixel scaling).
        target_size: (height, width) every image is resized to.
        batch_size: number of images per batch.
        validation_split: fraction of the images reserved for validation.

    Returns:
        Tuple ``(train_generator, val_generator)`` yielding
        ``(images, one_hot_labels)`` batches.
    """
    train_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_fn,
        rotation_range=60,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.12,
        brightness_range=[0.8, 1.2],
        shear_range=0.2,
        vertical_flip=True,
        horizontal_flip=True,
        validation_split=validation_split
    )

    # Same validation_split so both generators partition the files identically,
    # but without the random transforms: validation data must not be augmented.
    val_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_fn,
        validation_split=validation_split
    )

    train_generator = train_datagen.flow_from_directory(
        data_dir,
        target_size=target_size,
        batch_size=batch_size,
        class_mode='categorical',
        subset='training',
        shuffle=True
    )

    val_generator = val_datagen.flow_from_directory(
        data_dir,
        target_size=target_size,
        batch_size=batch_size,
        class_mode='categorical',
        subset='validation',
        shuffle=False
    )

    return train_generator, val_generator

In [5]:
# Create the generators; the preprocessing lambda divides every pixel value by 255.
train_generator, val_generator = get_generators(data_dir, lambda x: x/255.)
# Number of classes and their names, taken from the generator's class_indices mapping.
num_classes = len(train_generator.class_indices)
class_names = list(train_generator.class_indices.keys())

Found 1496 images belonging to 3 classes.
Found 263 images belonging to 3 classes.


## Modelo generador de embeddings

In [6]:
def _residual_block(x, filters, pool=True):
    """Apply two 3x3 conv+BN layers with a 1x1 conv+BN shortcut, then ReLU and optional max-pooling."""
    block_input = x
    x = Conv2D(filters, 3, padding='same', use_bias=False)(x)
    x = BatchNormalization()(x)
    x = ReLU()(x)
    x = Conv2D(filters, 3, padding='same', use_bias=False)(x)
    x = BatchNormalization()(x)
    # 1x1 projection so the shortcut has the same channel count as the main path.
    shortcut = Conv2D(filters, 1, padding='same', use_bias=False)(block_input)
    shortcut = BatchNormalization()(shortcut)
    x = Add()([x, shortcut])
    x = ReLU()(x)
    if pool:
        x = MaxPooling2D()(x)
    return x


def contrastive_encoder(input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3), embedding_dim=EMBED_DIM):
    """Build the convolutional encoder that maps images to L2-normalised embeddings.

    The network stacks three residual blocks (64, 128 and 256 filters; the first
    two are followed by max-pooling), global average pooling, a 512-unit dense
    layer with batch normalisation, and a two-layer projection head whose output
    is L2-normalised along the feature axis.

    Args:
        input_shape: (height, width, channels) of the input images.
        embedding_dim: width of both projection-head layers, and therefore of
            the returned embedding. It was previously ignored in favour of the
            global EMBED_DIM; the result is identical when the two are equal.

    Returns:
        A Keras ``Model`` named "ContrastiveEncoder".
    """
    inputs = Input(shape=input_shape)

    x = _residual_block(inputs, 64)
    x = _residual_block(x, 128)
    x = _residual_block(x, 256, pool=False)

    x = GlobalAveragePooling2D()(x)
    x = Dense(512, activation='relu')(x)
    x = BatchNormalization()(x)

    # Projection (contrastive head)
    p = Dense(embedding_dim, activation='relu')(x)
    p = Dense(embedding_dim)(p)
    outputs = Lambda(lambda t: tf.math.l2_normalize(t, axis=1), name="proj_norm")(p)

    return Model(inputs, outputs, name="ContrastiveEncoder")


In [7]:
class SupConLoss(tf.keras.losses.Loss):
    """Supervised contrastive loss computed over one batch of embeddings.

    Samples that share a label act as positives for each other; every other
    sample in the batch, except the anchor itself, enters the softmax
    denominator.
    """

    def __init__(self, temperature=0.1, name="supcon"):
        """Store the temperature used to scale the similarity matrix.

        Args:
            temperature: divisor applied to the pairwise similarities.
            name: loss name forwarded to the Keras base class.
        """
        super().__init__(name=name)
        self.temperature = temperature

    def call(self, y_true, features):
        """
        SupConLoss implementation.
        Args:
            y_true: [batch] integer class labels, or [batch, classes] one-hot
                labels (converted to integers with argmax below).
            features: [batch, dim] embeddings.
        Returns:
            Scalar: negated mean, over all anchors in the batch, of the average
            log-probability assigned to each anchor's positives.
        """
        # Normalize embeddings
        features = tf.math.l2_normalize(features, axis=1)
        batch_size = tf.shape(features)[0]

        # Similarity matrix
        sim = tf.matmul(features, features, transpose_b=True)  # [B, B]
        sim = sim / self.temperature

        # Ensure labels are integers, not one-hot
        if y_true.shape.ndims > 1 and y_true.shape[-1] > 1:
            y_true = tf.argmax(y_true, axis=-1)

        # mask[i, j] is 1 where samples i and j carry the same label
        labels = tf.reshape(y_true, [-1, 1])  # [B, 1]
        mask = tf.equal(labels, tf.transpose(labels))  # [B, B]
        mask = tf.cast(mask, tf.float32)

        # Remove self-contrast
        eye = tf.eye(batch_size, dtype=tf.float32)
        logits_mask = tf.ones_like(mask) - eye
        mask = mask * logits_mask

        # Log-softmax denominator excluding self
        # NOTE(review): the row maximum is taken over the full row, diagonal
        # included, while the denominator excludes the diagonal. When every
        # off-diagonal similarity is far below the self-similarity, the 1e-9
        # epsilon may dominate the denominator — confirm this is acceptable.
        sim_max = tf.reduce_max(sim, axis=1, keepdims=True)
        sim = sim - sim_max
        exp_sim = tf.exp(sim) * logits_mask
        denom = tf.reduce_sum(exp_sim, axis=1, keepdims=True) + 1e-9
        log_prob = sim - tf.math.log(denom)

        # Average log-prob of positives per anchor
        # An anchor with no positive in the batch has a zero numerator, so it
        # contributes 0 here but is still counted by the batch mean below.
        pos_count = tf.reduce_sum(mask, axis=1) + 1e-9
        mean_log_pos = tf.reduce_sum(mask * log_prob, axis=1) / pos_count

        loss = -tf.reduce_mean(mean_log_pos)
        return loss


## Entrenar representaciones

In [8]:
def train_supcon(model, train_generator, val_generator, loss_fn, optimizer, epochs=50):
    """Train ``model`` with a custom loop and report train/validation loss per epoch.

    Args:
        model: Keras model mapping image batches to embeddings.
        train_generator: iterator yielding ``(images, labels)`` training batches;
            must expose ``samples`` and ``batch_size``.
        val_generator: iterator yielding ``(images, labels)`` validation batches;
            must expose ``samples`` and ``batch_size``.
        loss_fn: callable ``loss_fn(labels, embeddings)`` returning a scalar loss.
        optimizer: Keras optimizer used to apply the gradients.
        epochs: number of passes over the training data.
    """
    # Ceiling division so that each epoch consumes every batch, including the
    # final partial one. With floor division the validation loop read a rotating
    # subset of the validation set on each epoch (the generators cycle), and a
    # generator holding fewer samples than one batch produced zero steps.
    steps_per_epoch = (
        train_generator.samples + train_generator.batch_size - 1
    ) // train_generator.batch_size
    validation_steps = (
        val_generator.samples + val_generator.batch_size - 1
    ) // val_generator.batch_size

    train_loss = tf.keras.metrics.Mean(name="train_loss")
    val_loss = tf.keras.metrics.Mean(name="val_loss")

    for epoch in range(epochs):
        train_loss.reset_state()
        val_loss.reset_state()

        # Training
        for _ in range(steps_per_epoch):
            images, labels = next(train_generator)
            with tf.GradientTape() as tape:
                embeddings = model(images, training=True)
                loss = loss_fn(labels, embeddings)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            train_loss.update_state(loss)

        # Validation (inference mode, no weight updates)
        for _ in range(validation_steps):
            images, labels = next(val_generator)
            embeddings = model(images, training=False)
            loss = loss_fn(labels, embeddings)
            val_loss.update_state(loss)

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss.result():.4f} - Val Loss: {val_loss.result():.4f}")


In [None]:
# Build the encoder and train it with the supervised contrastive loss.
# NOTE(review): epochs=120 and learning_rate=8e-4 are hard-coded here; the EPOCHS
# constant defined with the other hyperparameters is not used by this call.
encoder = contrastive_encoder(embedding_dim=EMBED_DIM)
loss_fn = SupConLoss(temperature=TEMPERATURE)
optimizer = Adam(learning_rate=8e-4)
train_supcon(encoder, train_generator, val_generator, loss_fn, optimizer, epochs=120)

I0000 00:00:1763578306.890464    1018 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
I0000 00:00:1763578308.958754    1018 cuda_dnn.cc:529] Loaded cuDNN version 90300


Epoch 1/120 - Train Loss: 6.2461 - Val Loss: 4.6079


## Entrenar clasificador

In [None]:
# Classification head: a single softmax Dense layer stacked on the encoder
# output, sharing the encoder's input and layers.
# NOTE(review): the encoder layers are not frozen here, so fitting `classifier`
# also updates the encoder weights — confirm this is intended; a linear probe
# would set encoder.trainable = False before compiling.
x = encoder.output
clf = Dense(num_classes, activation="softmax")(x)
classifier = Model(encoder.input, clf)
classifier.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])


In [None]:
# "balanced" class weights computed from the training generator's `classes`
# array, stored as a dict keyed by position (0, 1, ...) in np.unique(labels).
# presumably `classes` holds one integer class index per training image — verify.
labels = train_generator.classes
class_weights = dict(enumerate(compute_class_weight(
    class_weight="balanced",
    classes=np.unique(labels),
    y=labels
)))

# Train the classifier for 20 epochs, weighting the loss with class_weights.
classifier.fit(
    train_generator,
    validation_data=val_generator,
    epochs=20,
    class_weight=class_weights
)


## Evaluate KNN

In [None]:
def evaluate_knn(model, val_generator, class_names, k=5, train_generator=None):
    """Evaluate the embeddings with a cosine k-NN classifier and print the metrics.

    Args:
        model: callable mapping an image batch to an embedding tensor.
        val_generator: source of ``(images, labels)`` evaluation batches. Either
            an indexable Keras iterator or any finite iterable of batches.
        class_names: class names ordered by class index.
        k: number of neighbours used by the classifier.
        train_generator: optional source of reference batches. When given, the
            k-NN is fitted on its embeddings and scored on the validation
            embeddings. When omitted, predictions are obtained by leave-one-out
            over the validation embeddings, so a sample is never its own
            neighbour.

    Prints the classification report and the confusion matrix.
    """
    def _embed(source):
        # Keras directory iterators never stop under a plain `for` loop, so
        # indexable sources are read by position exactly once.
        if hasattr(source, "__getitem__") and hasattr(source, "__len__"):
            batches = (source[i] for i in range(len(source)))
        else:
            batches = source

        embs, labs = [], []
        for imgs, labels in batches:
            embs.append(np.asarray(model(imgs, training=False)))
            labels = np.asarray(labels)
            # One-hot labels become integer class indices.
            if labels.ndim > 1 and labels.shape[-1] > 1:
                labels = np.argmax(labels, axis=-1)
            labs.append(labels.reshape(-1).astype(int))
        if not embs:
            raise ValueError("The generator produced no batches to embed.")
        return np.concatenate(embs, axis=0), np.concatenate(labs, axis=0)

    X, y = _embed(val_generator)
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')

    if train_generator is not None:
        X_ref, y_ref = _embed(train_generator)
        knn.fit(X_ref, y_ref)
        y_pred = knn.predict(X)
    else:
        # Fitting and predicting on the same points makes each sample its own
        # nearest neighbour and inflates every metric; leave-one-out avoids it.
        from sklearn.model_selection import LeaveOneOut, cross_val_predict
        y_pred = cross_val_predict(knn, X, y, cv=LeaveOneOut())

    # Explicit label list keeps the report aligned with class_names even when
    # a class is absent from the evaluated samples.
    label_ids = np.arange(len(class_names))
    print(classification_report(y, y_pred, labels=label_ids,
                                target_names=class_names, zero_division=0))
    print(confusion_matrix(y, y_pred, labels=label_ids))


In [None]:
# Evaluate the encoder embeddings with a 7-neighbour k-NN, reusing the validation
# generator and class names created earlier. The placeholder reassignment of
# `data_dir` was dropped: it was unused here and overwrote the real dataset path.
evaluate_knn(encoder, val_generator, class_names, k=7)


In [None]:
def visualize_embeddings_3d(model, val_ds, class_names, method="tsne"):
    """Project the embeddings of ``val_ds`` to 3-D and show an interactive scatter plot.

    Args:
        model: callable mapping an image batch to an embedding tensor.
        val_ds: source of ``(images, labels)`` batches. Either an indexable
            Keras iterator or any finite iterable of batches (e.g. a tf.data
            dataset). Labels may be integer class indices or one-hot vectors,
            given as tensors or NumPy arrays.
        class_names: class names ordered by class index, used for the colours.
        method: "tsne" for t-SNE; any other value selects PCA.
    """
    # 1. Extract embeddings and labels
    # Keras directory iterators never stop under a plain `for` loop, so
    # indexable sources are read by position exactly once.
    if hasattr(val_ds, "__getitem__") and hasattr(val_ds, "__len__"):
        batches = (val_ds[i] for i in range(len(val_ds)))
    else:
        batches = val_ds

    embs, labs = [], []
    for imgs, labels in batches:
        embs.append(np.asarray(model(imgs, training=False)))
        labels = np.asarray(labels)  # accepts tensors and NumPy arrays alike
        # One-hot labels become integer class indices.
        if labels.ndim > 1 and labels.shape[-1] > 1:
            labels = np.argmax(labels, axis=-1)
        labs.append(labels.reshape(-1).astype(int))
    if not embs:
        raise ValueError("val_ds produced no batches to visualise.")
    X = np.concatenate(embs, axis=0)
    y = np.concatenate(labs, axis=0)

    # 2. Reduce to 3-D
    if method == "tsne":
        # t-SNE requires perplexity < n_samples; the value stays at 30 whenever
        # there are more than 30 samples.
        perplexity = min(30, len(X) - 1)
        reducer = TSNE(n_components=3, perplexity=perplexity, learning_rate=200, random_state=42)
    else:
        reducer = PCA(n_components=3)
    X_reduced = reducer.fit_transform(X)

    # 3. Visualise with Plotly
    fig = px.scatter_3d(
        x=X_reduced[:,0], y=X_reduced[:,1], z=X_reduced[:,2],
        color=[class_names[i] for i in y],
        title=f"Embeddings en 3D ({method.upper()})",
        opacity=0.7
    )
    fig.show()
