In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd
import h5py
import io
from PIL import Image
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from operator import itemgetter
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
import os

In [2]:
def is_kaggle():
    return os.path.exists('/kaggle')

class Config:
    BASE_PATH = '/kaggle/input/isic-2024-challenge/' if is_kaggle() else 'isic-2024-challenge/'
    TRAIN_IMAGE_PATH = 'train-image.hdf5'
    TRAIN_METADATA_PATH = 'train-metadata.csv'
    TEST_IMAGE_PATH = 'test-image.hdf5'
    TEST_METADATA_PATH = 'test-metadata.csv'
    
    # Data processing
    IMAGE_SIZE = (120, 120)
    VALIDATION_SPLIT = 0.15
    RANDOM_STATE = 42
    
    BATCH_SIZE = 32
    
    class MetadataModule:
        ACTIVATION = 'relu'
        KERNEL_INITIALIZER = 'he_normal'
        
    class ImageModule:
        ACTIVATION = 'relu'
        KERNEL_INITIALIZER = 'he_normal'

# Preprocesamiento

In [3]:
train_hdf5 = h5py.File(Config.BASE_PATH + Config.TRAIN_IMAGE_PATH, 'r')
test_hdf5 = h5py.File(Config.BASE_PATH + Config.TEST_IMAGE_PATH, 'r')

train_metadata = pd.read_csv(Config.BASE_PATH + Config.TRAIN_METADATA_PATH)
test_metadata = pd.read_csv(Config.BASE_PATH + Config.TEST_METADATA_PATH)

train_fnames = train_metadata["isic_id"].tolist()
test_fnames = test_metadata["isic_id"].tolist()

train_target = train_metadata["target"]

split = StratifiedShuffleSplit(n_splits=1, test_size=Config.VALIDATION_SPLIT, random_state=Config.RANDOM_STATE)
for train_index, val_index in split.split(train_metadata, train_target):
    val_fnames = itemgetter(*val_index)(train_fnames)
    train_fnames = itemgetter(*train_index)(train_fnames)
    X_metadata_train, X_metadata_val = train_metadata.iloc[train_index], train_metadata.iloc[val_index]
    y_train, y_val = train_target.iloc[train_index], train_target.iloc[val_index]

  train_metadata = pd.read_csv(Config.BASE_PATH + Config.TRAIN_METADATA_PATH)


In [4]:
only_train_cols = ["target", "lesion_id", "iddx_full", "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5", "mel_mitotic_index", "mel_thick_mm", "tbp_lv_dnn_lesion_confidence"]
unuseful_cols = ["image_type", "patient_id"]
removable_cols = only_train_cols + unuseful_cols + ["isic_id"]

numeric_features = train_metadata.select_dtypes(include=['float64', 'int64']).columns.difference(removable_cols)
cat_features = train_metadata.select_dtypes(include=['object']).columns.difference(removable_cols)

numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', cat_pipeline, cat_features)
    ])

metadata_preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_metadata_preprocessed = metadata_preprocessing_pipeline.fit_transform(X_metadata_train)
X_val_metadata_preprocessed = metadata_preprocessing_pipeline.transform(X_metadata_val)
X_test_metadata_preprocessed = metadata_preprocessing_pipeline.transform(test_metadata)


In [5]:
def create_dataset(fnames, metadata_preprocessed, targets, hdf5):
    target_ds = tf.data.Dataset.from_tensor_slices(targets)
    
    def load_image(id):
        image = Image.open(io.BytesIO(np.array(hdf5[id.numpy()])))
        image = np.array(image.resize(Config.IMAGE_SIZE)).reshape(120, 120, 3)
        return image

    # It doesn't work without this in Kaggle
    def set_shapes(image):
        image.set_shape([120, 120, 3])
        return image

    # Create a dataset for images
    image_ds = tf.data.Dataset.from_tensor_slices(tf.constant(fnames))
    image_ds = image_ds.map(lambda x: tf.py_function(load_image, [x], tf.float32))
    image_ds = image_ds.map(set_shapes)
    solo_image_ds = tf.data.Dataset.zip((image_ds, target_ds)).batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    # Create a dataset for metadata
    metadata_ds = tf.data.Dataset.from_tensor_slices(metadata_preprocessed)
    solo_metadata_ds = tf.data.Dataset.zip((metadata_ds, target_ds)).batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    # Combine the datasets
    combined_ds = tf.data.Dataset.zip(((image_ds, metadata_ds), target_ds))
    combined_ds = combined_ds.shuffle(1000)
    combined_ds = combined_ds.batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    return solo_image_ds, solo_metadata_ds, combined_ds

train_solo_image_ds, train_solo_metadata_ds, train_ds = create_dataset(train_fnames, X_train_metadata_preprocessed, y_train, train_hdf5)
val_solo_image_ds, val_solo_metadata_ds, val_ds = create_dataset(val_fnames, X_val_metadata_preprocessed, y_val, train_hdf5)

# TEST
test_solo_image_ds, test_solo_metadata_ds, test_ds = create_dataset(test_fnames, X_test_metadata_preprocessed, np.zeros(len(test_fnames)), test_hdf5)

# Auxiliary functions and classes

In [6]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

def pauc_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    v_gt = abs(y_true - 1)
    v_pred = 1.0 - y_pred
    min_tpr = 0.80
    max_fpr = 1 - min_tpr
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

class PAUCCallback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data, batch_size):
        super(PAUCCallback, self).__init__()
        self.validation_data = validation_data
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs=None):
        # Get predictions for validation data
        val_pred = self.model.predict(self.validation_data, verbose=0)
        
        # Extract true labels from validation data
        y_val = np.concatenate([y for x, y in self.validation_data], axis=0)
        
        # Calculate pAUC score
        pauc = pauc_score(y_val, val_pred)
        
        # Optionally, you can add the pAUC score to the logs
        logs['val_pauc'] = pauc

# Metadata module

In [224]:
from sklearn.model_selection import KFold

metadata_input_shape = next(iter(train_solo_metadata_ds.take(1)))[0].shape[1:]

shuffles = 3
splits = 5
models = []
scores = []

for i in range(shuffles):
    print(f"Shuffle {i+1}/{shuffles}")
    split = KFold(n_splits=splits, random_state=Config.RANDOM_STATE, shuffle=True)
    for j, (train_index, val_index) in enumerate(split.split(X_metadata_train, y_train), 1):
        print(f"Split {j}/{splits}")
        X_metadata_train_split, X_metadata_val_split = X_metadata_train.iloc[train_index], X_metadata_train.iloc[val_index]
        y_train_split, y_val_split = y_train.iloc[train_index], y_train.iloc[val_index]
        
        X_train_metadata_preprocessed_split = metadata_preprocessing_pipeline.fit_transform(X_metadata_train_split)
        X_val_metadata_preprocessed_split = metadata_preprocessing_pipeline.transform(X_metadata_val_split)
        
        _, train_solo_metadata_split_ds, _ = create_dataset(train_fnames, X_train_metadata_preprocessed_split, y_train_split, train_hdf5)
        _, val_solo_metadata_split_ds, _ = create_dataset(val_fnames, X_val_metadata_preprocessed_split, y_val_split, train_hdf5)
        
        # metadata_model = Sequential([
        #     Dense(64, activation=Config.MetadataModule.ACTIVATION, kernel_initializer=Config.MetadataModule.KERNEL_INITIALIZER, input_shape=metadata_input_shape),
        #     Dense(32, activation=Config.MetadataModule.ACTIVATION, kernel_initializer=Config.MetadataModule.KERNEL_INITIALIZER),
        #     Dense(1, activation='sigmoid')
        # ])

        # # Compile the model
        # lr = 3e-4
        # optimizer = tf.keras.optimizers.Adam(learning_rate=lr) if is_kaggle() else tf.keras.optimizers.legacy.Adam(learning_rate=lr)
        # metadata_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        # # Callbacks
        # pauc_callback = PAUCCallback(val_solo_metadata_split_ds, Config.BATCH_SIZE)
        # early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_pauc', patience=3, mode='max')
        
        # history = metadata_model.fit(train_solo_metadata_split_ds, validation_data=val_solo_metadata_split_ds, epochs=8, callbacks=[pauc_callback, early_stopping])
        
        xgb_model = xgboost.XGBClassifier(n_estimators=100, random_state=Config.RANDOM_STATE, n_jobs=-1, max_depth=3)
        xgb_model.fit(X_train_metadata_preprocessed_split, y_train_split)

        pred = xgb_model.predict_proba(X_val_metadata_preprocessed_split)
        score = pauc_score(y_val_split, pred[:, 1])
        print(f"pAUC = {score:.4f}\n")
        
        scores.append(score)
        models.append(xgb_model)
                
        # scores.append(history.history['val_pauc'])
        # models.append(metadata_model)

Shuffle 1/3
Split 1/5
pAUC = 0.1690
Split 2/5
pAUC = 0.1218
Split 3/5
pAUC = 0.1505
Split 4/5
pAUC = 0.1493
Split 5/5
pAUC = 0.1570
Shuffle 2/3
Split 1/5
pAUC = 0.1690
Split 2/5
pAUC = 0.1218
Split 3/5
pAUC = 0.1505
Split 4/5
pAUC = 0.1493
Split 5/5
pAUC = 0.1570
Shuffle 3/3
Split 1/5
pAUC = 0.1690
Split 2/5
pAUC = 0.1218
Split 3/5
pAUC = 0.1505
Split 4/5
pAUC = 0.1493
Split 5/5
pAUC = 0.1570


In [264]:
# Get the indices of the sorted scores
sorted_indices = [i for i, _ in sorted(enumerate(scores), key=lambda x: x[1], reverse=True)]

best_models = [models[i] for i in sorted_indices]

print("Making predictions on the validation set:")

y_preds = []
for i, model in enumerate(best_models, 1):
    # Make predictions on the validation set
    y_pred = model.predict_proba(X_val_metadata_preprocessed)[:, 1]
    
    y_preds.append(y_pred)

y_preds = np.array(y_preds)

n = 15
final_pred = np.mean(y_preds[:n], axis=0)
print(pauc_score(y_val, final_pred))

Making predictions on the validation set:
0.17020042866409085


In [227]:
best_models = []
best_scores = []

# Combine models and scores into tuples for sorting
model_score_pairs = list(zip(models, scores))

# Sort the pairs based on the maximum pAUC score (assuming higher is better)
# sorted_pairs = sorted(model_score_pairs, key=lambda x: max(x[1]), reverse=True)
sorted_pairs = sorted(model_score_pairs, key=lambda x: x[1], reverse=True)

best_models = [pair[0] for pair in sorted_pairs]
best_scores = [pair[1] for pair in sorted_pairs]

print(f"Models sorted based on validation pAUC scores:")
for i, (model, score) in enumerate(zip(best_models, best_scores), 1):
    print(f"Model {i}: Max pAUC = {max(score):.4f}")

TypeError: '<' not supported between instances of 'XGBClassifier' and 'XGBClassifier'

In [239]:
print("Making predictions on the validation set:")

y_preds = []
for i, model in enumerate(best_models, 1):
    # Make predictions on the validation set
    y_pred = model.predict(X_val_metadata_preprocessed)
    
    y_preds.append(y_pred)

y_preds = np.array(y_preds)

Making predictions on the validation set:


In [238]:
n = 1
final_pred = np.mean(y_preds[:n], axis=0)
print(pauc_score(y_val, final_pred))

0.019999001663893505


In [223]:
import xgboost

# xgb_model = xgboost.XGBClassifier(n_estimators=100, random_state=Config.RANDOM_STATE, n_jobs=-1, max_depth=3)
# xgb_model.fit(X_train_metadata_preprocessed, y_train)

# pred = xgb_model.predict_proba(X_val_metadata_preprocessed)
# print(pauc_score(y_val, pred[:, 1]))

xgb_model = xgboost.XGBClassifier(n_estimators=100, random_state=Config.RANDOM_STATE, n_jobs=-1, max_depth=3)
xgb_model.fit(train_solo_metadata_split_ds)

# pred = xgb_model.predict_proba(X_val_metadata_preprocessed)
# print(pauc_score(y_val, pred[:, 1]))

TypeError: XGBClassifier.fit() missing 1 required positional argument: 'y'

# Image Module

In [11]:
# image_input_shape = next(iter(train_image_ds.take(1))).shape

# image_model = Sequential([
#     Conv2D(32, 3, 2, activation=Config.MetadataModule.ACTIVATION, kernel_initializer=Config.MetadataModule.KERNEL_INITIALIZER, input_shape=image_input_shape),
#     Conv2D(16, 3, 2, activation=Config.MetadataModule.ACTIVATION, kernel_initializer=Config.MetadataModule.KERNEL_INITIALIZER),
#     MaxPooling2D(2, 2),
#     Flatten(),
#     Dense(64, activation=Config.MetadataModule.ACTIVATION, kernel_initializer=Config.MetadataModule.KERNEL_INITIALIZER),
#     Dense(1, activation='sigmoid')
# ])

# # Compile the model
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# image_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# # Callbacks
# pauc_callback = PAUCCallback(val_solo_image_ds.take(100), Config.BATCH_SIZE)

# # Display model summary
# image_model.summary()



In [12]:
# image_model.fit(train_solo_image_ds.take(500), validation_data=val_solo_image_ds.take(100), epochs=1, callbacks=[pauc_callback])

# Combined Modules

In [13]:
# image_input = tf.keras.Input(shape=image_input_shape)
# metadata_input = tf.keras.Input(shape=metadata_input_shape)

# # Clone and freeze image model layers
# # x_image = image_input
# # for layer in image_model.layers[:-1]:  # Exclude the last layer
# #     x_image = layer(x_image)
# #     layer.trainable = False

# # Clone and freeze metadata model layers
# x_metadata = metadata_input
# for layer in metadata_model.layers[:-1]:  # Exclude the last layer
#     x_metadata = layer(x_metadata)
#     layer.trainable = False

# # Concatenate the outputs of both models
# # combined = tf.keras.layers.Concatenate()([x_image, x_metadata])
# x = tf.keras.layers.Dense(16, activation=Config.MetadataModule.ACTIVATION, kernel_initializer=Config.MetadataModule.KERNEL_INITIALIZER)(x_metadata)
# x = tf.keras.layers.Dense(1, activation='sigmoid')(x)

# # Define inputs
# input = [image_input, metadata_input]

# combined_model = tf.keras.Model(inputs=input, outputs=x)

# pauc_callback = PAUCCallback(val_ds.take(100), Config.BATCH_SIZE)

# # Compile the model
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002)
# combined_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
# combined_model.fit(train_ds.take(500), validation_data=val_ds.take(100), epochs=1, callbacks=[pauc_callback])

# # Unfreeze layers in the image model
# for layer in combined_model.layers:
#     if isinstance(layer, tf.keras.Model) and layer.name == image_model.name:
#         for sub_layer in layer.layers:
#             sub_layer.trainable = True

# # Unfreeze layers in the metadata model
# for layer in combined_model.layers:
#     if isinstance(layer, tf.keras.Model) and layer.name == metadata_model.name:
#         for sub_layer in layer.layers:
#             sub_layer.trainable = True

# # Recompile the model with a lower learning rate for fine-tuning
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
# combined_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# print("Layers unfrozen and model recompiled for fine-tuning.")

# combined_model.fit(train_ds.take(500), validation_data=val_ds.take(100), epochs=1, callbacks=[pauc_callback])

# Submission

In [15]:
submission = pd.read_csv(Config.BASE_PATH + 'sample_submission.csv')

# zero_labels = tf.data.Dataset.from_tensor_slices(tf.zeros(len(test_ds)))
# test_ds_with_zeros = tf.data.Dataset.zip((test_ds, zero_labels))
zero_labels = tf.data.Dataset.from_tensor_slices(tf.zeros(len(test_metadata_ds)))
test_ds_with_zeros = tf.data.Dataset.zip((test_metadata_ds, zero_labels)).batch(Config.BATCH_SIZE)

# submission["target"] = combined_model.predict(test_ds_with_zeros)
# submission["target"] = metadata_model.predict(test_ds_with_zeros)
# submission["target"] = xgb_model.predict_proba(X_test_metadata_preprocessed)[:, 1]
pred = np.zeros(len(test_metadata_ds))
for model in best_models:
    pred += model.predict(test_ds_with_zeros).reshape(-1)
submission["target"] = pred
submission.to_csv('submission.csv', index=False)

NameError: name 'test_metadata_ds' is not defined