In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd
import h5py
import io
from PIL import Image
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from operator import itemgetter
import os
import lightgbm as lgb
import albumentations as A


INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.13 (you have 1.4.12). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.


In [2]:
def is_kaggle():
    """Tell whether the notebook appears to be running on Kaggle.

    The check is purely filesystem based: it returns True when a path named
    ``/kaggle`` exists and False otherwise.
    """
    kaggle_root = '/kaggle'
    return os.path.exists(kaggle_root)

class Config:
    """Notebook-wide constants: dataset locations and data-processing settings."""

    # Dataset root: the Kaggle input mount when running there, otherwise a
    # directory relative to the working directory.
    if is_kaggle():
        BASE_PATH = '/kaggle/input/isic-2024-challenge/'
    else:
        BASE_PATH = 'isic-2024-challenge/'

    # File names, meant to be appended to BASE_PATH.
    TRAIN_IMAGE_PATH = 'train-image.hdf5'
    TRAIN_METADATA_PATH = 'train-metadata.csv'
    TEST_IMAGE_PATH = 'test-image.hdf5'
    TEST_METADATA_PATH = 'test-metadata.csv'

    # Data processing
    IMAGE_SIZE = (120, 120)    # size every image is resized to
    VALIDATION_SPLIT = 0.15    # fraction of training rows held out for validation
    RANDOM_STATE = 42          # seed for the train/validation split

    BATCH_SIZE = 32

# Preprocessing

In [3]:
def feature_engineering(df):
    """Derive hand-crafted lesion features from the raw metadata columns.

    The input frame is NOT modified: all new columns are added to a copy,
    which is returned. Ratio features add a small ``eps`` to the denominator
    and are clipped from above with ``np.minimum`` so that near-zero
    denominators cannot produce unbounded values.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame containing the ``tbp_lv_*``, ``clin_size_long_diam_mm``,
        ``age_approx`` and ``anatom_site_general`` columns referenced below.

    Returns
    -------
    tuple
        ``(df_with_features, new_num_cols, new_cat_cols)`` where the two lists
        name the numeric and categorical columns added by this function.
    """
    # Work on a copy so the caller's frame is left untouched and calling this
    # function twice on the same raw frame gives the same result.
    df = df.copy()
    eps = 1e-6

    df["lesion_size_ratio"] = np.minimum(df["tbp_lv_minorAxisMM"] / (df["clin_size_long_diam_mm"] + eps), 1.015)
    df["lesion_shape_index"] = np.minimum(df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2 + eps), 0.093)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    # eps keeps 0/0 from turning into NaN, matching every other ratio feature here.
    df["color_uniformity"] = np.log1p(np.minimum(df["tbp_lv_color_std_mean"] / (df["tbp_lv_radial_color_std_max"] + eps), 1000))
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"] = np.minimum(df["tbp_lv_perimeterMM"] / (df["tbp_lv_areaMM2"] + eps), 6.02)
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = np.minimum(df["tbp_lv_stdL"] / (df["tbp_lv_Lext"] + eps), 0.305)

    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    # Depends on border_complexity and lesion_shape_index computed above.
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = np.minimum(df["clin_size_long_diam_mm"] / (df["age_approx"] + eps), 1.59)
    # NOTE: despite the column name this is the mean of the two hues, not a difference.
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    # Taken from: https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
    df["color_variance_ratio"] = np.minimum(df["tbp_lv_color_std_mean"] / (df["tbp_lv_stdLExt"] + eps), 7.94)
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = np.minimum(df["clin_size_long_diam_mm"] / (df["tbp_lv_deltaLBnorm"] + eps), 5.08)
    df["age_normalized_nevi_confidence"] = np.minimum(df["tbp_lv_nevi_confidence"] / (df["age_approx"] + eps), 9.42)
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"] = np.minimum(df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi) + eps), 2.64)
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    # Until here.

    new_num_cols = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast", "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",

        "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index",
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index",

        "color_variance_ratio", "border_color_interaction", "size_color_contrast_ratio",
        "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
        "color_range", "shape_color_consistency", "border_length_ratio", "age_size_symmetry_index",
    ]
    new_cat_cols = ["combined_anatomical_site"]
    return df, new_num_cols, new_cat_cols

In [4]:
# Open both HDF5 image stores read-only. The handles are kept open on purpose:
# images are looked up by id from them later in the notebook.
train_hdf5 = h5py.File(f"{Config.BASE_PATH}{Config.TRAIN_IMAGE_PATH}", 'r')
test_hdf5 = h5py.File(f"{Config.BASE_PATH}{Config.TEST_IMAGE_PATH}", 'r')

# Raw tabular metadata.
train_metadata = pd.read_csv(f"{Config.BASE_PATH}{Config.TRAIN_METADATA_PATH}")
test_metadata = pd.read_csv(f"{Config.BASE_PATH}{Config.TEST_METADATA_PATH}")

# Add the engineered columns; the column-name lists come from the train call.
train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata)
test_metadata = feature_engineering(test_metadata)[0]

# Image ids, in metadata row order.
fnames = train_metadata["isic_id"].tolist()
test_fnames = test_metadata["isic_id"].tolist()

train_target = train_metadata["target"]

# Single stratified hold-out split (n_splits=1, so exactly one pair of index arrays).
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=Config.VALIDATION_SPLIT,
    random_state=Config.RANDOM_STATE,
)
train_index, val_index = next(split.split(train_metadata, train_target))

# Positional indices -> ids / metadata rows / labels for each side of the split.
val_fnames = itemgetter(*val_index)(fnames)
train_fnames = itemgetter(*train_index)(fnames)
X_metadata_train = train_metadata.iloc[train_index]
X_metadata_val = train_metadata.iloc[val_index]
y_train = train_target.iloc[train_index]
y_val = train_target.iloc[val_index]

  train_metadata = pd.read_csv(Config.BASE_PATH + Config.TRAIN_METADATA_PATH)


In [5]:
# Columns excluded from the feature set.
only_train_cols = [
    "target", "lesion_id", "iddx_full",
    "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5",
    "mel_mitotic_index", "mel_thick_mm", "tbp_lv_dnn_lesion_confidence",
]
unuseful_cols = ["image_type", "patient_id"]
removable_cols = only_train_cols + unuseful_cols + ["isic_id"]

# Feature lists are derived from the training split's dtypes, minus the exclusions.
numeric_features = (
    X_metadata_train
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .difference(removable_cols)
)
cat_features = (
    X_metadata_train
    .select_dtypes(include=['object'])
    .columns
    .difference(removable_cols)
)

# Numeric columns: mean imputation followed by standardisation.
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Categorical columns: mode imputation followed by one-hot encoding;
# categories not seen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', cat_pipeline, cat_features),
])

metadata_preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training split only, then apply the fitted transform to validation and test.
X_train_metadata_preprocessed = metadata_preprocessing_pipeline.fit_transform(X_metadata_train)
X_val_metadata_preprocessed = metadata_preprocessing_pipeline.transform(X_metadata_val)
X_test_metadata_preprocessed = metadata_preprocessing_pipeline.transform(test_metadata)


In [6]:
def create_image_dataset(fnames, hdf5, targets):
    """Build a ``tf.data.Dataset`` of ``(image, target)`` pairs.

    Parameters
    ----------
    fnames : sequence of str
        Image ids used as keys into ``hdf5``; their order defines the dataset order.
    hdf5 : mapping (e.g. an open ``h5py.File``)
        Maps an image id to the encoded image bytes, which are decoded with PIL.
    targets : array-like
        One target per entry of ``fnames``, in the same order.

    Returns
    -------
    tf.data.Dataset
        Unbatched dataset yielding ``(image, target)``, where ``image`` is a
        float32 tensor of shape ``(*Config.IMAGE_SIZE, 3)`` min-max scaled to
        ``[0, 1]`` per image.
    """
    target_ds = tf.data.Dataset.from_tensor_slices(targets)

    def load_image(image_id):
        # Runs eagerly inside tf.py_function, so `.numpy()` is available; it
        # yields the id as bytes, which is used directly as the HDF5 key.
        encoded = np.array(hdf5[image_id.numpy()])
        # Force three channels so grayscale/RGBA inputs cannot contradict the
        # static shape declared in set_shapes below.
        image = Image.open(io.BytesIO(encoded)).convert("RGB")
        # NOTE: PIL's resize takes (width, height) while set_shapes declares
        # (IMAGE_SIZE[0], IMAGE_SIZE[1], 3); the two agree only for square sizes.
        image = np.array(image.resize(Config.IMAGE_SIZE))

        lowest, highest = image.min(), image.max()
        if highest == lowest:
            # A constant image would give 0/0 = NaN, which would then poison
            # the model predictions; map it to all zeros instead.
            return np.zeros(image.shape, dtype=np.float32)
        scaled = (image - lowest) / (highest - lowest)
        return scaled.astype(np.float32)

    # It doesn't work without this in Kaggle
    def set_shapes(image):
        image.set_shape([*Config.IMAGE_SIZE, 3])
        return image

    # Create a dataset for images
    image_ds = tf.data.Dataset.from_tensor_slices(tf.constant(fnames))
    image_ds = image_ds.map(lambda x: tf.py_function(load_image, [x], tf.float32))
    image_ds = image_ds.map(set_shapes)
    solo_image_ds = tf.data.Dataset.zip((image_ds, target_ds))

    return solo_image_ds

# Image datasets for the two labelled splits.
train_solo_image_ds = create_image_dataset(
    fnames=train_fnames, hdf5=train_hdf5, targets=y_train
)
val_solo_image_ds = create_image_dataset(
    fnames=val_fnames, hdf5=train_hdf5, targets=y_val
)

# The test split has no labels: zero placeholders keep the (image, target) structure.
zeros = np.zeros(len(test_fnames))
test_solo_image_ds = create_image_dataset(
    fnames=test_fnames, hdf5=test_hdf5, targets=zeros
)

# Auxiliary functions and classes

In [7]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

def pauc_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Partial-AUC score computed with ``min_tpr = 0.80``.

    Labels and scores are both flipped (``|y - 1|`` and ``1 - p``) so that the
    TPR >= 0.80 region of the original problem becomes the FPR <= 0.20 region
    that ``roc_auc_score(max_fpr=...)`` can evaluate. ``roc_auc_score`` returns
    a standardised value, which is then mapped back to an un-standardised area
    lying between ``0.5 * max_fpr**2`` and ``max_fpr``.

    Parameters
    ----------
    y_true : binary ground-truth labels (0/1).
    y_pred : predicted scores for the positive class.

    Returns
    -------
    float
        The rescaled partial AUC.
    """
    min_tpr = 0.80
    max_fpr = 1 - min_tpr

    flipped_labels = abs(y_true - 1)
    flipped_scores = 1.0 - y_pred
    scaled_auc = roc_auc_score(flipped_labels, flipped_scores, max_fpr=max_fpr)

    # Undo the standardisation applied by roc_auc_score when max_fpr is given.
    lower_bound = 0.5 * max_fpr**2
    return lower_bound + (max_fpr - lower_bound) / (1.0 - 0.5) * (scaled_auc - 0.5)

class PAUCCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports the validation pAUC after every epoch.

    Parameters
    ----------
    validation_data : iterable of ``(x, y)`` batches
        Passed as-is to ``model.predict`` and iterated a second time to
        collect the labels, so it must yield batches in the same order on
        every pass.
    """

    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute pAUC on the validation data and store it as ``logs['val_pauc']``."""
        # Get predictions for validation data
        val_pred = self.model.predict(self.validation_data, verbose=0)

        # Extract true labels from validation data
        y_val = np.concatenate([y for x, y in self.validation_data], axis=0)

        # Flatten predictions (e.g. shape (N, 1)) to 1-D, as perform_tta does,
        # before scoring.
        pauc = pauc_score(y_val, np.ravel(val_pred))

        # `logs` defaults to None; only record the metric when a dict was supplied.
        if logs is not None:
            logs['val_pauc'] = pauc

# Image Module

In [None]:
# Which saved checkpoint (by epoch number) of the image model to load.
epoch_model = 6

# compile=False: the model is loaded without restoring its compile state.
loaded_model = tf.keras.models.load_model(
    f"/kaggle/input/mejores-modelos-isic-2024/last_models/models/image_model_epoch_{epoch_model:02d}_unfreezed.keras",
    compile=False,
)

In [None]:
# Test-time augmentations. Every transform is built with p=1.0, i.e. it is
# always applied when called. perform_tta runs them one at a time, never chained.
tta_transforms = [
    A.RandomRotate90(p=1.0),
    A.Flip(p=1.0),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.15, rotate_limit=60, p=1.0),
    A.HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=1.0),
    A.RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=1.0),
]

def apply_transformation(image, transformation):
    """Apply an albumentations-style transform to an image tensor.

    ``transformation`` is called as ``transformation(image=<numpy array>)`` and
    must return a dict whose ``'image'`` entry is the augmented array. The call
    is wrapped in ``tf.py_function`` so it can be used inside ``Dataset.map``.

    Returns a float32 tensor whose static shape is set to that of ``image``.
    """
    def _augment(tensor):
        # Executed eagerly by py_function, so the tensor can be turned into numpy.
        return transformation(image=tensor.numpy())['image']

    result = tf.py_function(_augment, [image], tf.float32)
    # py_function drops static shape information; restore it from the input.
    result.set_shape(image.shape)
    return result

def perform_tta(model, dataset, fnames, transforms, targets=None, batch_size=128):
    """Predict with test-time augmentation and average the results.

    The model is run once on the untouched dataset and once per entry of
    ``transforms``; the per-sample mean over all runs is returned.

    Parameters
    ----------
    model : object with a Keras-style ``predict(dataset, verbose=...)`` method.
    dataset : unbatched ``tf.data.Dataset`` yielding ``(image, target)`` pairs.
    fnames : sequence of image ids, one per dataset element, in dataset order.
    transforms : iterable of transforms accepted by ``apply_transformation``.
    targets : optional ground-truth labels; when given, the pAUC of the
        averaged predictions is printed.
    batch_size : int, default 128
        Batch size used for every prediction pass.

    Returns
    -------
    pandas.DataFrame
        Columns ``fname`` and ``image_pred`` (the averaged prediction).
    """
    def _predict(unbatched_ds):
        # Shared batching/prefetch settings for every prediction pass.
        batched = unbatched_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
        return model.predict(batched, verbose=1).flatten()

    # Non-transformed predictions
    tta_preds = [_predict(dataset)]

    # Predictions with transformations
    for transform in transforms:
        # The mapped function is traced when map() is called, so each pass
        # uses the current `transform`.
        augmented_ds = dataset.map(lambda x, y: (apply_transformation(x, transform), y))
        tta_preds.append(_predict(augmented_ds))

    tta_preds_mean = np.mean(tta_preds, axis=0)
    if targets is not None:
        tta_pauc = pauc_score(targets, tta_preds_mean)
        print(f"TTA pAUC score: {tta_pauc:.4f}")

    tta_df = pd.DataFrame({
        'fname': fnames,
        'image_pred': tta_preds_mean
    })

    return tta_df

# Run TTA inference on each split. Labels are passed for the two labelled
# splits so their pAUC gets printed.
print("TTA for validation set:")
val_tta_df = perform_tta(
    model=loaded_model,
    dataset=val_solo_image_ds,
    fnames=val_fnames,
    transforms=tta_transforms,
    targets=y_val,
)

print("TTA for training set:")
train_tta_df = perform_tta(
    model=loaded_model,
    dataset=train_solo_image_ds,
    fnames=train_fnames,
    transforms=tta_transforms,
    targets=y_train,
)

print("TTA for test set:")
test_tta_df = perform_tta(
    model=loaded_model,
    dataset=test_solo_image_ds,
    fnames=test_fnames,
    transforms=tta_transforms,
)

# Persist the train and validation predictions; the file name carries the
# checkpoint epoch. Test predictions are not written to disk here.
train_tta_df.to_csv(f'train_tta_predictions_{epoch_model}.csv', index=False)
print(f"Saved train TTA predictions to train_tta_predictions_{epoch_model}.csv")

val_tta_df.to_csv(f'val_tta_predictions_{epoch_model}.csv', index=False)
print(f"Saved validation TTA predictions to val_tta_predictions_{epoch_model}.csv")


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 262ms/step
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 380ms/step
Validation pAUC score with TTA: 0.1340


In [None]:
def _with_image_pred(metadata_features, tta_df):
    """Return ``metadata_features`` with ``tta_df['image_pred']`` appended as the last column.

    Both inputs are wrapped in DataFrames with a default positional index and
    concatenated column-wise; the result is handed back as a numpy array.
    """
    combined = pd.concat(
        [
            pd.DataFrame(metadata_features),
            pd.DataFrame(tta_df, columns=['image_pred']),
        ],
        axis=1,
    )
    return combined.to_numpy()


# Add the image-model TTA prediction as one extra feature on every split.
X_train_metadata_preprocessed_with_image = _with_image_pred(X_train_metadata_preprocessed, train_tta_df)
X_val_metadata_preprocessed_with_image = _with_image_pred(X_val_metadata_preprocessed, val_tta_df)
X_test_metadata_preprocessed_with_image = _with_image_pred(X_test_metadata_preprocessed, test_tta_df)

print("Shape of X_train_metadata_preprocessed_with_image:", X_train_metadata_preprocessed_with_image.shape)
print("Shape of X_val_metadata_preprocessed_with_image:", X_val_metadata_preprocessed_with_image.shape)
print("Shape of X_test_metadata_preprocessed_with_image:", X_test_metadata_preprocessed_with_image.shape)


# Metadata module

In [129]:
# Repeated stratified K-fold: `shuffles` repetitions of `splits` folds. One
# LightGBM model is trained per fold and all of them are kept in `models` for
# later averaging.
shuffles = 2
splits = 5
models = []

for i in range(shuffles):
    print(f"Shuffle {i+1}/{shuffles}")
    # Seed each repetition differently but deterministically, so the folds
    # differ between repetitions yet are identical on every re-run.
    split = StratifiedKFold(
        n_splits=splits,
        shuffle=True,
        random_state=Config.RANDOM_STATE + i,
    )
    for train_index, val_index in split.split(X_train_metadata_preprocessed_with_image, y_train):
        X_metadata_train_split, X_metadata_val_split = X_train_metadata_preprocessed_with_image[train_index], X_train_metadata_preprocessed_with_image[val_index]
        y_train_split, y_val_split = y_train.iloc[train_index], y_train.iloc[val_index]

        lgbm_model = lgb.LGBMClassifier(
            n_estimators=1500,
            max_depth=2,
            learning_rate=0.02,
            n_jobs=-1,
            verbose=-1,
            random_state=Config.RANDOM_STATE,  # fixed seed for reproducible training
        )

        lgbm_model.fit(X_metadata_train_split, y_train_split)

        # Out-of-fold pAUC; column 1 of predict_proba is the positive class.
        pred = lgbm_model.predict_proba(X_metadata_val_split)
        score = pauc_score(y_val_split, pred[:, 1])
        print(f"pAUC = {score:.4f}")

        models.append(lgbm_model)

    print("\n")

Shuffle 1/2
pAUC = 0.1672
pAUC = 0.1694
pAUC = 0.1587
pAUC = 0.1415
pAUC = 0.1682


Shuffle 2/2
pAUC = 0.1607
pAUC = 0.1699
pAUC = 0.1586
pAUC = 0.1693
pAUC = 0.1606




In [133]:
print("Making predictions on the validation set:")

# Upper bound on how many of the trained models take part in the ensemble.
n = 100

# One row of positive-class probabilities per model, scored on the hold-out split.
y_preds = np.array([
    member.predict_proba(X_val_metadata_preprocessed_with_image)[:, 1]
    for member in models[:n]
])

# Ensemble prediction = per-sample mean over the models.
final_pred = y_preds.mean(axis=0)
print(pauc_score(y_val, final_pred))

Making predictions on the validation set:
0.1760042302377393


# Submission

In [None]:
submission = pd.read_csv(f"{Config.BASE_PATH}sample_submission.csv")

# Positive-class probabilities on the TEST set, one row per trained model.
y_preds = np.array([
    member.predict_proba(X_test_metadata_preprocessed_with_image)[:, 1]
    for member in models
])

# Average over all models. The values are assigned positionally, so the rows of
# the sample submission are assumed to be in test-metadata order — TODO confirm.
final_pred = y_preds.mean(axis=0)
submission["target"] = final_pred
submission.to_csv('submission.csv', index=False)