In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout
from tensorflow.keras.applications import EfficientNetV2B0
import matplotlib.pyplot as plt
import pandas as pd
import h5py
import io
from PIL import Image
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os
import albumentations as A
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import joblib


def is_kaggle():
    return os.path.exists('/kaggle')

class Config:
    BASE_PATH = '/kaggle/input/isic-2024-challenge/' if is_kaggle() else 'isic-2024-challenge/'
    TRAIN_IMAGE_PATH = 'train-image.hdf5'
    TRAIN_METADATA_PATH = 'train-metadata.csv'
    TEST_IMAGE_PATH = 'test-image.hdf5'
    TEST_METADATA_PATH = 'test-metadata.csv'
    
    # Data processing
    IMAGE_SIZE = (120, 120)
    RANDOM_STATE = 42
    
    BATCH_SIZE = 32

np.random.seed(Config.RANDOM_STATE)

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.13 (you have 1.4.12). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.


# Preprocessing

## Feature engineering

In [2]:
def feature_engineering(df):
    eps = 1e-6
    df["lesion_size_ratio"] = np.minimum(df["tbp_lv_minorAxisMM"] / (df["clin_size_long_diam_mm"] + eps), 1.015)
    df["lesion_shape_index"] = np.minimum(df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2 + eps), 0.093)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"] = np.log1p(np.minimum(df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"], 1000))
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2) 
    df["perimeter_to_area_ratio"] = np.minimum(df["tbp_lv_perimeterMM"] / (df["tbp_lv_areaMM2"] + eps), 6.02)
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = np.minimum(df["tbp_lv_stdL"] / (df["tbp_lv_Lext"] + eps), 0.305)
    
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = np.minimum(df["clin_size_long_diam_mm"] / (df["age_approx"] + eps), 1.59)
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    # Taken from: https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
    df["color_variance_ratio"] = np.minimum(df["tbp_lv_color_std_mean"] / (df["tbp_lv_stdLExt"] + eps), 7.94)
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = np.minimum(df["clin_size_long_diam_mm"] / (df["tbp_lv_deltaLBnorm"] + eps), 5.08)
    df["age_normalized_nevi_confidence"] = np.minimum(df["tbp_lv_nevi_confidence"] / (df["age_approx"] + eps), 9.42)
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"] = np.minimum(df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi) + eps), 2.64)
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    # Until here.
    
    new_num_cols = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast", "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",

        "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index", 
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index",
        
        "color_variance_ratio", "border_color_interaction", "size_color_contrast_ratio",
        "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
        "color_range", "shape_color_consistency", "border_length_ratio", "age_size_symmetry_index",
    ]
    new_cat_cols = ["combined_anatomical_site"]
    return df, new_num_cols, new_cat_cols

## Load data

In [3]:
train_hdf5 = h5py.File(Config.BASE_PATH + Config.TRAIN_IMAGE_PATH, 'r')
test_hdf5 = h5py.File(Config.BASE_PATH + Config.TEST_IMAGE_PATH, 'r')

train_metadata = pd.read_csv(Config.BASE_PATH + Config.TRAIN_METADATA_PATH)
test_metadata = pd.read_csv(Config.BASE_PATH + Config.TEST_METADATA_PATH)

# Add features
X_metadata_train, new_num_cols, new_cat_cols = feature_engineering(train_metadata)
X_metadata_test, _, _ = feature_engineering(test_metadata)

fnames = train_metadata["isic_id"]
test_fnames = test_metadata["isic_id"]

target = train_metadata["target"]

  train_metadata = pd.read_csv(Config.BASE_PATH + Config.TRAIN_METADATA_PATH)[:5000]


In [4]:
only_train_cols = ["target", "lesion_id", "iddx_full", "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5", "mel_mitotic_index", "mel_thick_mm", "tbp_lv_dnn_lesion_confidence"]
unuseful_cols = ["image_type", "patient_id"]
removable_cols = only_train_cols + unuseful_cols + ["isic_id"]

numeric_features = X_metadata_train.select_dtypes(include=['float64', 'int64']).columns.difference(removable_cols)
cat_features = X_metadata_train.select_dtypes(include=['object']).columns.difference(removable_cols)

numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', cat_pipeline, cat_features)
    ])

metadata_preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_train_metadata_preprocessed = metadata_preprocessing_pipeline.fit_transform(X_metadata_train)
X_test_metadata_preprocessed = metadata_preprocessing_pipeline.transform(X_metadata_test)


In [5]:
def create_image_dataset(fnames, targets, hdf5):
    target_ds = tf.data.Dataset.from_tensor_slices(targets)
    
    def load_image(id):
        image = Image.open(io.BytesIO(np.array(hdf5[id.numpy()])))
        image = np.array(image.resize(Config.IMAGE_SIZE))
        image = (image - image.min()) / (image.max() - image.min())
        return image

    # It doesn't work without this in Kaggle
    def set_shapes(image):
        image.set_shape([*Config.IMAGE_SIZE, 3])
        return image

    # Create a dataset for images
    image_ds = tf.data.Dataset.from_tensor_slices(tf.constant(fnames))
    image_ds = image_ds.map(lambda x: tf.py_function(load_image, [x], tf.float32))
    image_ds = image_ds.map(set_shapes)
    solo_image_ds = tf.data.Dataset.zip((image_ds, target_ds))

    return solo_image_ds

train_solo_image_ds = create_image_dataset(fnames, target, train_hdf5)

2024-08-08 17:57:02.544646: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-08-08 17:57:02.544662: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-08-08 17:57:02.544666: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-08-08 17:57:02.544694: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-08 17:57:02.544712: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
train_transforms = A.Compose([
        A.RandomRotate90(p=0.5),
        A.Flip(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, 
                           scale_limit=0.15, 
                           rotate_limit=60, 
                           p=0.5),
        A.HueSaturationValue(
                hue_shift_limit=0.2, 
                sat_shift_limit=0.2, 
                val_shift_limit=0.2, 
                p=0.5
            ),
        A.RandomBrightnessContrast(
                brightness_limit=(-0.1,0.1), 
                contrast_limit=(-0.1, 0.1), 
                p=0.5
            ),
        ], p=1.)

def augment_image(image, label):
    def apply_augmentation(img):
        img = img.numpy()
        img = train_transforms(image=img)['image']
        return img

    augmented_image = tf.py_function(apply_augmentation, [image], tf.float32)
    augmented_image.set_shape(image.shape)
    return augmented_image, label

# Apply the augmentation to the dataset
train_solo_image_augmented_ds = train_solo_image_ds.map(augment_image).batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Auxiliary functions and classes

In [7]:
def pauc_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    v_gt = abs(y_true - 1)
    v_pred = 1.0 - y_pred
    min_tpr = 0.80
    max_fpr = 1 - min_tpr
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

class PAUCCallback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super(PAUCCallback, self).__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # Get predictions for validation data
        val_pred = self.model.predict(self.validation_data, verbose=0)
        
        # Extract true labels from validation data
        y_val = np.concatenate([y for x, y in self.validation_data], axis=0)
        
        # Calculate pAUC score
        pauc = pauc_score(y_val, val_pred)
        
        # Optionally, you can add the pAUC score to the logs
        logs['val_pauc'] = pauc

# Training

In [8]:
# efficientNet = EfficientNetV2B0(weights='imagenet', pooling='avg', include_top=False)

# image_input = tf.keras.Input(shape=(*Config.IMAGE_SIZE, 3))
# x = efficientNet(image_input)
# x = Dense(512, kernel_initializer='he_normal')(x)
# x = BatchNormalization()(x)
# x = Activation('relu')(x)
# x = Dropout(0.5)(x)
# x = Dense(128, kernel_initializer='he_normal')(x)
# x = BatchNormalization()(x)
# x = Activation('relu')(x)
# x = Dropout(0.3)(x)
# x = Dense(1, activation='sigmoid')(x)
# image_model = tf.keras.Model(inputs=image_input, outputs=x)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout, Input, Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D, SeparableConv2D

image_model = Sequential([
    Input(shape=(*Config.IMAGE_SIZE, 3)),
    
    # Initial convolution block
    Conv2D(32, 3, padding='same', activation='relu'),
    BatchNormalization(),
    Conv2D(32, 3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2),
    Dropout(0.1),
    
    # Second convolution block
    Conv2D(64, 3, padding='same', activation='relu'),
    BatchNormalization(),
    Conv2D(64, 3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2),
    Dropout(0.2),
    
    # Third convolution block with separable convolutions
    SeparableConv2D(128, 3, padding='same', activation='relu'),
    BatchNormalization(),
    SeparableConv2D(128, 3, padding='same', activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2),
    Dropout(0.3),
    
    # Fourth convolution block with separable convolutions
    SeparableConv2D(256, 3, padding='same', activation='relu'),
    BatchNormalization(),
    SeparableConv2D(256, 3, padding='same', activation='relu'),
    BatchNormalization(),
    GlobalAveragePooling2D(),
    Dropout(0.4),
    
    # Dense layers
    Dense(256, activation='relu', kernel_initializer='he_normal'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_initializer='he_normal'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

image_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 120, 120, 32)      896       
                                                                 
 batch_normalization (Batch  (None, 120, 120, 32)      128       
 Normalization)                                                  
                                                                 
 conv2d_5 (Conv2D)           (None, 120, 120, 32)      9248      
                                                                 
 batch_normalization_1 (Bat  (None, 120, 120, 32)      128       
 chNormalization)                                                
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 60, 60, 32)        0         
 g2D)                                                            
                                                      

## CNN

In [10]:
# Create a reduced set of training and validation for the CNN
target_reset = target.reset_index(drop=True)

# Get the positive instances
positive_indices = np.where(target_reset == 1)[0]
positive_count = len(positive_indices)

negative_ratio = 100
negative_indices = np.where(target_reset == 0)[0]
negative_count = negative_ratio * positive_count
negative_indices = np.random.choice(negative_indices, size=negative_count, replace=False)

# Combine positive and selected negative indices
selected_indices = np.concatenate([positive_indices, negative_indices])
np.random.shuffle(selected_indices)

# Create the reduced datasets
fnames_reduced = fnames[selected_indices].reset_index(drop=True)
target_reduced = target_reset[selected_indices].reset_index(drop=True)

print(f"Total train samples: {len(selected_indices)}")
print(f"Positive train samples: {positive_count}")
print(f"Negative train samples: {negative_count}")

def augment_image(image, label):
    def apply_augmentation(img):
        img = img.numpy()
        img = train_transforms(image=img)['image']
        return img

    augmented_image = tf.py_function(apply_augmentation, [image], tf.float32)
    augmented_image.set_shape(image.shape)
    return augmented_image, label

Total train samples: 306
Positive train samples: 6
Negative train samples: 300


In [11]:
# Define number of folds
n_splits = 5

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=Config.RANDOM_STATE)

oof_predictions = np.zeros(len(X_train_metadata_preprocessed))
fold_scores = []
image_models = []

for fold, (train_idx, val_idx) in enumerate(skf.split(fnames_reduced, target_reduced), 1):
    print(f"Training fold {fold}")
    
    train_fnames, val_fnames = fnames_reduced[train_idx], fnames_reduced[val_idx]
    y_train, y_val = target_reduced[train_idx], target_reduced[val_idx]
    
    ds = create_image_dataset(train_fnames, y_train, train_hdf5)
    ds = ds.map(augment_image).batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    
    val_ds = create_image_dataset(val_fnames, y_val, train_hdf5)
    val_ds = val_ds.batch(Config.BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    
    model = tf.keras.models.clone_model(image_model)
    model.set_weights(image_model.get_weights())
    
    lr = 2e-4
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr) if is_kaggle() else tf.keras.optimizers.legacy.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    
    history = model.fit(
        ds,
        epochs=10,
        validation_data=val_ds,
        callbacks=[PAUCCallback(val_ds)]
    )
    
    image_models.append(model)
    
    model.save(f'image_models/model_fold_{fold}.h5')
    print(f"Model for fold {fold} saved successfully.")
    
    # TODO: Agregar TTA
    val_predictions = model.predict(val_ds)
    oof_predictions[val_idx] = val_predictions.ravel()
    
# Calculate overall OOF pauc score
oof_score = pauc_score(target, oof_predictions)
print(f"Overall OOF pAUC: {oof_score}")
# Save OOF predictions
np.save('oof_predictions.npy', oof_predictions)
print("OOF predictions saved successfully.")



Training fold 1


2024-08-08 17:57:03.288920: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-08-08 17:57:04.849564: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-08 17:57:05.181893: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Model for fold 1 saved successfully.
Fold 1 pAUC: 0.09999999999999996
Training fold 2


  saving_api.save_model(
2024-08-08 17:57:05.826835: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-08-08 17:57:07.193434: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-08 17:57:07.429746: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Model for fold 2 saved successfully.
Fold 2 pAUC: 0.16666666666666663
Training fold 3


  saving_api.save_model(
2024-08-08 17:57:08.058921: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-08-08 17:57:09.509514: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-08 17:57:09.780308: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Model for fold 3 saved successfully.
Fold 3 pAUC: 0.16999999999999996
Training fold 4


  saving_api.save_model(
2024-08-08 17:57:10.403974: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-08-08 17:57:11.770712: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-08 17:57:12.030949: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Model for fold 4 saved successfully.
Fold 4 pAUC: 0.1966666666666666
Training fold 5


  saving_api.save_model(
2024-08-08 17:57:12.692059: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-08-08 17:57:14.036473: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-08-08 17:57:14.437683: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Model for fold 5 saved successfully.
Fold 5 pAUC: 0.04333333333333334
Overall OOF pAUC: 0.018774529435322374
Mean of fold scores: 0.13533333333333328
OOF predictions saved successfully.


  saving_api.save_model(


In [12]:
# Make predictions on the test set
test_ds = create_image_dataset(test_fnames, np.zeros(len(test_fnames)), test_hdf5)
test_ds = test_ds.batch(128).prefetch(tf.data.AUTOTUNE)

test_predictions = np.zeros((len(test_fnames), n_splits))

for i, model in enumerate(image_models, 1):
    print(f"Predicting with model from fold {i}")
    
    fold_predictions = model.predict(test_ds)
    test_predictions[:, i-1] = fold_predictions.ravel()

# Average predictions across folds
test_predictions_mean = np.mean(test_predictions, axis=1)

Predicting with model from fold 1


2024-08-08 17:57:14.975685: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Predicting with model from fold 2


2024-08-08 17:57:15.187644: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Predicting with model from fold 3


2024-08-08 17:57:15.399460: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Predicting with model from fold 4


2024-08-08 17:57:15.591133: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Predicting with model from fold 5


2024-08-08 17:57:15.782602: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




## LGBM

In [13]:
X_train_final = X_train_metadata_preprocessed.copy()
X_train_final = np.column_stack((X_train_final, oof_predictions))

X_test_final = X_test_metadata_preprocessed.copy()
X_test_final = np.column_stack((X_test_final, test_predictions_mean))

In [14]:
shuffles = 2
splits = 5
oof_predictions = np.zeros(len(X_train_final))
models = []

for i in range(shuffles):
    print(f"Shuffle {i+1}/{shuffles}")
    split = StratifiedKFold(n_splits=splits, shuffle=True)
    for j, (train_index, val_index) in enumerate(split.split(X_train_final, target), 1):
        X_train, X_val = X_train_final[train_index], X_train_final[val_index]
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]
        
        lgbm_model = lgb.LGBMClassifier(
            n_estimators=1500,
            max_depth=2,
            learning_rate=0.02,
            n_jobs=-1,
            verbose=-1,
        )
        
        lgbm_model.fit(X_train, y_train)
        
        # Save the model for this fold
        fold_model_filename = f'lgbm_models/model_shuffle_{i+1}_fold_{j}.joblib'
        joblib.dump(lgbm_model, fold_model_filename)
        print(f"Model saved as {fold_model_filename}")

        pred = lgbm_model.predict_proba(X_val)
        oof_predictions[val_index] += pred[:, 1]
        score = pauc_score(y_val, pred[:, 1])
        print(f"pAUC = {score:.4f}")
        
        models.append(lgbm_model)
    
    print("\n")
    
oof_score = pauc_score(target, oof_predictions)
print(f"Overall OOF pAUC: {oof_score}")

Shuffle 1/2
Model saved as lgbm_models/model_shuffle_1_fold_1.joblib
pAUC = 0.1998
Model saved as lgbm_models/model_shuffle_1_fold_2.joblib
pAUC = 0.1998
Model saved as lgbm_models/model_shuffle_1_fold_3.joblib
pAUC = 0.1934
Model saved as lgbm_models/model_shuffle_1_fold_4.joblib
pAUC = 0.1876
Model saved as lgbm_models/model_shuffle_1_fold_5.joblib
pAUC = 0.1964


Shuffle 2/2
Model saved as lgbm_models/model_shuffle_2_fold_1.joblib
pAUC = 0.1998
Model saved as lgbm_models/model_shuffle_2_fold_2.joblib
pAUC = 0.1998
Model saved as lgbm_models/model_shuffle_2_fold_3.joblib
pAUC = 0.1986
Model saved as lgbm_models/model_shuffle_2_fold_4.joblib
pAUC = 0.1980
Model saved as lgbm_models/model_shuffle_2_fold_5.joblib
pAUC = 0.1595


Overall OOF pAUC: 0.18673074355893735


In [16]:
# ranks = []
# for model in models:
#     # Get feature importances
#     feature_importances = model.feature_importances_

#     # Create a dataframe of feature importances
#     feature_importance_df = pd.DataFrame({
#         'feature': model.feature_name_,
#         'importance': feature_importances
#     })

#     feature = "Column_138"
#     sorted_importance = feature_importance_df.sort_values('importance', ascending=False).reset_index(drop=True)
#     feature_rank = sorted_importance[sorted_importance['feature'] == feature].index[0] + 1
#     ranks.append(feature_rank)

# print(f"Average rank: {np.mean(ranks)}")

In [17]:
submission = pd.read_csv(Config.BASE_PATH + 'sample_submission.csv')

y_preds = []
for i, model in enumerate(models, 1):
    # Make predictions on the validation set
    y_pred = model.predict_proba(X_test_final)[:, 1]
    y_preds.append(y_pred)

y_preds = np.array(y_preds)

final_pred = np.mean(y_preds, axis=0)
submission["target"] = final_pred
submission.to_csv('submission.csv', index=False)