In [None]:
import os
import h5py
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from operator import itemgetter
from PIL import Image
import io
import numpy as np

In [None]:
def is_kaggle():
    """Return True when a `/kaggle` path exists on this machine, False otherwise."""
    kaggle_root = '/kaggle'
    return os.path.exists(kaggle_root)

class Config:
    """Dataset locations plus preprocessing and model constants for this notebook."""

    # Dataset root. The trailing slash matters: file paths are built by plain
    # string concatenation (BASE_PATH + <file name>).
    if is_kaggle():
        BASE_PATH = '/kaggle/input/isic-2024-challenge/'
    else:
        BASE_PATH = 'isic-2024-challenge/'

    # File names relative to BASE_PATH.
    TRAIN_IMAGE_PATH = 'train-image.hdf5'
    TRAIN_METADATA_PATH = 'train-metadata.csv'
    TEST_IMAGE_PATH = 'test-image.hdf5'
    TEST_METADATA_PATH = 'test-metadata.csv'

    # Data processing
    IMAGE_SIZE = (224, 224)    # size images are resized to; also the CNN input size
    VALIDATION_SPLIT = 0.15    # fraction of the training rows held out for validation
    RANDOM_STATE = 42          # seed for the stratified train/validation split

    BATCH_SIZE = 32

    # NOTE(review): the two nested classes below are not referenced in the part
    # of the notebook visible here; presumably they configure dense layers of a
    # metadata branch and an image branch — confirm against the model cells.
    class MetadataModule:
        ACTIVATION = 'relu'
        KERNEL_INITIALIZER = 'he_normal'

    class ImageModule:
        ACTIVATION = 'relu'
        KERNEL_INITIALIZER = 'he_normal'

In [None]:
def feature_engineering(df):
    """Add hand-crafted lesion features to an ISIC metadata frame.

    The new columns are written into `df` itself (the frame is modified in
    place) and the same object is returned for convenience.

    Ratio features add a small `eps` to the denominator so that a zero
    denominator cannot produce inf/NaN, and most of them are clipped from above
    with `np.minimum`. The clip values (1.015, 0.093, 6.02, ...) are hard-coded;
    their origin is not documented here.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata containing the `tbp_lv_*`, `clin_size_long_diam_mm`,
        `age_approx` and `anatom_site_general` columns read below.

    Returns
    -------
    df : pandas.DataFrame
        The input frame with the engineered columns added.
    new_num_cols : list[str]
        Names of the numeric columns that were added.
    new_cat_cols : list[str]
        Names of the categorical columns that were added.
    """
    eps = 1e-6
    df["lesion_size_ratio"] = np.minimum(df["tbp_lv_minorAxisMM"] / (df["clin_size_long_diam_mm"] + eps), 1.015)
    df["lesion_shape_index"] = np.minimum(df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2 + eps), 0.093)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    # eps keeps this consistent with every other ratio in the function: without
    # it a 0/0 row becomes NaN (which np.minimum and log1p both propagate).
    df["color_uniformity"] = np.log1p(np.minimum(df["tbp_lv_color_std_mean"] / (df["tbp_lv_radial_color_std_max"] + eps), 1000))
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"] = np.minimum(df["tbp_lv_perimeterMM"] / (df["tbp_lv_areaMM2"] + eps), 6.02)
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    # String concatenation; a missing value in either part makes the result missing.
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = np.minimum(df["tbp_lv_stdL"] / (df["tbp_lv_Lext"] + eps), 0.305)

    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    # Depends on border_complexity and lesion_shape_index computed above.
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = np.minimum(df["clin_size_long_diam_mm"] / (df["age_approx"] + eps), 1.59)
    # Despite the name this is the mean of the two hue values, not a difference.
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    # Taken from: https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
    df["color_variance_ratio"] = np.minimum(df["tbp_lv_color_std_mean"] / (df["tbp_lv_stdLExt"] + eps), 7.94)
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = np.minimum(df["clin_size_long_diam_mm"] / (df["tbp_lv_deltaLBnorm"] + eps), 5.08)
    df["age_normalized_nevi_confidence"] = np.minimum(df["tbp_lv_nevi_confidence"] / (df["age_approx"] + eps), 9.42)
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    # Perimeter divided by the circumference of a circle with the same area.
    df["border_length_ratio"] = np.minimum(df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi) + eps), 2.64)
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    # Until here.

    # Every numeric column created above, in creation order.
    new_num_cols = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast", "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",

        "size_age_interaction", "hue_color_std_interaction", "lesion_severity_index",
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index",

        "color_variance_ratio", "border_color_interaction", "size_color_contrast_ratio",
        "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
        "color_range", "shape_color_consistency", "border_length_ratio", "age_size_symmetry_index",
    ]
    new_cat_cols = ["combined_anatomical_site"]
    return df, new_num_cols, new_cat_cols

In [None]:
# Image archives are opened read-only and intentionally left open: later cells
# read individual images from `train_hdf5` by ISIC id.
train_hdf5 = h5py.File(f"{Config.BASE_PATH}{Config.TRAIN_IMAGE_PATH}", 'r')
test_hdf5 = h5py.File(f"{Config.BASE_PATH}{Config.TEST_IMAGE_PATH}", 'r')

train_metadata = pd.read_csv(f"{Config.BASE_PATH}{Config.TRAIN_METADATA_PATH}")
test_metadata = pd.read_csv(f"{Config.BASE_PATH}{Config.TEST_METADATA_PATH}")

# Add features (the lists of new column names are taken from the train call only)
train_metadata, new_num_cols, new_cat_cols = feature_engineering(train_metadata)
test_metadata, _, _ = feature_engineering(test_metadata)

# ISIC ids in metadata row order; they double as keys into the HDF5 files.
fnames = train_metadata["isic_id"].tolist()
test_fnames = test_metadata["isic_id"].tolist()

train_target = train_metadata["target"]

# One stratified train/validation split, so both parts keep the class ratio of `target`.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=Config.VALIDATION_SPLIT,
    random_state=Config.RANDOM_STATE,
)
# n_splits=1, so the generator yields exactly one pair of positional index arrays.
train_index, val_index = next(split.split(train_metadata, train_target))

val_fnames = itemgetter(*val_index)(fnames)
train_fnames = itemgetter(*train_index)(fnames)
X_metadata_train = train_metadata.iloc[train_index]
X_metadata_val = train_metadata.iloc[val_index]
y_train = train_target.iloc[train_index]
y_val = train_target.iloc[val_index]

In [None]:
from tensorflow.keras.applications import MobileNetV3Small
import pandas as pd
import numpy as np
import csv

# ImageNet-pretrained MobileNetV3-Small used as a fixed feature extractor:
# no classification head, global average pooling -> one feature vector per image.
mobileNet = MobileNetV3Small(weights='imagenet', include_top=False, input_shape=(*Config.IMAGE_SIZE, 3), pooling='avg')

BS = 256  # images decoded and pushed through the network per step


def _load_image(hdf5_file, isic_id, image_size):
    """Decode one stored image and return it as a (height, width, 3) array.

    `image_size` is (height, width), matching the model's `input_shape`.
    """
    with Image.open(io.BytesIO(np.array(hdf5_file[isic_id]))) as image:
        # convert("RGB") guarantees exactly three channels (a grayscale or RGBA
        # image would otherwise break the batch shape). PIL's resize expects
        # (width, height), hence the reversed tuple.
        return np.array(image.convert("RGB").resize(image_size[::-1]))


def _extract_features(model, hdf5_file, isic_ids, image_size, batch_size):
    """Run `model` over all images named in `isic_ids`, `batch_size` at a time.

    Returns a float64 array of shape (len(isic_ids), feature_dim) whose rows are
    in the same order as `isic_ids`.
    """
    # Feature width is read from the model instead of being hard-coded.
    feature_dim = model.output_shape[-1]
    features = np.zeros((len(isic_ids), feature_dim))
    # A single stepped loop also covers the final, possibly shorter, batch.
    for start in range(0, len(isic_ids), batch_size):
        batch_ids = isic_ids[start:start + batch_size]
        batch_imgs = np.stack([_load_image(hdf5_file, isic_id, image_size) for isic_id in batch_ids])
        # verbose=0: avoid one progress bar per batch flooding the cell output.
        features[start:start + len(batch_ids)] = model.predict(batch_imgs, verbose=0)
    return features


all_features = _extract_features(mobileNet, train_hdf5, fnames, Config.IMAGE_SIZE, BS)
# Column vector of ids, row-aligned with all_features.
all_isic_ids = np.array(fnames, dtype=object).reshape(-1, 1)

# Create a DataFrame with ISIC IDs and features
df = pd.DataFrame(all_features)
df.insert(0, 'isic_id', fnames)

# Save the DataFrame to a CSV file
name = "224x224"
df.to_csv(f'mobilenet_features_{name}.csv', index=False)
print(f"Features saved to mobilenet_features_{name}.csv. Shape: {df.shape}")

In [None]:
# Sanity check: reload the file written by the previous cell (which defines
# `name`) and show its size plus the first rows rather than the whole frame.
features_csv = pd.read_csv(f'mobilenet_features_{name}.csv')
print(f"Loaded features. Shape: {features_csv.shape}")
features_csv.head()

In [None]:
# Load the cached image features from disk, so this cell can run without
# repeating the feature extraction.
features_csv = pd.read_csv('mobilenet_features_224x224.csv')

# Image features and metadata are joined purely by row position below, so the
# CSV must list exactly the same ids in exactly the same order as train_metadata.
# Fail loudly here instead of silently pairing features with the wrong lesion
# (e.g. when the CSV is stale or was produced from a different metadata file).
if not np.array_equal(features_csv["isic_id"].to_numpy(), train_metadata["isic_id"].to_numpy()):
    raise ValueError(
        "mobilenet_features_224x224.csv is not row-aligned with train_metadata "
        "(isic_id columns differ); regenerate the feature file."
    )

features_without_id = features_csv.drop(columns=["isic_id"])
# train_index / val_index are positional indices from the stratified split.
train_features_without_id = features_without_id.iloc[train_index].reset_index(drop=True)
val_features_without_id = features_without_id.iloc[val_index].reset_index(drop=True)
X_train_without_id = X_metadata_train.drop(columns=["isic_id"]).reset_index(drop=True)
X_val_without_id = X_metadata_val.drop(columns=["isic_id"]).reset_index(drop=True)

# Indexes were reset on both sides, so concat pairs rows by position:
# image-feature columns first, then the metadata columns.
# NOTE(review): X_metadata_* are row slices of train_metadata and only isic_id
# is dropped, so the `target` column is still present in X_*_final — confirm
# that downstream code selects feature columns before fitting a model.
X_train_final = pd.concat([train_features_without_id, X_train_without_id], axis=1)
X_val_final = pd.concat([val_features_without_id, X_val_without_id], axis=1)