In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.metrics import Precision, Recall, AUC
import h5py
import random
import base64
import io
from PIL import Image

In [None]:
# Seed every random number generator this notebook imports so that a fresh
# "Restart & Run All" is repeatable: Python's `random`, NumPy's global
# generator and TensorFlow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Input locations: metadata CSVs and HDF5 image files for the train and test splits.
_ISIC_INPUT_DIR = '/kaggle/input/isic-2024-challenge'

train_metadata_path = f'{_ISIC_INPUT_DIR}/train-metadata.csv'
train_image_hdf5_path = f'{_ISIC_INPUT_DIR}/train-image.hdf5'
test_metadata_path = f'{_ISIC_INPUT_DIR}/test-metadata.csv'
test_image_hdf5_path = f'{_ISIC_INPUT_DIR}/test-image.hdf5'

In [None]:
def load_image_from_hdf5(hdf5_path, image_id):
    """Read one encoded image from an HDF5 file and return it as an RGB array.

    Args:
        hdf5_path: Path of the HDF5 file. The entry stored under `image_id`
            is read in full and treated as an encoded image byte string.
        image_id: Key of the image inside the file.

    Returns:
        The decoded image as a NumPy array in RGB channel order.

    Raises:
        ValueError: If the stored bytes cannot be decoded as an image.
    """
    # The file is opened per call and closed as soon as the raw bytes are read.
    with h5py.File(hdf5_path, 'r') as hdf:
        encoded = hdf[image_id][()]

    # OpenCV decodes from a flat uint8 buffer.
    buffer = np.frombuffer(encoded, dtype=np.uint8)
    image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)

    # imdecode signals failure by returning None rather than raising; without
    # this check the error would surface later as an opaque cvtColor failure.
    if image is None:
        raise ValueError(
            f"Could not decode image '{image_id}' from '{hdf5_path}'"
        )

    # OpenCV decodes to BGR; the rest of the pipeline works in RGB.
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [None]:
def preprocess_image(image, target_size=(224, 224)):
    """Resize an image and rescale its values by 1/255 as float32.

    Args:
        image: Image array accepted by `cv2.resize`.
        target_size: Passed unchanged to `cv2.resize` as the output size.
            NOTE(review): OpenCV interprets this as (width, height); that only
            matters for non-square sizes.

    Returns:
        The resized image as a float32 array divided by 255.
    """
    resized = cv2.resize(image, target_size)
    # Cast first so the division happens in float32.
    as_float = resized.astype(np.float32)
    return as_float / 255.0

In [None]:
# Columns removed because they are identifiers or licensing/acquisition details.
_NON_FEATURE_COLUMNS = ['patient_id', 'copyright_license', 'attribution', 'image_type',
                        'tbp_tile_type']

# Columns that are only meaningful for training rows (lesion id and
# diagnosis-derived fields). Keeping them as features would leak the label into
# the model and give the training matrix more columns than the test matrix.
# NOTE(review): 'tbp_lv_dnn_lesion_confidence' is assumed to be train-only as
# well - confirm against the dataset description.
_TRAIN_ONLY_COLUMNS = ['lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4',
                       'iddx_5', 'mel_mitotic_index', 'mel_thick_mm',
                       'tbp_lv_dnn_lesion_confidence']

# Numeric columns that are median-imputed and standardized.
_NUMERIC_COLUMNS = ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
                    'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext',
                    'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
                    'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity',
                    'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
                    'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
                    'tbp_lv_symm_2axis', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z']

# Categorical columns that are one-hot encoded.
_CATEGORICAL_COLUMNS = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']


def load_and_preprocess_data(metadata_path, hdf5_path, is_train=True):
    """Load a metadata CSV and turn it into a model-ready feature table.

    Steps: drop non-feature and train-only columns, median-impute and
    standardize the numeric columns, one-hot encode the categorical columns
    (missing values become the category 'Unknown'), and cast 'target' to int
    for training data.

    The imputer and scaler are fitted on the file being loaded, so train and
    test are each standardized with their own statistics. One-hot columns are
    named after the category values, so the same category yields the same
    column name in both files; a category missing from one file still has to
    be aligned by the caller.

    Args:
        metadata_path: Path of the metadata CSV.
        hdf5_path: Path of the matching image HDF5 file; returned unchanged.
        is_train: When True the 'target' column is required and cast to int.

    Returns:
        Tuple `(data, hdf5_path)` where `data` is the processed DataFrame with
        a fresh 0..n-1 index.
    """
    data = pd.read_csv(metadata_path, low_memory=False)

    # errors='ignore' lets the same drop list serve train and test files.
    data = data.drop(columns=_NON_FEATURE_COLUMNS + _TRAIN_ONLY_COLUMNS, errors='ignore')

    # Median-impute, then standardize, the numeric block.
    imputer = SimpleImputer(strategy='median')
    data[_NUMERIC_COLUMNS] = imputer.fit_transform(data[_NUMERIC_COLUMNS])
    scaler = StandardScaler()
    data[_NUMERIC_COLUMNS] = scaler.fit_transform(data[_NUMERIC_COLUMNS])

    # One-hot encode straight from the category strings. Integer-coding them
    # first would make the dummy column names depend on which categories happen
    # to be present in this particular file.
    present_categoricals = [col for col in _CATEGORICAL_COLUMNS if col in data.columns]
    for col in present_categoricals:
        data[col] = data[col].fillna('Unknown').astype(str)
    if present_categoricals:
        # dtype=float keeps the whole feature table numeric.
        data = pd.get_dummies(data, columns=present_categoricals, dtype=float)

    if is_train:
        # Ensure target is int
        data['target'] = data['target'].astype(int)

    data = data.reset_index(drop=True)

    return data, hdf5_path

In [None]:
class HDF5DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that yields image + tabular batches for the two-input model.

    Images are read from an HDF5 file by 'isic_id'; every column of `data`
    other than 'isic_id' and 'target' is used as a tabular feature.

    When `data` has a 'target' column each batch is `(inputs, y)`. When it has
    none (e.g. test metadata) each batch is just `inputs`, so the same class
    can feed prediction instead of failing on the missing column.

    Args:
        data: DataFrame with an 'isic_id' column, feature columns and,
            optionally, a 'target' column.
        hdf5_path: Path of the HDF5 file holding the encoded images.
        batch_size: Number of rows per batch (the last batch may be smaller).
        dim: Image size handed to `preprocess_image` and used for the batch
            buffer's two spatial axes.
        n_channels: Number of image channels in the batch buffer.
        shuffle: Reshuffle the row order at construction and after each epoch.
        **kwargs: Forwarded to the Keras base class constructor.
    """

    def __init__(self, data, hdf5_path, batch_size=32, dim=(224, 224), n_channels=3, shuffle=True, **kwargs):
        # Newer Keras versions expect the base constructor to be called.
        super().__init__(**kwargs)
        self.data = data
        self.hdf5_path = hdf5_path
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.has_target = 'target' in data.columns
        self.feature_columns = [col for col in data.columns if col not in ('isic_id', 'target')]
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, index):
        """Return batch `index` as `(inputs, y)`, or `inputs` alone without targets."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        batch = self.data.iloc[indexes]
        X, y = self.__data_generation(batch)
        inputs = {'image_input': X[0], 'tabular_input': X[1]}
        if y is None:
            return inputs
        return (inputs, y)

    def on_epoch_end(self):
        """Reset the row order, reshuffling it when `shuffle` is enabled."""
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch):
        """Build the image array, tabular array and labels for the rows in `batch`."""
        # float32 matches what preprocess_image produces and halves the buffer size.
        X_img = np.empty((len(batch), *self.dim, self.n_channels), dtype=np.float32)
        for i, image_id in enumerate(batch['isic_id']):
            img = load_image_from_hdf5(self.hdf5_path, image_id)
            X_img[i,] = preprocess_image(img, self.dim)

        # One vectorized conversion instead of dropping columns row by row.
        X_tab = batch[self.feature_columns].to_numpy(dtype=np.float32)

        y = None
        if self.has_target:
            y = batch['target'].to_numpy(dtype=int).reshape(-1, 1)

        return [X_img, X_tab], y

In [None]:
# Binary focal loss, returned as a closure over gamma and alpha
def focal_loss(gamma=2., alpha=.25):
    """Create a focal-loss function with the given focusing and weighting terms.

    Args:
        gamma: Exponent applied to the modulating factor of each term.
        alpha: Weight of the positive-class term; negatives get `1 - alpha`.

    Returns:
        A function `(y_true, y_pred) -> scalar loss`.
    """
    def focal_loss_fixed(y_true, y_pred):
        is_positive = tf.equal(y_true, 1)
        is_negative = tf.equal(y_true, 0)

        # Entries of the other class are filled with the value that makes
        # their contribution vanish (1 for the positive term, 0 for the negative).
        pt_1 = tf.where(is_positive, y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(is_negative, y_pred, tf.zeros_like(y_pred))

        positive_term = alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1 + K.epsilon())
        negative_term = (1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0 + K.epsilon())

        return -K.mean(positive_term) - K.mean(negative_term)
    return focal_loss_fixed

In [None]:
# Function to balance dataset
def balance_dataset(data, undersample_ratio=0.5):
    """Undersample the majority class (target == 0) and keep every minority row.

    The number of majority rows kept is `len(minority) / (1 - undersample_ratio)`
    (so the default 0.5 keeps twice as many negatives as positives), capped at
    the number of majority rows available. Rows are drawn WITHOUT replacement:
    duplicated majority rows would otherwise be able to land in both the
    training and the validation side of a later split.

    Args:
        data: DataFrame with a binary 'target' column.
        undersample_ratio: Value below 1 controlling how many majority rows
            are kept, as described above.

    Returns:
        DataFrame with the sampled majority rows followed by all minority
        rows, re-indexed from 0.

    Raises:
        ValueError: If `undersample_ratio` is not below 1.
    """
    # A ratio of 1 would divide by zero; larger values give a negative count.
    if not undersample_ratio < 1:
        raise ValueError("undersample_ratio must be less than 1")

    majority_class = data[data['target'] == 0]
    minority_class = data[data['target'] == 1]

    n_requested = int(len(minority_class) / (1 - undersample_ratio))
    # Never ask for more rows than exist; that is only possible with replacement.
    n_majority = min(n_requested, len(majority_class))

    # DataFrame.sample draws without replacement by default.
    majority_undersampled = majority_class.sample(n=n_majority, random_state=42)

    balanced_data = pd.concat([majority_undersampled, minority_class])

    return balanced_data.reset_index(drop=True)

In [None]:
def create_model(img_shape, tab_shape):
    """Build and compile the two-branch (image + tabular) binary classifier.

    Args:
        img_shape: Shape of one image, used for the 'image_input' layer.
        tab_shape: Number of tabular features, used for the 'tabular_input' layer.

    Returns:
        A compiled Keras model with a single sigmoid output, trained with focal
        loss and reporting accuracy, AUC, precision and recall.
    """
    # --- Image branch: two conv / batch-norm / max-pool stages, then a dense head
    img_input = Input(shape=img_shape, name='image_input')
    img_features = img_input
    for n_filters in (32, 64):
        img_features = Conv2D(n_filters, (3, 3), activation='relu')(img_features)
        img_features = BatchNormalization()(img_features)
        img_features = MaxPooling2D((2, 2))(img_features)
    img_features = Flatten()(img_features)
    img_features = Dense(64, activation='relu')(img_features)
    img_features = Dropout(0.3)(img_features)

    # --- Tabular branch: one dense block
    tab_input = Input(shape=(tab_shape,), name='tabular_input')
    tab_features = Dense(64, activation='relu')(tab_input)
    tab_features = BatchNormalization()(tab_features)
    tab_features = Dropout(0.3)(tab_features)

    # --- Fusion head
    merged = Concatenate()([img_features, tab_features])
    head = Dense(32, activation='relu')(merged)
    head = BatchNormalization()(head)
    head = Dropout(0.3)(head)
    output = Dense(1, activation='sigmoid')(head)

    model = Model(inputs=[img_input, tab_input], outputs=output)

    optimizer = Adam(learning_rate=1e-4)
    loss_fn = focal_loss(alpha=.25, gamma=2)
    tracked_metrics = [
        'accuracy',
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall'),
    ]
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked_metrics)

    return model

In [None]:
def _build_training_callbacks(checkpoint_path):
    """Return the callbacks shared by every fit: early stopping, LR decay, val-AUC checkpoint."""
    return [
        # These two use their default monitored quantity; only the checkpoint
        # below is told to track 'val_auc'.
        EarlyStopping(patience=10, restore_best_weights=True),
        ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6),
        ModelCheckpoint(
            checkpoint_path,
            monitor='val_auc',
            mode='max',
            save_best_only=True,
            verbose=1
        ),
    ]


def train_model(train_generator, val_generator=None, n_splits=5, epochs=50):
    """Train the image + tabular model and return the best one found.

    Without `val_generator`, the generator's data is split with stratified
    k-fold cross-validation; one model is trained per fold and the model with
    the highest validation AUC is kept. With `val_generator`, a single model is
    trained on `train_generator` and validated on `val_generator`.

    Args:
        train_generator: HDF5DataGenerator over the training rows. Its
            `data`, `hdf5_path`, `batch_size`, `dim` and `n_channels` are
            reused for the per-fold generators and the model's input shape.
        val_generator: Optional validation data for the single-model mode.
        n_splits: Number of cross-validation folds.
        epochs: Maximum number of epochs per fit.

    Returns:
        The selected Keras model, which is also saved to
        'best_model_overall.keras'.
    """
    train_data = train_generator.data
    # Take the input shapes from the generator instead of repeating literals,
    # so a non-default generator and the model cannot drift apart.
    img_shape = (*train_generator.dim, train_generator.n_channels)
    tab_shape = train_data.shape[1] - 2  # every column except 'isic_id' and 'target'

    best_model = None
    # -inf (not 0) so the first trained model is always accepted; it is also
    # defined on both code paths so the final report cannot hit an undefined name.
    best_auc = float('-inf')

    if val_generator is None:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        for fold, (train_idx, val_idx) in enumerate(skf.split(train_data, train_data['target'])):
            print(f"Training fold {fold + 1}")

            train_data_fold = train_data.iloc[train_idx].reset_index(drop=True)
            val_data_fold = train_data.iloc[val_idx].reset_index(drop=True)

            train_gen_fold = HDF5DataGenerator(
                train_data_fold, train_generator.hdf5_path,
                batch_size=train_generator.batch_size,
                dim=train_generator.dim,
                n_channels=train_generator.n_channels
            )
            # Validation order does not need to change between epochs.
            val_gen_fold = HDF5DataGenerator(
                val_data_fold, train_generator.hdf5_path,
                batch_size=train_generator.batch_size,
                dim=train_generator.dim,
                n_channels=train_generator.n_channels,
                shuffle=False
            )

            model = create_model(img_shape, tab_shape)

            model.fit(
                train_gen_fold,
                validation_data=val_gen_fold,
                epochs=epochs,
                callbacks=_build_training_callbacks(f'best_model_fold_{fold+1}.keras')
            )

            # Order follows model.compile: loss, then the metrics list.
            val_loss, val_accuracy, val_auc, val_precision, val_recall = model.evaluate(val_gen_fold)
            print(f"Fold {fold + 1} - Validation Loss: {val_loss:.4f}, "
                  f"Accuracy: {val_accuracy:.4f}, AUC: {val_auc:.4f}, "
                  f"Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")

            # Epsilon keeps the F1 finite when precision and recall are both 0.
            f1_score = 2 * (val_precision * val_recall) / (val_precision + val_recall + K.epsilon())
            print(f"F1-score: {f1_score:.4f}")

            if best_model is None or val_auc > best_auc:
                best_auc = val_auc
                best_model = model

    else:
        model = create_model(img_shape, tab_shape)

        model.fit(
            train_generator,
            validation_data=val_generator,
            epochs=epochs,
            callbacks=_build_training_callbacks('best_model.keras')
        )

        # Measure the AUC of the model that is actually returned.
        _, _, best_auc, _, _ = model.evaluate(val_generator)
        best_model = model

    # Save the overall best model
    best_model.save('best_model_overall.keras')
    print(f"Best model saved with validation AUC: {best_auc:.4f}")

    return best_model

In [None]:
def check_class_distribution(data):
    """Print the size and share of each 'target' class and the max/min count ratio.

    Args:
        data: DataFrame with a 'target' column.
    """
    counts = data['target'].value_counts()
    total_rows = len(data)

    print("Class Distribution:")
    for label, n_samples in counts.items():
        share = n_samples / total_rows * 100
        print(f"Class {label}: {n_samples} samples ({share:.2f}%)")

    # Largest class size relative to the smallest one.
    ratio = counts.max() / counts.min()
    print(f"\nImbalance Ratio: {ratio:.2f}")

In [None]:
def evaluate_on_test_set(model, test_gen, test_metadata):
    """Predict every batch of `test_gen` and write 'submission.csv'.

    Args:
        model: Object with a `predict_on_batch(inputs)` method.
        test_gen: Indexable batch source with `len()`. A batch may be the
            inputs alone or a tuple whose first element is the inputs. If the
            generator exposes an `indexes` attribute it is taken to be the
            row positions (into `test_metadata`) in the order the batches
            serve them - assumes the generator was built from `test_metadata`.
        test_metadata: DataFrame with an 'isic_id' column, one row per prediction.

    Returns:
        1-D array of predictions aligned with the rows of `test_metadata`.

    Raises:
        ValueError: If any batch fails, or if the number of predictions does
            not match the metadata (or the generator's row order).
    """
    n_batches = len(test_gen)

    # Snapshot the serving order before iterating so predictions can be put
    # back into metadata order even if the generator shuffled its rows.
    order = getattr(test_gen, 'indexes', None)
    if order is not None:
        order = np.array(order, copy=True)

    all_predictions = []
    failed_batches = []
    for i in range(n_batches):
        try:
            batch = test_gen[i]
            # Generators that also yield labels return (inputs, y, ...);
            # the model must only be given the inputs.
            inputs = batch[0] if isinstance(batch, tuple) else batch
            predictions = np.asarray(model.predict_on_batch(inputs))
            all_predictions.append(predictions)
            print(f"Successfully predicted batch {i} with shape {predictions.shape}")
        except Exception as e:
            failed_batches.append(i)
            print(f"Error predicting batch {i}: {e}")

    print(f"Total batches processed: {n_batches}")
    print(f"Number of successful predictions: {len(all_predictions)}")

    if not all_predictions:
        raise ValueError("No predictions were made successfully.")
    # A partial result cannot be matched to IDs, so fail with a clear message
    # instead of a length-mismatch error further down.
    if failed_batches:
        raise ValueError(
            f"Prediction failed for batches {failed_batches}; "
            "refusing to write a partial submission."
        )

    predictions = np.concatenate(all_predictions).flatten()

    if len(predictions) != len(test_metadata):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_metadata)} metadata rows."
        )

    if order is not None:
        if len(order) != len(predictions):
            raise ValueError(
                f"Generator order has {len(order)} entries but {len(predictions)} predictions were made."
            )
        # predictions[k] belongs to metadata row order[k]; undo the permutation.
        aligned = np.empty_like(predictions)
        aligned[order] = predictions
        predictions = aligned

    print(f"Final predictions shape: {predictions.shape}")

    # Create submission DataFrame
    submission = pd.DataFrame({
        'isic_id': test_metadata['isic_id'].to_numpy(),
        'target': predictions
    })

    # Save submission file
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to 'submission.csv'")

    return predictions

In [None]:
if __name__ == "__main__":
    # Load and preprocess training data
    train_data, train_hdf5_path = load_and_preprocess_data(train_metadata_path, train_image_hdf5_path, is_train=True)

    print("Original data distribution:")
    check_class_distribution(train_data)

    # Undersample the majority class before training
    balanced_data = balance_dataset(train_data)

    print("\nBalanced data distribution:")
    check_class_distribution(balanced_data)

    # Create data generators
    train_generator = HDF5DataGenerator(balanced_data, train_hdf5_path)

    # Train model (cross-validated, because no validation generator is passed)
    best_model = train_model(train_generator)

    # Load and preprocess test data
    test_data, test_hdf5_path = load_and_preprocess_data(test_metadata_path, test_image_hdf5_path, is_train=False)

    # Give the test table exactly the training feature columns, in the same
    # order: columns the test file lacks are filled with 0 and columns the
    # model never saw are dropped, so the tabular input width matches.
    feature_columns = [col for col in train_data.columns if col != 'target']
    test_data = test_data.reindex(columns=feature_columns, fill_value=0)

    # The test generator must not shuffle: predictions are written next to
    # the IDs in metadata order.
    test_generator = HDF5DataGenerator(test_data, test_hdf5_path, shuffle=False)

    # Make predictions on the test set; this also writes and reports 'submission.csv'
    predictions = evaluate_on_test_set(best_model, test_generator, test_data)
