In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import cv2
import gc
import logging
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import backend as K
from tensorflow.keras.metrics import Precision, Recall, AUC
import h5py
import math
from tensorflow.keras.utils import Progbar
import time
from tensorflow.keras import mixed_precision

2024-08-29 17:54:44.550115: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-29 17:54:44.550242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-29 17:54:44.701036: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Set random seed for reproducibility
# Seeds NumPy's legacy global RNG and TensorFlow's global seed with the same value.
# NOTE(review): Python's built-in `random` module is not seeded here.
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# Set memory growth for GPU
# Turns on memory growth for every GPU that TensorFlow lists; does nothing when
# no GPU is found.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: the error is printed and the notebook continues.
        # NOTE(review): presumably raised when the GPUs were already initialized - confirm.
        print(e)

In [4]:
# Set mixed precision policy
# Installs 'mixed_float16' as the global Keras policy. create_model() overrides
# the dtype of its final Dense layer to float32 explicitly.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [5]:
# File paths
# Metadata CSVs and image HDF5 files for the train and test splits. The paths
# are absolute Kaggle input locations, so the notebook only runs as-is where
# this dataset is mounted at /kaggle/input.
train_metadata_path = '/kaggle/input/isic-2024-challenge/train-metadata.csv'
train_image_hdf5_path = '/kaggle/input/isic-2024-challenge/train-image.hdf5'
test_metadata_path = '/kaggle/input/isic-2024-challenge/test-metadata.csv'
test_image_hdf5_path = '/kaggle/input/isic-2024-challenge/test-image.hdf5'

In [6]:
def load_image_from_hdf5(hdf5_path, image_id):
    """Read one encoded image from an HDF5 file and return it decoded as RGB.

    Args:
        hdf5_path: Path of the HDF5 file; images are looked up by key.
        image_id: Key of the image inside the file (converted to ``str``).

    Returns:
        The image decoded by OpenCV with ``cv2.IMREAD_COLOR`` and converted
        from BGR to RGB channel order.

    Raises:
        KeyError: If ``image_id`` is not a key of the HDF5 file.
        ValueError: If the stored payload is empty or cannot be decoded.
    """
    image_id = str(image_id)

    # The file is opened per call and closed before decoding starts.
    with h5py.File(hdf5_path, 'r') as hdf:
        if image_id not in hdf:
            raise KeyError(f"Image ID {image_id} not found in HDF5 file")
        image_data = hdf[image_id][()]

    # View the raw payload as a flat byte buffer for OpenCV.
    image_array = np.frombuffer(image_data, dtype=np.uint8)
    if image_array.size == 0:
        raise ValueError(f"Image ID {image_id} has an empty payload")

    # imdecode signals failure by returning None rather than raising, which
    # would otherwise surface later as an opaque cvtColor assertion error.
    image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Image ID {image_id} could not be decoded")

    # OpenCV decodes to BGR; the rest of the pipeline works in RGB.
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [7]:
def preprocess_image(image, target_size=(224, 224)):
    """Resize ``image`` to ``target_size`` and return it as float32 divided by 255.

    Args:
        image: Image array accepted by ``cv2.resize``.
        target_size: Size tuple handed to ``cv2.resize`` unchanged.

    Returns:
        The resized image cast to ``np.float32`` and divided by 255.0.
    """
    resized = cv2.resize(image, target_size)
    as_float = resized.astype(np.float32)
    return as_float / 255.0

In [8]:
def load_and_preprocess_data(metadata_path, hdf5_path, is_train=True, train_columns=None, train_encoders=None):
    """Load a metadata CSV and turn it into model-ready tabular features.

    The numeric columns are median-imputed and standardized, the categorical
    columns are label-encoded and then one-hot encoded. In training mode the
    transformers are fitted and returned; in test mode the fitted transformers
    and the training column layout are re-applied so both splits share the
    same feature columns.

    Args:
        metadata_path: Path of the metadata CSV.
        hdf5_path: Path of the matching image HDF5 file (returned unchanged).
        is_train: Fit transformers and split off ``target`` when True.
        train_columns: Column index returned by the training call; required
            when ``is_train`` is False.
        train_encoders: Dict with ``'imputer'``, ``'label_encoders'`` and
            ``'scaler'`` returned by the training call; required when
            ``is_train`` is False.

    Returns:
        Training mode: ``(features, labels, isic_ids, hdf5_path,
        train_columns, train_encoders)``.
        Test mode: ``(features, isic_ids, hdf5_path)``.
        ``features`` keeps the ``isic_id`` column in both modes.

    Raises:
        ValueError: On an empty CSV, or when test mode is requested without
            ``train_columns`` / ``train_encoders``.
        Exception: Any other failure is logged and re-raised unchanged.

    NOTE(review): the training features keep diagnosis-derived columns
    (``iddx_*``, ``mel_*``); in test mode any training column that is absent
    is filled with 0. Confirm these should really be model inputs.
    """
    try:
        if not is_train and (train_columns is None or train_encoders is None):
            raise ValueError("train_columns and train_encoders are required when is_train=False")

        data = pd.read_csv(metadata_path, low_memory=False)

        # Handle empty dataset
        if data.empty:
            raise ValueError("Empty dataset")

        # Store 'isic_id' separately (the column itself stays in the features)
        isic_ids = data['isic_id']

        # Drop unnecessary columns
        columns_to_drop = ['patient_id', 'copyright_license', 'attribution', 'image_type',
                           'tbp_tile_type', 'lesion_id']
        data = data.drop(columns=columns_to_drop, errors='ignore')

        # Handle missing values in numeric columns
        numeric_columns = ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
                           'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext',
                           'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
                           'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity',
                           'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
                           'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
                           'tbp_lv_symm_2axis', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z']

        if is_train:
            imputer = SimpleImputer(strategy='median')
            data[numeric_columns] = imputer.fit_transform(data[numeric_columns])
        else:
            # Use the imputer fitted on training data
            data[numeric_columns] = train_encoders['imputer'].transform(data[numeric_columns])

        # Encode categorical variables
        categorical_columns = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
        if is_train:
            categorical_columns.extend(['iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5'])
            label_encoders = {}
            for col in categorical_columns:
                if col in data.columns:
                    le = LabelEncoder()
                    data[col] = data[col].fillna('Unknown')
                    data[col] = le.fit_transform(data[col].astype(str))
                    label_encoders[col] = le
        else:
            # Re-use the training label codes. A category that never occurred
            # in training gets code -1 instead of crashing transform(); its
            # one-hot column is discarded by the column alignment below, so
            # the sample ends up with all-zero indicators for that variable.
            for col in categorical_columns:
                if col in data.columns:
                    classes = train_encoders['label_encoders'][col].classes_
                    code_by_label = {label: code for code, label in enumerate(classes)}
                    data[col] = (
                        data[col].fillna('Unknown').astype(str)
                        .map(code_by_label).fillna(-1).astype(int)
                    )

        # One-hot encode relevant categorical variables
        categorical_columns_to_onehot = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
        if is_train:
            data = pd.get_dummies(data, columns=categorical_columns_to_onehot)
            train_columns = data.columns
        else:
            # The test split must be one-hot encoded too; otherwise every
            # indicator column would simply be filled with 0 below.
            present = [col for col in categorical_columns_to_onehot if col in data.columns]
            data = pd.get_dummies(data, columns=present)
            # Align with the training layout. 'target' is a label, not a
            # feature, so it must not be recreated as an all-zero column.
            feature_columns = [col for col in train_columns if col != 'target']
            data = data.reindex(columns=feature_columns, fill_value=0)

        # Scale numerical features
        if is_train:
            scaler = StandardScaler()
            data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
        else:
            # Use the scaler fitted on training data
            data[numeric_columns] = train_encoders['scaler'].transform(data[numeric_columns])

        if is_train:
            # Handle 'mel_mitotic_index' if present
            if 'mel_mitotic_index' in data.columns:
                mitotic_index_mapping = {
                    '<1/mm^2': 0, '0/mm^2': 0, '1/mm^2': 1, '2/mm^2': 2,
                    '3/mm^2': 3, '4/mm^2': 4, '>4/mm^2': 5
                }
                data['mel_mitotic_index'] = data['mel_mitotic_index'].map(mitotic_index_mapping).fillna(-1)

            # Handle 'mel_thick_mm' if present
            if 'mel_thick_mm' in data.columns:
                data['mel_thick_mm'] = pd.to_numeric(data['mel_thick_mm'], errors='coerce').fillna(-1)

        # Reset index
        data = data.reset_index(drop=True)

        # Print column names for debugging
        print(f"{'Train' if is_train else 'Test'} columns:", data.columns)

        if is_train:
            train_encoders = {
                'imputer': imputer,
                'label_encoders': label_encoders,
                'scaler': scaler
            }
            features = data
            labels = features.pop('target')
            return features, labels, isic_ids, hdf5_path, train_columns, train_encoders
        else:
            features = data
            return features, isic_ids, hdf5_path

    except Exception as e:
        # Look the logger up here: a module-level `logger` only exists once the
        # main cell has run, and a NameError would hide the real failure.
        logging.getLogger(__name__).error(f"Error in load_and_preprocess_data: {str(e)}")
        raise

In [9]:
def augment_image(image):
    """Apply a chain of random augmentations to one image.

    The chain is: horizontal flip, vertical flip, rotation by a random
    multiple of 90 degrees, brightness, contrast, hue and saturation jitter,
    clipping to [0, 1], a random 200x200 crop and a resize back to 224x224.

    Args:
        image: Image array or tensor. The crop requests 3 channels and a
            200x200 window, so smaller inputs fail and take the fallback path.

    Returns:
        The augmented tensor, or the untouched input if any step raises.
    """
    # Keep a handle on the input so the fallback really returns the original
    # rather than a half-augmented intermediate.
    original_image = image
    try:
        # Convert to tensor if it's not already
        augmented = tf.convert_to_tensor(image)

        # Random horizontal and vertical flips
        augmented = tf.image.random_flip_left_right(augmented)
        augmented = tf.image.random_flip_up_down(augmented)

        # Random rotation by k * 90 degrees, k drawn from {0, 1, 2, 3}
        quarter_turns = tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)
        augmented = tf.image.rot90(augmented, k=quarter_turns)

        # Photometric jitter
        augmented = tf.image.random_brightness(augmented, max_delta=0.2)
        augmented = tf.image.random_contrast(augmented, lower=0.8, upper=1.2)
        augmented = tf.image.random_hue(augmented, max_delta=0.2)
        augmented = tf.image.random_saturation(augmented, lower=0.8, upper=1.2)

        # Ensure pixel values are in [0, 1] range
        augmented = tf.clip_by_value(augmented, 0, 1)

        # Random crop followed by a resize back to the model input size
        augmented = tf.image.random_crop(augmented, [200, 200, 3])
        augmented = tf.image.resize(augmented, [224, 224])

        return augmented
    except Exception as e:
        print(f"Error in augment_image: {str(e)}")
        return original_image  # Return original image if augmentation fails

In [10]:
def create_dataset(features, labels, hdf5_path, batch_size, is_train=True):
    """Build a batched ``tf.data`` pipeline of (image, tabular) inputs.

    Args:
        features: DataFrame with an ``isic_id`` column plus numeric feature
            columns; ``isic_id`` selects the image and is not fed to the model.
        labels: Integer labels aligned positionally with ``features``; only
            read when ``is_train`` is True.
        hdf5_path: HDF5 file the images are loaded from.
        batch_size: Number of samples per batch.
        is_train: When True, elements are ``((image, tabular), label)``, the
            sample order is reshuffled on every pass and the dataset repeats
            forever. When False, elements are ``(image, tabular)`` in the
            original order for a single pass.

    Returns:
        The batched, prefetched ``tf.data.Dataset``.

    Raises:
        KeyError: If ``features`` has no ``isic_id`` column.
        ValueError: If a feature column cannot be converted to float32.

    Note:
        Samples whose image cannot be loaded are reported and skipped, so in
        inference mode the number of predictions can be smaller than
        ``len(features)``.
    """
    # Hoist the per-row pandas work out of the generator. This also fails
    # fast on non-numeric feature columns, before any image is read.
    image_ids = features['isic_id'].to_numpy()
    tabular = features.drop(columns=['isic_id']).to_numpy(dtype=np.float32)
    label_values = np.asarray(labels, dtype=np.int32) if is_train else None
    n_samples = len(image_ids)

    def generator():
        # Shuffle sample indices rather than decoded images: a shuffle buffer
        # placed after batch() only reorders whole batches, and sizing it to
        # len(features) means decoding the entire dataset before the first
        # batch is produced. repeat() re-invokes the generator, so every pass
        # draws a fresh permutation.
        order = np.random.permutation(n_samples) if is_train else np.arange(n_samples)
        for i in order:
            img_id = image_ids[i]
            try:
                img = load_image_from_hdf5(hdf5_path, img_id)
                img_processed = preprocess_image(img)
            except Exception as e:
                print(f"Error processing sample {i} with ID {img_id}: {str(e)}")
                continue
            if is_train:
                yield (img_processed, tabular[i]), label_values[i]
            else:
                yield (img_processed, tabular[i])

    input_signature = (
        tf.TensorSpec(shape=(224, 224, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(features.shape[1] - 1,), dtype=tf.float32),  # Subtract 1 for 'isic_id'
    )
    if is_train:
        output_signature = (input_signature, tf.TensorSpec(shape=(), dtype=tf.int32))
    else:
        output_signature = input_signature

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=output_signature
    )

    dataset = dataset.batch(batch_size)
    if is_train:
        dataset = dataset.repeat()
    # prefetch goes last so it overlaps production of the final batches
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    # Pull one batch as a smoke test of the pipeline
    for _ in dataset.take(1):
        break

    print(f"Dataset created successfully with {len(features)} samples")
    return dataset

In [11]:
# Focal Loss implementation
def focal_loss(gamma=2., alpha=.25):
    """Return a binary focal loss function ``loss(y_true, y_pred)``.

    Args:
        gamma: Exponent of the modulating factor.
        alpha: Weight of the positive-class term; the negative-class term is
            weighted by ``1 - alpha``.

    Returns:
        A function that computes the scalar loss from labels in {0, 1} and
        predicted probabilities.
    """
    def focal_loss_fixed(y_true, y_pred):
        y_pred = tf.convert_to_tensor(y_pred)
        # Align labels with predictions. Without this, labels shaped (batch,)
        # against predictions shaped (batch, 1) broadcast to (batch, batch)
        # in tf.where and the loss silently mixes unrelated samples.
        y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))

        # Probability of the true class; neutral values where the sample
        # belongs to the other class so that term contributes zero.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1 + K.epsilon()))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0 + K.epsilon()))
        return -positive_term - negative_term
    return focal_loss_fixed

In [12]:
def balance_dataset(features, labels, undersample_ratio=0.5):
    """Undersample the majority class (label 0) and keep every minority row.

    The number of majority rows kept is
    ``int(n_minority / (1 - undersample_ratio))``, capped at the number of
    majority rows available. Rows are drawn without replacement, so no
    majority sample appears twice in the result.

    Args:
        features: DataFrame of samples.
        labels: Series of 0/1 labels sharing ``features``' index.
        undersample_ratio: Value in ``[0, 1)``; 0.5 keeps two majority rows
            per minority row.

    Returns:
        ``(balanced_features, balanced_labels)`` with fresh 0..n-1 indexes,
        majority rows first and minority rows after them.

    Raises:
        ValueError: If ``undersample_ratio`` is outside ``[0, 1)``.
    """
    if not 0 <= undersample_ratio < 1:
        raise ValueError("undersample_ratio must be in the range [0, 1)")

    print(f"Starting balance_dataset with {len(features)} samples")
    majority_class = features[labels == 0]
    minority_class = features[labels == 1]
    print(f"Majority class: {len(majority_class)}, Minority class: {len(minority_class)}")

    # Never ask for more majority rows than exist; sampling is without replacement.
    n_requested = int(len(minority_class) / (1 - undersample_ratio))
    n_majority = min(n_requested, len(majority_class))
    print(f"Number of majority samples to keep: {n_majority}")

    majority_undersampled = majority_class.sample(n=n_majority, replace=False, random_state=42)

    balanced_features = pd.concat([majority_undersampled, minority_class])
    balanced_labels = pd.Series([0] * len(majority_undersampled) + [1] * len(minority_class))

    print(f"Balanced dataset size: {len(balanced_features)}")
    return balanced_features.reset_index(drop=True), balanced_labels.reset_index(drop=True)


In [13]:
def hyperparameter_tuning(train_data, train_hdf5_path, val_data, val_hdf5_path):
    """Grid-search learning rate and batch size using the notebook's own helpers.

    For every combination the cross-validated models from ``train_model`` are
    evaluated on the validation split, and the combination with the highest
    mean validation AUC wins.

    Args:
        train_data: Training DataFrame. Assumed to contain ``isic_id``, the
            feature columns and a ``target`` column - TODO confirm with callers.
        train_hdf5_path: HDF5 file holding the training images.
        val_data: Validation DataFrame with the same layout as ``train_data``.
        val_hdf5_path: HDF5 file holding the validation images.

    Returns:
        Dict with the best ``learning_rate`` and ``batch_size``; empty if no
        combination produced a usable model.
    """
    learning_rates = [1e-3, 1e-4, 1e-5]
    batch_sizes = [16, 32, 64]
    best_auc = 0
    best_params = {}

    # train_model and create_dataset take features and labels separately.
    train_labels = train_data['target']
    train_features = train_data.drop(columns=['target'])
    val_labels = val_data['target']
    val_features = val_data.drop(columns=['target'])

    for lr in learning_rates:
        for bs in batch_sizes:
            print(f"Training with learning rate: {lr}, batch size: {bs}")
            fold_models, _ = train_model(train_features, train_labels, train_hdf5_path,
                                         n_splits=5, epochs=30, batch_size=bs, learning_rate=lr)
            if not fold_models:
                print(f"No model was trained for learning rate: {lr}, batch size: {bs}")
                continue

            # Evaluate the models. The labelled (is_train=True) dataset repeats
            # forever, so the number of evaluation steps must be given.
            val_dataset = create_dataset(val_features, val_labels, val_hdf5_path, batch_size=bs, is_train=True)
            val_steps = math.ceil(len(val_features) / bs)
            # evaluate() returns [loss, accuracy, auc, precision, recall]
            fold_aucs = [model.evaluate(val_dataset, steps=val_steps, verbose=0)[2] for model in fold_models]
            val_auc = float(np.mean(fold_aucs))

            if val_auc > best_auc:
                best_auc = val_auc
                best_params = {'learning_rate': lr, 'batch_size': bs}

    print(f"Best parameters: {best_params}")
    return best_params

In [14]:
def create_model(img_shape, tab_shape, learning_rate=1e-4):
    """Build and compile the two-branch (image + tabular) binary classifier.

    Args:
        img_shape: Input shape handed to EfficientNetB0.
        tab_shape: Number of tabular features per sample.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` whose inputs are ``[image, tabular]`` and
        whose output is a single sigmoid unit.
    """
    print(f"Creating model with img_shape={img_shape}, tab_shape={tab_shape}")

    # Image branch: randomly initialised EfficientNetB0 backbone + global pooling.
    # NOTE(review): Keras EfficientNet models are documented to rescale inputs
    # themselves, while preprocess_image already divides by 255 - confirm the
    # intended input range.
    backbone = tf.keras.applications.EfficientNetB0(input_shape=img_shape, include_top=False, weights=None)
    print("Base model created without pre-trained weights")
    image_features = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
    print("Global average pooling added")

    # Tabular branch: one regularised dense block.
    tab_input = Input(shape=(tab_shape,))
    tabular_features = Dense(
        64,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )(tab_input)
    tabular_features = BatchNormalization()(tabular_features)
    tabular_features = Dropout(0.5)(tabular_features)

    # Fusion head on the concatenated branches.
    fused = Concatenate()([image_features, tabular_features])
    fused = Dense(
        32,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )(fused)
    fused = BatchNormalization()(fused)
    fused = Dropout(0.5)(fused)
    # The output layer's dtype is set to float32 explicitly.
    output = Dense(1, activation='sigmoid', dtype='float32')(fused)

    model = Model(inputs=[backbone.input, tab_input], outputs=output)
    tracked_metrics = ['accuracy', AUC(name='auc'), Precision(name='precision'), Recall(name='recall')]
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=focal_loss(alpha=.25, gamma=2),
        metrics=tracked_metrics,
    )
    print("Model compiled")
    return model

In [15]:
def train_model(features, labels, hdf5_path, n_splits=5, epochs=30, batch_size=16, learning_rate=1e-4):
    """Train one model per stratified fold and report validation metrics.

    Args:
        features: DataFrame with ``isic_id`` plus the tabular feature columns.
        labels: Series of 0/1 labels aligned positionally with ``features``.
        hdf5_path: HDF5 file holding the images.
        n_splits: Number of stratified folds.
        epochs: Maximum number of epochs per fold.
        batch_size: Batch size for training and validation.
        learning_rate: Learning rate passed to ``create_model``.

    Returns:
        ``(fold_models, fold_scores)``: the models of the folds that finished
        and their validation AUCs. A fold that raises is reported and skipped,
        so both lists can be shorter than ``n_splits`` or empty.
    """
    import traceback

    print(f"Starting model training with {len(features)} samples")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # create_dataset removes 'isic_id' before feeding the tabular branch, so
    # the model's tabular input is one column narrower than `features`.
    tab_dim = features.shape[1] - 1

    fold_models = []
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(features, labels)):
        print(f"Processing fold {fold + 1}/{n_splits}")

        train_features = features.iloc[train_idx]
        train_labels = labels.iloc[train_idx]
        val_features = features.iloc[val_idx]
        val_labels = labels.iloc[val_idx]

        print(f"Creating train dataset for fold {fold + 1}")
        train_dataset = create_dataset(train_features, train_labels, hdf5_path, batch_size=batch_size, is_train=True)
        print(f"Creating validation dataset for fold {fold + 1}")
        # is_train=True is the only mode of create_dataset that yields labels,
        # so the validation set is built with it too; it repeats, hence the
        # explicit validation_steps below.
        val_dataset = create_dataset(val_features, val_labels, hdf5_path, batch_size=batch_size, is_train=True)

        print(f"Creating model for fold {fold + 1}")
        model = create_model((224, 224, 3), tab_dim, learning_rate=learning_rate)

        # Define callbacks
        model_checkpoint = ModelCheckpoint(
            f'best_model_fold_{fold+1}.keras',
            monitor='val_auc',
            mode='max',
            save_best_only=True,
            verbose=1
        )
        callbacks = [
            EarlyStopping(patience=10, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6),
            model_checkpoint
        ]

        # Calculate steps per epoch
        steps_per_epoch = math.ceil(len(train_features) / batch_size)
        validation_steps = math.ceil(len(val_features) / batch_size)

        print(f"Starting training for fold {fold + 1}")
        try:
            history = model.fit(
                train_dataset,
                validation_data=val_dataset,
                epochs=epochs,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps,
                callbacks=callbacks,
                verbose=1
            )
            print(f"Fold {fold + 1} training completed")

            # Evaluate model
            print(f"Evaluating model for fold {fold + 1}")
            val_loss, val_accuracy, val_auc, val_precision, val_recall = model.evaluate(val_dataset, steps=validation_steps)
            print(f"Fold {fold + 1} - Validation Loss: {val_loss:.4f}, "
                  f"Accuracy: {val_accuracy:.4f}, AUC: {val_auc:.4f}, "
                  f"Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")

            # Calculate F1-score (epsilon guards against 0/0)
            f1_score = 2 * (val_precision * val_recall) / (val_precision + val_recall + K.epsilon())
            print(f"F1-score: {f1_score:.4f}")

            fold_models.append(model)
            fold_scores.append(val_auc)

        except Exception as e:
            # Best effort: a failing fold is reported and the next one is tried.
            print(f"Error during training in fold {fold + 1}: {str(e)}")
            print(traceback.format_exc())
        finally:
            # Clear GPU memory after training
            tf.keras.backend.clear_session()
            gc.collect()

    # Print overall results
    print("\nCross-validation results:")
    for fold, score in enumerate(fold_scores):
        print(f"Fold {fold + 1}: AUC = {score:.4f}")
    if fold_scores:
        print(f"Mean AUC: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")
    else:
        # np.mean([]) would only print nan plus a RuntimeWarning
        print("No fold completed successfully; no AUC to report")

    return fold_models, fold_scores

In [16]:
def ensemble_predict(models, test_dataset, steps=None):
    """Average the predictions of several models over a batched dataset.

    The dataset is iterated once and every batch is passed to each model as an
    explicit ``[image, tabular]`` input list. ``model.predict(dataset)`` is
    avoided on purpose: Keras unpacks a dataset element that is a 2-tuple as
    ``(inputs, targets)``, so a two-input model would only receive the images.

    Note: a second ``ensemble_predict`` defined later in this notebook shadows
    this definition once its cell has run.

    Args:
        models: Non-empty sequence of models exposing ``predict_on_batch``.
        test_dataset: Iterable of batches, each ``(image_batch, tabular_batch)``
            or that pair wrapped in a 1-tuple.
        steps: Optional maximum number of batches to consume.

    Returns:
        Array of the per-sample mean prediction, batches concatenated along
        axis 0; shape ``(0, 1)`` when no batch was produced.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("ensemble_predict requires at least one model")

    batch_means = []
    for batch_index, batch in enumerate(test_dataset):
        if steps is not None and batch_index >= steps:
            break
        # Accept both (img, tab) and ((img, tab),) element layouts.
        if len(batch) == 1:
            batch = batch[0]
        inputs = list(batch)
        per_model = [np.asarray(model.predict_on_batch(inputs)) for model in models]
        # Average predictions from all models
        batch_means.append(np.mean(per_model, axis=0))

    if not batch_means:
        return np.empty((0, 1), dtype=np.float32)
    return np.concatenate(batch_means, axis=0)

In [17]:
def check_class_distribution(data):
    """Print per-class counts, percentages and the imbalance ratio of labels.

    Args:
        data: Either a Series of labels or a DataFrame with a ``target``
            column. Anything else is reported as unexpected and ignored.

    Returns:
        None; all results are printed.
    """
    print("Inside check_class_distribution function")
    print(f"Data type: {type(data)}")
    size_description = data.shape if hasattr(data, 'shape') else len(data)
    print(f"Data shape or length: {size_description}")

    # Pick the label column depending on the container type.
    if isinstance(data, pd.Series):
        target_values = data
    elif isinstance(data, pd.DataFrame) and 'target' in data.columns:
        target_values = data['target']
    else:
        print("Unexpected data format in check_class_distribution")
        return

    class_counts = target_values.value_counts()
    total = len(data)

    print("Class Distribution:")
    for class_label, count in class_counts.items():
        percentage = count / total * 100
        print(f"Class {class_label}: {count} samples ({percentage:.2f}%)")

    if len(class_counts) <= 1:
        print("\nOnly one class present, cannot calculate imbalance ratio.")
    else:
        # Ratio of the largest class to the smallest class.
        imbalance_ratio = class_counts.max() / class_counts.min()
        print(f"\nImbalance Ratio: {imbalance_ratio:.2f}")

In [18]:
def evaluate_on_test_set(model, test_gen, test_metadata):
    """Predict every batch of ``test_gen`` and write ``submission.csv``.

    Args:
        model: Object exposing ``predict(inputs, verbose=0)``.
        test_gen: Indexable, sized collection of batches; element 0 of each
            batch is a dict with ``'image_input'`` and ``'tabular_input'``.
        test_metadata: DataFrame providing the ``isic_id`` column of the
            submission.

    Returns:
        Flat array with the predictions of all successfully processed batches.

    Raises:
        ValueError: If no batch could be predicted.
    """
    import traceback

    n_batches = len(test_gen)
    batch_outputs = []

    for batch_idx in range(n_batches):
        try:
            inputs = test_gen[batch_idx][0]
            # Print batch information for debugging
            image_shape = inputs['image_input'].shape
            tabular_shape = inputs['tabular_input'].shape
            print(f"Batch {batch_idx} shapes - Image: {image_shape}, Tabular: {tabular_shape}")
            batch_preds = model.predict(inputs, verbose=0)
            batch_outputs.append(batch_preds)
            print(f"Successfully predicted batch {batch_idx} with shape {batch_preds.shape}")
        except Exception as err:
            # A failing batch is reported with its traceback and skipped.
            print(f"Error predicting batch {batch_idx}: {str(err)}")
            print(traceback.format_exc())

    print(f"Total batches processed: {n_batches}")
    print(f"Number of successful predictions: {len(batch_outputs)}")

    if len(batch_outputs) == 0:
        raise ValueError("No predictions were made successfully. Check the error messages above for more details.")

    predictions = np.concatenate(batch_outputs).flatten()
    print(f"Final predictions shape: {predictions.shape}")

    # Build the submission table and write it to the working directory.
    submission = pd.DataFrame({
        'isic_id': test_metadata['isic_id'],
        'target': predictions
    })
    submission.to_csv('submission.csv', index=False)
    print("Predictions saved to 'submission.csv'")

    return predictions

In [19]:
def ensemble_predict(models, test_dataset, steps=None):
    """Average the predictions of several models over a batched dataset.

    This re-defines (and therefore shadows) the ``ensemble_predict`` from an
    earlier cell of the notebook.

    Each batch is fed to every model as an explicit ``[image, tabular]`` list
    instead of calling ``model.predict(test_dataset)``: Keras interprets a
    2-tuple dataset element as ``(inputs, targets)``, which would hand a
    two-input model only the image tensor.

    Args:
        models: Non-empty sequence of models exposing ``predict_on_batch``.
        test_dataset: Iterable of batches, each ``(image_batch, tabular_batch)``
            or that pair wrapped in a 1-tuple.
        steps: Optional maximum number of batches to consume.

    Returns:
        Array of the per-sample mean prediction, batches concatenated along
        axis 0; shape ``(0, 1)`` when no batch was produced.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("ensemble_predict requires at least one model")

    predictions = []
    for batch_index, batch in enumerate(test_dataset):
        if steps is not None and batch_index >= steps:
            break
        # Accept both (img, tab) and ((img, tab),) element layouts.
        if len(batch) == 1:
            batch = batch[0]
        inputs = list(batch)
        model_preds = [np.asarray(model.predict_on_batch(inputs)) for model in models]
        predictions.append(np.mean(model_preds, axis=0))

    if not predictions:
        return np.empty((0, 1), dtype=np.float32)
    return np.concatenate(predictions, axis=0)

In [20]:
def predict_on_test_data(model, test_dataset, steps, timeout=3600):
    """Predict up to ``steps`` batches with one model, with a wall-clock budget.

    Args:
        model: Model exposing ``predict_on_batch``.
        test_dataset: Dataset exposing ``take(n)`` whose elements are
            ``(image_batch, tabular_batch)`` pairs.
        steps: Maximum number of batches to predict.
        timeout: Budget in seconds; it is checked before each batch and
            prediction stops once it is exceeded.

    Returns:
        Flat array of the predictions gathered so far. Errors are logged, not
        raised, so the array can be shorter than expected.
    """
    # Resolve the logger locally: a module-level `logger` only exists after
    # the main cell has run.
    log = logging.getLogger(__name__)
    predictions = []
    start_time = time.monotonic()

    try:
        for i, batch in enumerate(test_dataset.take(steps)):
            if time.monotonic() - start_time > timeout:
                log.warning(f"Prediction timed out after {timeout} seconds")
                break

            img_batch, tab_batch = batch
            # predict_on_batch avoids the per-call setup of model.predict in a
            # tight loop. The Keras session is deliberately not cleared
            # between batches: that resets global Keras state while the model
            # is still in use.
            batch_predictions = np.asarray(model.predict_on_batch([img_batch, tab_batch]))
            predictions.extend(batch_predictions.flatten())

            if i % 10 == 0:
                log.info(f"Predicted batch {i}/{steps}")

    except Exception as e:
        # Best effort: keep whatever was predicted before the failure.
        log.error(f"Error during prediction: {str(e)}")

    return np.array(predictions)


In [21]:
# End-to-end run: preprocess, balance, cross-validate, predict on the test
# split and write submission.csv. Any failure falls back to a constant 0.5
# submission so that a file is always produced.
if __name__ == "__main__":
    # Configure logging before the try block so the except handler can rely on it.
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    try:
        print("Starting data loading and preprocessing...")
        train_features, train_labels, train_isic_ids, train_hdf5_path, train_columns, train_encoders = load_and_preprocess_data(train_metadata_path, train_image_hdf5_path, is_train=True)
        print("Data loading and preprocessing completed.")

        print("Checking original class distribution...")
        check_class_distribution(train_labels)
        print("Original class distribution check completed.")

        print("Starting dataset balancing...")
        # balance_dataset takes (features, labels) and returns two values; the
        # image ids travel inside the features as the 'isic_id' column.
        balanced_features, balanced_labels = balance_dataset(train_features, train_labels)
        print("Dataset balancing completed.")

        print("Checking balanced class distribution...")
        check_class_distribution(balanced_labels)
        print("Balanced class distribution check completed.")

        # No standalone training dataset is built here: train_model creates
        # the per-fold datasets itself.
        print("Starting model training")
        fold_models, fold_scores = train_model(balanced_features, balanced_labels, train_hdf5_path, n_splits=5, epochs=30, batch_size=16, learning_rate=1e-4)
        print("Model training completed")
        if not fold_models:
            raise RuntimeError("No fold finished training; cannot build an ensemble")

        # Process test data
        test_features, test_isic_ids, test_hdf5_path = load_and_preprocess_data(
            test_metadata_path, test_image_hdf5_path,
            is_train=False, train_columns=train_columns,
            train_encoders=train_encoders
        )
        # 'target' is a label, never a model input; drop it if the column
        # alignment against the training layout introduced it.
        test_features = test_features.drop(columns=['target'], errors='ignore')

        # Create test dataset
        batch_size = 16
        test_dataset = create_dataset(test_features, None, test_hdf5_path, batch_size=batch_size, is_train=False)

        # Calculate steps for prediction
        test_steps = math.ceil(len(test_features) / batch_size)
        expected = len(test_features)

        # Make ensemble predictions: one flat prediction vector per fold model
        aligned_predictions = []
        for model in fold_models:
            model_predictions = np.asarray(
                predict_on_test_data(model, test_dataset, test_steps), dtype=np.float64
            ).ravel()

            # Handle potential issues with predictions (best effort)
            if len(model_predictions) == 0:
                logger.error("No predictions were made")
            if len(model_predictions) != expected:
                logger.warning(f"Mismatch in prediction length. Padding with zeros.")
                padding = np.zeros(max(expected - len(model_predictions), 0))
                model_predictions = np.concatenate([model_predictions[:expected], padding])
            aligned_predictions.append(model_predictions)

        predictions = np.mean(aligned_predictions, axis=0)

        # Create submission DataFrame
        submission = pd.DataFrame({
            'isic_id': test_isic_ids,
            'target': predictions
        })

        # Handle any potential NaN values
        submission['target'] = submission['target'].fillna(0.5)  # Fill NaNs with 0.5

        # Clip predictions to [0, 1] range
        submission['target'] = np.clip(submission['target'], 0, 1)

        # Save submission file
        submission.to_csv('submission.csv', index=False)
        logger.info("Predictions saved to 'submission.csv'")

    except Exception as e:
        # logger.exception also records the traceback of the failure
        logger.exception(f"An error occurred: {str(e)}")
        # Create a default submission in case of critical error
        test_data = pd.read_csv(test_metadata_path)
        default_submission = pd.DataFrame({
            'isic_id': test_data['isic_id'],
            'target': [0.5] * len(test_data)  # Default prediction of 0.5 for all samples
        })
        default_submission.to_csv('submission.csv', index=False)
        logger.info("Default submission saved due to error")

Starting data loading and preprocessing...
Train columns: Index(['isic_id', 'target', 'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A',
       'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext',
       'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2',
       'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
       'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
       'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
       'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
       'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
       'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
       'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'iddx_full', 'iddx_1', 'iddx_2',
       'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm',
       'tbp_lv_dnn_lesion_confidence', 'sex_0', 'sex_1', 'sex_2',
       'anatom_site_general_0', 'anatom_site_general_1',
 