In [None]:
#

# Prediction from resnet

In [8]:
import torch
import warnings
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import h5py
import numpy as np
import io
import logging
import os
import torch.nn as nn
import torch.nn.functional as F


logging.basicConfig(level=logging.INFO)
warnings.simplefilter("ignore")
kaggle = False

class HDF5Dataset(Dataset):
    def __init__(self, hdf5_file, transform=None):
        self.hdf5_file = hdf5_file
        self.transform = transform
        self.file = h5py.File(hdf5_file, 'r')
        self.keys = list(self.file.keys())

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        image_name = self.keys[idx]
        image_data = self.file[image_name][()]
        image = Image.open(io.BytesIO(image_data)).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, image_name

    def close(self):
        self.file.close()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


if kaggle:
    hdf5_file = '/kaggle/input/isic-2024-challenge/test-image.hdf5'
    model_saved_path = '/kaggle/input/resnet18-vsloss/resnet18_vsloss_v2.pt'
else:
    hdf5_file = 'dataset/dump/train-image.hdf5'
    model_saved_path = 'checkpoint/13 July 13:46-resnet18-VS_loss-unbalanced_datasetv2.pt'


dataset = HDF5Dataset(hdf5_file=hdf5_file, transform=transform)
test_loader = DataLoader(dataset, batch_size=64, shuffle=False)

model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model.to(device)


if os.path.exists(model_saved_path):
    model.load_state_dict(torch.load(model_saved_path, map_location=device))
    logging.info(f'Model loaded from {model_saved_path}')
else:
    logging.info('Model not found')

model.eval()
resnet_predictions, image_ids = [], []
count = 0

with torch.no_grad():
    for images, ids in test_loader:
        images = images.to(device)
        outputs = model(images)
        probabilities = F.softmax(outputs, dim=1)
        resnet_predictions.extend(probabilities[:, 1].cpu().numpy())
        image_ids.extend(ids)
        count += 1
        if count >= 10:
            break

dataset.close()

# Ensure predictions are not zero unless expected by checking and handling zero values
resnet_predictions = [max(pred, 1e-5) for pred in resnet_predictions]

INFO:root:Model loaded from checkpoint/13 July 13:46-resnet18-VS_loss-unbalanced_datasetv2.pt


In [9]:
def create_submission(test_ids, predictions, filename='submission.csv'):
    submission = pd.DataFrame({
        'isic_id': test_ids,
        'target': predictions
    })
    submission.to_csv(filename, index=False)
    print(f'Submission saved to {filename}')
    return submission

# Assuming you have the test IDs ready
create_submission(image_ids, resnet_predictions)

Submission saved to submission.csv


Unnamed: 0,isic_id,target
0,ISIC_0015670,0.000000e+00
1,ISIC_0015845,0.000000e+00
2,ISIC_0015864,3.044883e-26
3,ISIC_0015902,7.689489e-04
4,ISIC_0024200,1.401298e-45
...,...,...
635,ISIC_0088838,0.000000e+00
636,ISIC_0088843,1.359260e-43
637,ISIC_0088870,3.326699e-15
638,ISIC_0088915,0.000000e+00


# Prediction from catboost

In [5]:


############## Catboost ####################
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, roc_curve
from sklearn import preprocessing
from sklearn.impute import KNNImputer
import warnings
import joblib






# load data from csv files

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f}'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100*(start_mem - end_mem) / start_mem))
    
    return df


def load_data( Kaggle=False, debug=False):
    if Kaggle:
        train_meta = pd.read_csv('/kaggle/input/isic-2024-challenge/train-metadata.csv')
        test_meta = pd.read_csv('/kaggle/input/isic-2024-challenge/test-metadata.csv')
    else:
        train_meta = pd.read_csv('dataset/dump/train-metadata.csv')
        test_meta = pd.read_csv('dataset/dump/test-metadata.csv')
    if debug:
        train_meta = train_meta[:80*1000]
    return reduce_mem_usage(train_meta), reduce_mem_usage(test_meta)



import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer

def preprocess_and_impute_data(df, columns_to_drop):
    # Drop specified columns safely
    df = df.drop(columns=columns_to_drop, errors='ignore')
    
    # Specific handling for the 'age_approx' column using KNN
    if 'age_approx' in df.columns and df['age_approx'].isnull().any():
        print('Processing column: age_approx with KNN')
        knn_imputer = KNNImputer(n_neighbors=5)
        age_approx_values = df[['age_approx']]
        df['age_approx'] = knn_imputer.fit_transform(age_approx_values).ravel()  # Reshape to 1D array

    # Handle missing data for other numeric columns using mean imputation
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'age_approx' in numeric_cols:
        numeric_cols.remove('age_approx')
    for col in numeric_cols:
        if df[col].isnull().any():
            print(f'Processing column: {col} with mean imputation')
            mean_imputer = SimpleImputer(strategy='mean')
            df[col] = mean_imputer.fit_transform(df[[col]]).ravel()  # Reshape to 1D array
    
    # Handle missing data for categorical columns using median imputation
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        if df[col].isnull().any():
            print(f'Processing column: {col} with median imputation')
            median_imputer = SimpleImputer(strategy='most_frequent')
            df[col] = median_imputer.fit_transform(df[[col]]).ravel()  # Reshape to 1D array

    return df



def feature_engineering(df):
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2) 
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4
    return df

import pandas as pd
from sklearn import preprocessing



from sklearn.metrics import roc_auc_score
def calculate_pauc(y_true, y_scores, tpr_threshold=0.8):
    # Calculate pAUC using sklearn's roc_auc_score with max_fpr
    partial_auc_scaled = roc_auc_score(y_true, y_scores, max_fpr=tpr_threshold)

    # Scale from [0.5, 1.0] to [0.0, 0.2]
    partial_auc = (partial_auc_scaled - 0.5) * 0.4
    return partial_auc



def train_model(train_df, categorical_columns):
    # Assuming 'target' is the column name for your target variable
    X = train_df.drop(columns=['target'])
    y = train_df['target']
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    pauc_scores = []
    models = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            depth=6,
            cat_features=categorical_columns,
            eval_metric='AUC',
            random_seed=42,
            verbose=100,
            early_stopping_rounds=100
        )

        model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)
        
        test_pred = model.predict_proba(X_test)[:, 1]
        
        models.append(model)

        pauc = calculate_pauc(y_test, test_pred)
        pauc_scores.append(pauc)

    print(f'Average pAUC score: {np.mean(pauc_scores):.4f}')
    
    return models

def predict(models, test_meta_df_le):
    submit_score = []
    for model in models:
        pred_ = model.predict_proba(test_meta_df_le)[:, 1]
        submit_score.append(pred_)
    submit_pred = np.mean(submit_score, axis=0)
    return submit_pred

def create_submission(test_id, submit_pred, filename='submission.csv'):
    submission = pd.DataFrame({
        'isic_id': test_id,
        'target': submit_pred
    })
    submission.to_csv(filename, index=False)
    return submission








import joblib
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def fit_and_save_encoders(train_df, categorical_columns, file_path='encoders.joblib'):
    """
    Fits LabelEncoders for each categorical column in train_df and saves the encoders.
    
    :param train_df: DataFrame containing the training data.
    :param categorical_columns: List of column names that are categorical.
    :param file_path: File path to save the encoders.
    :return: DataFrame with encoded categorical columns.
    """
    encoders = {}
    for column in categorical_columns:
        le = LabelEncoder()
        # Fit the encoder and transform the data
        train_df[column] = le.fit_transform(train_df[column].astype(str))
        # Save the encoder in a dictionary
        encoders[column] = le
    # Save all encoders to disk
    joblib.dump(encoders, file_path)
    return train_df



def load_and_apply_encoders(test_df, categorical_columns, file_path='encoders.joblib'):
    """
    Loads encoders and applies them to the test_df, handling unseen categories.
    
    :param test_df: DataFrame containing the testing data.
    :param categorical_columns: List of column names that are categorical.
    :param file_path: File path where the encoders are saved.
    :return: DataFrame with encoded categorical columns.
    """
    # Load the saved encoders
    encoders = joblib.load(file_path)
    for column in categorical_columns:
        le = encoders[column]
        # Handle unseen categories by using 'transform' method and custom handling for unknown categories
        test_df[column] = test_df[column].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)
    return test_df









train_df, test_df = load_data(Kaggle=False, debug=False)

test_ids = test_df['isic_id']


# Drop columns that are not needed and handle missing data
missin_in_test = ['iddx_3', 'iddx_full', 'iddx_2', 'mel_mitotic_index', 
                  'iddx_1', 'lesion_id', 'tbp_lv_dnn_lesion_confidence', 
                  'iddx_5', 'iddx_4', 'mel_thick_mm']
columns_to_drop = missin_in_test + ['isic_id', 'patient_id', 'sex', 'anatom_site_general', 
                    'image_type', 'tbp_tile_type', 'attribution', 'copyright_license']
test_df = preprocess_and_impute_data(test_df, columns_to_drop)
train_df = preprocess_and_impute_data(train_df, columns_to_drop)


# Label encode categorical columns
categorical_cols = [ 'tbp_lv_location', 'tbp_lv_location_simple']
# Fit encoders to the training data and save them
train_df = fit_and_save_encoders(train_df, categorical_cols)

# Load the encoders and apply them to the testing data
test_df = load_and_apply_encoders(test_df, categorical_cols)




# Feature engineering
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)



# Train the model and make predictions
models = train_model(train_df, categorical_columns=categorical_cols)
catboost_predictions = predict(models, test_df)


Memory usage of dataframe is 168.29 MB
Memory usage after optimization is: 109.77
Decreased by 34.8%
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00
Decreased by 35.2%
Processing column: age_approx with KNN
0:	test: 0.7846844	best: 0.7846844 (0)	total: 145ms	remaining: 2m 25s
100:	test: 0.9473169	best: 0.9476601 (86)	total: 7.49s	remaining: 1m 6s
200:	test: 0.9517468	best: 0.9524908 (162)	total: 14.9s	remaining: 59.3s
300:	test: 0.9537625	best: 0.9540731 (268)	total: 22.6s	remaining: 52.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9540730814
bestIteration = 268

Shrink model to first 269 iterations.
0:	test: 0.7051942	best: 0.7051942 (0)	total: 79.5ms	remaining: 1m 19s
100:	test: 0.9400225	best: 0.9401430 (98)	total: 7.13s	remaining: 1m 3s
200:	test: 0.9467441	best: 0.9469202 (184)	total: 13.9s	remaining: 55.2s
300:	test: 0.9463501	best: 0.9473208 (250)	total: 20.7s	remaining: 48s
Stopped by overfitting detector  (100 iterations 

In [6]:
# Generate submission file
def create_submission(test_ids, predictions, filename='submission.csv'):
    submission = pd.DataFrame({
        'isic_id': test_ids,
        'target': predictions
    })
    submission.to_csv(filename, index=False)
    print(f'Submission saved to {filename}')
    return submission

# Assuming you have the test IDs ready
create_submission(test_ids, catboost_predictions)

Submission saved to submission.csv


Unnamed: 0,isic_id,target
0,ISIC_0015657,0.000205
1,ISIC_0015729,2.1e-05
2,ISIC_0015740,0.000179


In [3]:
############# Ensemble ####################
ensemble_predictions = (np.array(resnet_predictions) + np.array(catboost_predictions)) / 2  


# Generate submission file
def create_submission(test_ids, predictions, filename='submission.csv'):
    submission = pd.DataFrame({
        'isic_id': test_ids,
        'target': predictions
    })
    submission.to_csv(filename, index=False)
    print(f'Submission saved to {filename}')
    return submission

# Assuming you have the test IDs ready
create_submission(test_ids, ensemble_predictions)

Submission saved to submission.csv


Unnamed: 0,isic_id,target
0,ISIC_0015657,0.352896
1,ISIC_0015729,6e-06
2,ISIC_0015740,0.046168
