In [19]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold, cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, matthews_corrcoef, cohen_kappa_score, balanced_accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import optuna
from optuna.integration import XGBoostPruningCallback, LightGBMPruningCallback, CatBoostPruningCallback
from lightgbm import early_stopping, log_evaluation

In [2]:
root = Path(r'D:\PATH\Python\4Geeks\Proyecto_Final_Cancer')

train_path = root / 'data/raw/train-metadata.csv'
test_path = root / 'data/raw/test-metadata.csv'
subm_path = root / 'data/raw/sample_submission.csv'

id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5
sampling_ratio = 0.01
seed = 42

num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+ 
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

new_num_cols = [
    'lesion_size_ratio',                 # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',                # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                      # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',                # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',           # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'border_complexity',                 # tbp_lv_norm_border      + tbp_lv_symm_2axis
    'color_uniformity',                  # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max

    'position_distance_3d',              # tbp_lv_x **2 + tbp_lv_y **2 + tbp_lv_z **2  sqrt
    'perimeter_to_area_ratio',           # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',           # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    'lesion_visibility_score',           # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    'symmetry_border_consistency',       # tbp_lv_symm_2axis       * tbp_lv_norm_border
    'consistency_symmetry_border',       # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',                 # tbp_lv_stdL             / tbp_lv_Lext
    'consistency_color',                 # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',              # clin_size_long_diam_mm  * age_approx
    'hue_color_std_interaction',         # tbp_lv_H                * tbp_lv_color_std_mean
    'lesion_severity_index',             # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    'shape_complexity_index',            # border_complexity       + lesion_shape_index
    'color_contrast_index',              # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',                   # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',            # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',               # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',                  # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'color_shape_composite_index',       # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    'lesion_orientation_3d',             # tbp_lv_y                , tbp_lv_x  np.arctan2
    'overall_color_difference',          # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3

    'symmetry_perimeter_interaction',    # tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    'comprehensive_lesion_index',        # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    'color_variance_ratio',              # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'border_color_interaction',          # tbp_lv_norm_border      * tbp_lv_norm_color
    'border_color_interaction_2',
    'size_color_contrast_ratio',         # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',    # tbp_lv_nevi_confidence  / age_approx
    'age_normalized_nevi_confidence_2',
    'color_asymmetry_index',             # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max

    'volume_approximation_3d',           # tbp_lv_areaMM2          * sqrt(tbp_lv_x**2 + tbp_lv_y**2 + tbp_lv_z**2)
    'color_range',                       # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',           # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',               # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)
    'age_size_symmetry_index',           # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',           # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']
norm_cols = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
norm_cols2 = [f'{col}_sex_norm' for col in num_cols + new_num_cols]
special_cols = ['count_per_patient']
feature_cols = num_cols + new_num_cols + cat_cols + norm_cols + norm_cols2 + special_cols

In [3]:
def read_data(path):
    df = pd.read_csv(path)
    
    df['sex'] = df['sex'].fillna('Unknown')
    df['sex'] = df['sex'].apply(lambda x: 'male' if x not in ['male', 'female'] else x)
    
    df_polars = pl.from_pandas(df)
    
    # Drop rows where 'age_approx' is missing (NaN)
    df_polars = df_polars.filter(~pl.col('age_approx').is_null())
    
    # Drop rows where 'age_approx' is missing (NaN)
    df_polars = df_polars.filter(~pl.col('anatom_site_general').is_null())
    

    df_polars = df_polars.with_columns(
        pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
    )
    df_polars = df_polars.with_columns(
        lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
        lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
        hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
        luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
        lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
        border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
        color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
    )
    df_polars = df_polars.with_columns(
        position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
        perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
        area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
        lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
        combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
        symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
        consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
    )
    df_polars = df_polars.with_columns(
        color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
        consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
        size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
        hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
        lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
        shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
        color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
    )
    df_polars = df_polars.with_columns(
        log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
        normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
        mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
        std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
        color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
        lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
        overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
    )
    df_polars = df_polars.with_columns(
        symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
        comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
        color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
        border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
        border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
        size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
        age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
        age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
        color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
    )
    df_polars = df_polars.with_columns(
        volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
        color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
        shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
        border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
        age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
        index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
    )
    df_polars = df_polars.with_columns(
        ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
    )
    df_polars = df_polars.with_columns(
        ((pl.col(col) - pl.col(col).mean().over('sex')) / (pl.col(col).std().over('sex') + err)).alias(f'{col}_sex_norm') for col in (num_cols + new_num_cols)
    )
    df_polars = df_polars.with_columns(
        count_per_patient = pl.col('isic_id').count().over('patient_id'),
    )
    df_polars = df_polars.with_columns(
        pl.col(cat_cols).cast(pl.Categorical),
    )
    
    df = df_polars.to_pandas()
    df.set_index(id_col)
    
    return df

In [4]:
def preprocess(df_train, df_test):
    global cat_cols
    
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])
    
    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    df_train[new_cat_cols] = encoder.transform(df_train[cat_cols])
    df_train[new_cat_cols] = df_train[new_cat_cols].astype(np.int32)

    df_test[new_cat_cols] = encoder.transform(df_test[cat_cols])
    df_test[new_cat_cols] = df_test[new_cat_cols].astype(np.int32)

    for col in cat_cols:
        feature_cols.remove(col)

    feature_cols.extend(new_cat_cols)
    cat_cols = new_cat_cols
    
    return df_train, df_test

In [5]:
def custom_metric(estimator, X, y_true):
    y_hat = estimator.predict_proba(X)[:, 1]
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)
    
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

### Data Read & Feature Engineering

In [6]:
df_train = read_data(train_path)
df_test = read_data(test_path)
df_subm = pd.read_csv(subm_path, index_col=id_col)

  df = pd.read_csv(path)


In [10]:
# Filter out rows where "count_per_patient" is equal to 1
df_train = df_train[df_train['count_per_patient']!=1]

In [11]:
df_train, df_test = preprocess(df_train, df_test)

In [None]:
"""
# Save data
df_train.to_csv('../data/processed/df_train_processed.csv')
df_test.to_csv('../data/processed/df_test_processed.csv')
"""

In [None]:
"""
df_train = pd.read_csv("../data/processed/df_train_processed.csv")
df_test = pd.read_csv("../data/processed/df_test_processed.csv")
"""

In [12]:
# Load data
X_train, X_test, y_train, y_test = train_test_split(df_train[feature_cols], df_train[target_col], test_size=0.3, random_state=42, stratify=df_train[target_col])


### Optuna HyperParam Tuning

In [None]:
def objective_xgb(trial):

    # Suggest hyperparameters
    params = {
        'tree_method': 'hist',
        'random_state': seed,
        'eval_metric': 'auc',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'lambda': trial.suggest_float('lambda', 1.0, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.1, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
    }

    # Create DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train the model with early stopping
    bst = xgb.train(
        params,
        dtrain,
        evals=[(dtrain, 'train'), (dtest, 'validation')],
        num_boost_round=1000,  # Set a large number for rounds
        early_stopping_rounds=50,  # Stop if no improvement in 50 rounds
        callbacks=[XGBoostPruningCallback(trial, 'validation-auc')],
    )

    # Predict and evaluate
    preds = bst.predict(dtest)
    roc_auc = roc_auc_score(y_test, preds)
    
    return roc_auc

In [None]:
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=100, show_progress_bar=True)

print(f'Best trial for XGBoost: {study_xgb.best_trial.value}')
print(f'Best hyperparameters for XGBoost: {study_xgb.best_trial.params}')

Best trial for XGBoost: 0.9546400067683803

Best hyperparameters for XGBoost: {'learning_rate': 0.012565783218797217, 'lambda': 2.3531128643047112, 'alpha': 9.986200510029958, 'max_depth': 8, 'subsample': 0.5145937607178193, 'colsample_bytree': 0.5224126651074614, 'colsample_bylevel': 0.6554410069169192, 'colsample_bynode': 0.6131241207242786, 'scale_pos_weight': 7.381848408688062}

In [None]:
def objective_lgb(trial):

    # Suggest hyperparameters
    params = {
        'objective': 'binary',  # Fixed parameter
        'verbosity': -1,  # Fixed parameter
        'n_iter': 200,  # Fixed parameter
        'boosting_type': 'gbdt',  # Fixed parameter
        'random_state': seed,  # Fixed parameter, assuming 'seed' is predefined
        'lambda_l1': trial.suggest_float('lambda_l1', 0.001, 0.1, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.001, 0.1, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),  # Wider range for better exploration
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),  # Typical range for LightGBM
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.2, 0.8),  # Adjusted to ensure variability
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),  # Typical range for bagging frequency
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),  # Adjusted range to cover more options
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 5.0),  # Range based on class imbalance
    }
    
    # Create and train the model
    model = lgb.LGBMClassifier(**params)
    
    # Train the model with early stopping
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],  # Validation set for early stopping
        eval_metric='auc',  # Evaluation metric
        callbacks=[LightGBMPruningCallback(trial, 'auc'),
                   early_stopping(stopping_rounds=50),
                   log_evaluation(100)]  # Pruning callback
    )
    
    # Predict and evaluate
    y_pred = model.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_pred)

    return roc_auc

In [None]:
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=100, show_progress_bar=True)

print(f'Best trial for LightGBM: {study_lgb.best_trial.value}')
print(f'Best hyperparameters for LightGBM: {study_lgb.best_trial.params}')

Best trial for LightGBM: 0.9593741363264615

Best hyperparameters for LightGBM: {'lambda_l1': 0.014902669085932872, 'lambda_l2': 0.03957395263238096, 'learning_rate': 0.014993253193227397, 'max_depth': 7, 'num_leaves': 34, 'colsample_bytree': 0.6657330406644436, 'colsample_bynode': 0.6558724487912035, 'bagging_fraction': 0.5630352613173868, 'bagging_freq': 5, 'min_data_in_leaf': 20, 'scale_pos_weight': 2.4365960001172486}

In [None]:
def objective_cb(trial):

# Suggest hyperparameters
    params = {
        'loss_function': 'Logloss',  # Binary classification loss function
        'iterations': 250,  # Fixed number of boosting rounds
        'eval_metric': 'AUC',
        'verbose': False,  # Suppress output
        'random_state': seed,  # Fixed random state
        'max_depth': trial.suggest_int('max_depth', 4, 10),  # Depth of the trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),  # Learning rate
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 5.0),  # Class imbalance
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0, log=True),  # L2 regularization
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Subsampling ratio
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50),  # Minimum data in a leaf
        'cat_features': cat_cols,  # Categorical features
        'early_stopping_rounds': 50,  # Early stopping
    }

    # Create and train the model
    model = cb.CatBoostClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        use_best_model=True,  # Use the best model obtained during training
        callbacks=[CatBoostPruningCallback(trial, 'AUC')]  # Pruning callback on 'AUC'
    )

    # Predict and evaluate
    y_pred = model.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_pred)

    return roc_auc

In [None]:
study_catb = optuna.create_study(direction='maximize')
study_catb.optimize(objective_cb, n_trials=100, show_progress_bar=True)

print(f'Best trial for CatBoost: {study_catb.best_trial.value}')
print(f'Best hyperparameters for CatBoost: {study_catb.best_trial.params}')

Best trial for CatBoost: 0.9545530753828365


Best hyperparameters for CatBoost: {'max_depth': 8, 'learning_rate': 0.05554948396494686, 'scale_pos_weight': 4.153119343042651, 'l2_leaf_reg': 2.9540259337379835, 'subsample': 0.5331928637659862, 'min_data_in_leaf': 50}

### Optuna HyperParam Tuned Models

In [13]:
lgb_params= {
    'lambda_l1': 0.014902669085932872, 
    'lambda_l2': 0.03957395263238096,
    'learning_rate': 0.014993253193227397, 
    'max_depth': 7, 
    'num_leaves': 34, 
    'colsample_bytree': 0.6657330406644436,
    'colsample_bynode': 0.6558724487912035, 
    'bagging_fraction': 0.5630352613173868, 
    'bagging_freq': 5, 
    'min_data_in_leaf': 20, 
    'scale_pos_weight': 2.4365960001172486,
    'objective': 'binary',  # Fixed parameter
    'verbosity': -1,  # Fixed parameter
    'boosting_type': 'gbdt',  # Fixed parameter
}

lgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

In [None]:
"""
lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           250,
    'boosting_type':    'gbdt',
    'random_state':     seed,
    'lambda_l1':        0.08758718919397321, 
    'lambda_l2':        0.0039689175176025465, 
    'learning_rate':    0.03231007103195577, 
    'max_depth':        4, 
    'num_leaves':       103, 
    'colsample_bytree': 0.8329551585827726, 
    'colsample_bynode': 0.4025961355653304, 
    'bagging_fraction': 0.7738954452473223, 
    'bagging_freq':     4, 
    'min_data_in_leaf': 85, 
    'scale_pos_weight': 2.7984184778875543,
}

lgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])
"""

In [14]:
cb_params = {
    'max_depth': 8, 
    'learning_rate': 0.05554948396494686, 
    'scale_pos_weight': 4.153119343042651, 
    'l2_leaf_reg': 2.9540259337379835, 
    'subsample': 0.5331928637659862, 
    'min_data_in_leaf': 50,
    'loss_function': 'Logloss',  # Binary classification loss function
    'iterations': 250,  # Fixed number of boosting rounds
    'eval_metric': 'AUC',
    'verbose': False,  # Suppress output
    'random_state': seed,  # Fixed random state
    'cat_features': cat_cols,
}

cb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

In [None]:
"""
cb_params = {
    'loss_function':     'Logloss',
    'iterations':        250,
    'verbose':           False,
    'random_state':      seed,
    'max_depth':         7, 
    'learning_rate':     0.06936242010150652, 
    'scale_pos_weight':  2.6149345838209532, 
    'l2_leaf_reg':       6.216113851699493, 
    'subsample':         0.6249261779711819, 
    'min_data_in_leaf':  24,
    'cat_features':      cat_cols,
}

cb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])
"""

In [15]:
xgb_params = {
    'learning_rate': 0.012565783218797217, 
    'lambda': 2.3531128643047112, 
    'alpha': 9.986200510029958, 
    'max_depth': 8, 
    'subsample': 0.5145937607178193, 
    'colsample_bytree': 0.5224126651074614, 
    'colsample_bylevel': 0.6554410069169192, 
    'colsample_bynode': 0.6131241207242786, 
    'scale_pos_weight': 7.381848408688062,
    'tree_method': 'hist',
    'random_state': seed,
    'eval_metric': 'auc',
}

xgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

In [None]:
"""
xgb_params = {
    'enable_categorical': True,
    'tree_method':        'hist',
    'random_state':       seed,
    'learning_rate':      0.08501257473292347, 
    'lambda':             8.879624125465703, 
    'alpha':              0.6779926606782505, 
    'max_depth':          6, 
    'subsample':          0.6012681388711075, 
    'colsample_bytree':   0.8437772277074493, 
    'colsample_bylevel':  0.5476090898823716, 
    'colsample_bynode':   0.9928601203635129, 
    'scale_pos_weight':   3.29440313334688,
}

xgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])
"""

In [16]:
estimator = VotingClassifier([
    ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
], voting='soft', n_jobs=3)

### Cross Validation

In [17]:
X = df_train[feature_cols]
y = df_train[target_col]
groups = df_train[group_col]
cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

val_score = cross_val_score(
    estimator=estimator, 
    X=X, y=y, 
    cv=cv, 
    groups=groups,
    scoring=custom_metric,
)

np.mean(val_score), val_score

(0.169151551814075,
 array([0.1830901 , 0.16099679, 0.16776428, 0.17404824, 0.15985835]))

In [None]:
# (0.169151551814075, array([0.1830901 , 0.16099679, 0.16776428, 0.17404824, 0.15985835]))

### Training

In [18]:
X, y = df_train[feature_cols], df_train[target_col]

estimator.fit(X, y)

In [56]:
DO_FEATURE_IMPORTANCE_MODELS = True

In [57]:
if DO_FEATURE_IMPORTANCE_MODELS:
    # LightGBM feature importance
    lgb_model = estimator.named_estimators_['lgb'].named_steps['classifier']
    lgb_feature_importance = lgb_model.booster_.feature_importance(importance_type='gain')
    lgb_feature_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': lgb_feature_importance
    }).sort_values(by='importance', ascending=False)

    # XGBoost feature importance
    xgb_model = estimator.named_estimators_['xgb'].named_steps['classifier']
    xgb_feature_importance = xgb_model.get_booster().get_score(importance_type='weight')
    xgb_feature_importance_df = pd.DataFrame({
        'feature': list(xgb_feature_importance.keys()),
        'importance': list(xgb_feature_importance.values())
    }).sort_values(by='importance', ascending=False)

    # CatBoost feature importance
    cat_model = estimator.named_estimators_['cb'].named_steps['classifier']
    cat_feature_importance = cat_model.get_feature_importance()
    cat_feature_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': cat_feature_importance
    }).sort_values(by='importance', ascending=False)

    # Optionally, combine all feature importances into one DataFrame for comparison
    all_feature_importance_df = pd.concat([
        lgb_feature_importance_df.assign(model='LightGBM'),
        xgb_feature_importance_df.assign(model='XGBoost'),
        cat_feature_importance_df.assign(model='CatBoost')
    ])


In [58]:
if DO_FEATURE_IMPORTANCE_MODELS:

    # Assuming lgb_feature_importance_df is already created and contains the feature importances
    least_important_lgb = lgb_feature_importance_df.sort_values(by='importance').head(24)

    print("Least Important Features in LightGBM:")
    print(least_important_lgb)

    # Convert the xgb_feature_importance to a DataFrame for easier manipulation
    least_important_xgb = xgb_feature_importance_df.sort_values(by = "importance").head(6)


    print("\nLeast Important Features in XGBoost:")
    print(least_important_xgb)

Least Important Features in LightGBM:
       feature  importance
233   onehot_4         0.0
246  onehot_17         0.0
240  onehot_11         0.0
244  onehot_15         0.0
269  onehot_40         0.0
238   onehot_9         0.0
267  onehot_38         0.0
266  onehot_37         0.0
229   onehot_0         0.0
264  onehot_35         0.0
239  onehot_10         0.0
262  onehot_33         0.0
249  onehot_20         0.0
259  onehot_30         0.0
250  onehot_21         0.0
251  onehot_22         0.0
252  onehot_23         0.0
253  onehot_24         0.0
234   onehot_5         0.0
245  onehot_16         0.0
256  onehot_27         0.0
248  onehot_19         0.0
255  onehot_26         0.0
243  onehot_14         0.0

Least Important Features in XGBoost:
       feature  importance
233   onehot_5         1.0
256  onehot_34         1.0
261  onehot_40         1.0
238  onehot_11         1.0
255  onehot_33         1.0
243  onehot_18         1.0


In [59]:
if DO_FEATURE_IMPORTANCE_MODELS:

    # Extract the least important feature names from both LightGBM and XGBoost
    least_important_lgb_features = least_important_lgb['feature'].tolist()
    least_important_xgb_features = least_important_xgb['feature'].tolist()

    # Find the intersection of the two lists
    common_least_important_features = list(set(least_important_lgb_features) & set(least_important_xgb_features))

    print("Common Least Important Features in Both LightGBM and XGBoost:")
    print(common_least_important_features)

Common Least Important Features in Both LightGBM and XGBoost:
['onehot_5', 'onehot_33', 'onehot_40', 'onehot_11']


### Prediction

In [None]:
df_subm['target'] = estimator.predict_proba(df_test[feature_cols])[:, 1]

df_subm.to_csv('submission.csv')
df_subm.head()

In [None]:
def evaluate_model(estimator, X, y_true):
    # Generate predictions
    y_pred = estimator.predict(X)
    y_prob = estimator.predict_proba(X)[:, 1]

    # Calculate common evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    logloss = log_loss(y_true, y_prob)

    # Calculate the custom metric (partial AUC)
    partial_auc = custom_metric(estimator, X, y_true)

    # Print or return all metrics
    results = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'Log Loss': logloss,
        'Custom Metric (Partial AUC)': partial_auc
    }
    
    return results

In [None]:
metrics = evaluate_model(estimator, X, y)
print(metrics)

In [20]:
def advanced_evaluate_model(estimator, X, y_true):
    # Generate predictions
    y_pred = estimator.predict(X)
    y_prob = estimator.predict_proba(X)[:, 1]

    # Calculate basic evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    logloss = log_loss(y_true, y_prob)

    # Advanced metrics
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)

    # Specificity
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    # Custom metric (Partial AUC)
    partial_auc = custom_metric(estimator, X, y_true)

    # Compile results
    results = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'Log Loss': logloss,
        'MCC': mcc,
        'Cohen\'s Kappa': kappa,
        'Balanced Accuracy': balanced_acc,
        'Specificity': specificity,
        'Custom Metric (Partial AUC)': partial_auc
    }
    
    return results

In [21]:
metrics = advanced_evaluate_model(estimator, X, y)
metrics

{'Accuracy': 0.9992331444643731,
 'Precision': 0.5808823529411765,
 'Recall': 0.8123393316195373,
 'F1-Score': 0.677384780278671,
 'ROC-AUC': 0.9984484630684877,
 'Log Loss': 0.024890335248618955,
 'MCC': 0.6865745747069969,
 "Cohen's Kappa": 0.6770115035038202,
 'Balanced Accuracy': 0.9058789407056559,
 'Specificity': 0.9994185497917745,
 'Custom Metric (Partial AUC)': 0.19853751098913813}

In [22]:
import joblib

# Assuming 'estimator' is your trained model
model_filename = 'estimator_model.pkl'

# Save the model to a file
joblib.dump(estimator, model_filename)

print(f"Model saved to {model_filename}")

Model saved to estimator_model.pkl


In [None]:
# Assuming you have already trained models (LightGBM, XGBoost, CatBoost)
lgb_model = estimator.named_estimators_['lgb'].named_steps['classifier']
xgb_model = estimator.named_estimators_['xgb'].named_steps['classifier']
cat_model = estimator.named_estimators_['cb'].named_steps['classifier']

# LightGBM feature importance
lgb_feature_importance = lgb_model.booster_.feature_importance(importance_type='gain')
lgb_feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': lgb_feature_importance
}).sort_values(by='importance', ascending=False)

# XGBoost feature importance
xgb_feature_importance = xgb_model.get_booster().get_score(importance_type='weight')
xgb_feature_importance_df = pd.DataFrame({
    'feature': list(xgb_feature_importance.keys()),
    'importance': list(xgb_feature_importance.values())
}).sort_values(by='importance', ascending=False)

# CatBoost feature importance
cat_feature_importance = cat_model.get_feature_importance()
cat_feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': cat_feature_importance
}).sort_values(by='importance', ascending=False)

# Combine feature importances from all models
combined_importance_df = pd.concat([
    lgb_feature_importance_df.assign(model='LightGBM'),
    xgb_feature_importance_df.assign(model='XGBoost'),
    cat_feature_importance_df.assign(model='CatBoost')
])

# Average importance across models
average_importance = combined_importance_df.groupby('feature')['importance'].mean().sort_values(ascending=False)
average_importance

In [26]:
# Assume average_importance is a DataFrame or Series with feature importances
low_importance_threshold = 0.01
N = 50

# Step 1: Filter out low-importance features
filtered_features = average_importance[average_importance > low_importance_threshold]

# Step 2: Select the top N features from the remaining ones
top_features = filtered_features.head(N).index.tolist()

# Step 3: Subset the original dataset to include only these top features
X_final = X[top_features]

In [27]:
X_final

Unnamed: 0,tbp_lv_H_patient_norm,age_normalized_nevi_confidence_2_patient_norm,tbp_lv_H,tbp_lv_H_sex_norm,lesion_visibility_score_patient_norm,clin_size_long_diam_mm,normalized_lesion_size_sex_norm,area_to_perimeter_ratio_patient_norm,clin_size_long_diam_mm_patient_norm,clin_size_long_diam_mm_sex_norm,...,tbp_lv_deltaA_patient_norm,lesion_color_difference_sex_norm,mean_hue_difference_patient_norm,tbp_lv_y_sex_norm,tbp_lv_radial_color_std_max_patient_norm,color_asymmetry_index_patient_norm,area_to_perimeter_ratio_sex_norm,perimeter_to_area_ratio_patient_norm,tbp_lv_y,area_to_perimeter_ratio
0,-0.007955,-0.437904,53.058545,-0.257539,-1.108037,3.04,-0.513721,-1.245546,-0.463758,-0.517573,...,0.221302,-0.353467,-0.264082,-1.141557,-1.340771,-1.262331,-1.190474,1.858472,613.493652,0.338730
1,-1.484982,-1.073923,39.702910,-2.642178,-1.302648,1.10,-1.389075,-1.655007,-1.843134,-1.619984,...,0.793910,-0.320098,-1.631592,1.201646,-1.334786,-1.287635,-1.463614,2.580404,1575.687000,0.274137
2,0.921925,-0.208326,59.265850,0.850771,-0.853308,3.40,-0.351284,-0.916013,-0.153907,-0.313001,...,0.348912,-0.017180,0.907287,0.949164,-1.183910,-1.091359,-1.069085,1.143492,1472.010000,0.367436
3,0.063702,-0.306571,56.414429,0.341652,-0.771365,3.22,-0.544265,0.460098,-0.296650,-0.415287,...,-0.974540,-1.372518,-0.348910,0.876534,-0.497979,-0.572848,0.079338,-0.622605,1442.185791,0.639018
4,-0.997008,-0.214058,46.946070,-1.348917,-1.548093,2.73,-0.541614,-1.542686,-0.684599,-0.693731,...,-0.292591,-0.183595,-1.356232,0.989857,-2.051605,-1.632357,-1.248697,3.191656,1488.720000,0.324961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392515,0.087913,1.189629,50.784168,-0.663627,2.974528,6.80,0.744519,2.336238,1.630003,1.619061,...,1.254227,2.069488,0.453788,0.243684,2.563961,2.504056,2.167107,-1.743126,1182.317505,1.132739
392516,0.900508,-0.326926,59.678970,0.924533,-0.240953,3.11,-0.482136,-0.370461,-0.400930,-0.477795,...,-0.427486,-0.276911,0.617937,0.757209,0.204464,1.120589,-0.456449,0.132754,1393.187000,0.512314
392517,1.328509,-0.484384,59.854275,0.923974,-0.040892,2.05,-0.929175,-0.321380,-0.868608,-1.077172,...,-0.005262,-0.188909,1.606398,0.314262,-0.310637,-0.538962,-0.755364,0.052555,1065.263672,0.459759
392518,-0.875221,-0.715443,51.905420,-0.573870,-0.851844,2.80,0.259005,-0.675730,-0.965329,-0.637663,...,2.342035,1.458617,-0.051227,-0.167897,-0.793021,-0.755709,0.082446,0.465499,877.527000,0.660975


In [None]:
# Retrain the model with the selected features
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

# Assuming you're using a pipeline or an estimator object
estimator.fit(X_train, y_train)
metrics = advanced_evaluate_model(estimator, X_test, y_test)

# Evaluate the performance with reduced feature set
print(metrics)

In [43]:
lgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('feature_selection', SelectFromModel(lgb.LGBMClassifier(**lgb_params), threshold=0.01)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

xgb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('feature_selection', SelectFromModel(xgb.XGBClassifier(**xgb_params), threshold=0.01)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

cb_model = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

In [44]:
estimator = VotingClassifier([
    ('lgb', lgb_model), ('cb', cb_model), ('xgb', xgb_model),
], voting='soft', n_jobs=3)

In [45]:
X = df_train[feature_cols]
y = df_train[target_col]
groups = df_train[group_col]
cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

val_score = cross_val_score(
    estimator=estimator, 
    X=X, y=y, 
    cv=cv, 
    groups=groups,
    scoring=custom_metric,
)

np.mean(val_score), val_score

(0.16410241759935876,
 array([0.17900464, 0.15350768, 0.16350428, 0.17077197, 0.15372352]))

In [9]:
# (0.16410241759935876, array([0.17900464, 0.15350768, 0.16350428, 0.17077197, 0.15372352]))

In [46]:
X, y = df_train[feature_cols], df_train[target_col]

estimator.fit(X, y)

In [47]:
metrics = advanced_evaluate_model(estimator, X, y)
metrics

{'Accuracy': 0.999115950595141,
 'Precision': 0.5384615384615384,
 'Recall': 0.7557840616966581,
 'F1-Score': 0.6288770053475936,
 'ROC-AUC': 0.9979538317966661,
 'Log Loss': 0.027378266693366925,
 'MCC': 0.6375198673435741,
 "Cohen's Kappa": 0.6284469458673891,
 'Balanced Accuracy': 0.8775707031016782,
 'Specificity': 0.9993573445066981,
 'Custom Metric (Partial AUC)': 0.1980916459591298}