In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from xgboost import XGBClassifier
from sklearn.base import clone
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce

In [3]:
# Read the raw competition metadata; the untouched *_or frames are kept so the
# image ids are still available when the submission is assembled.
train_meta_or = pd.read_csv(
    '/kaggle/input/isic-2024-challenge/train-metadata.csv',
    low_memory=False,
)
test_meta_or = pd.read_csv(
    '/kaggle/input/isic-2024-challenge/test-metadata.csv',
    low_memory=False,
)

# The image identifier is dropped from both working frames.
train_meta = train_meta_or.drop(columns='isic_id')
test_meta = test_meta_or.drop(columns='isic_id')

# Label column.
y_train_df = train_meta['target']

# Columns present in train but missing from test (this includes the target)
# are removed so the training features line up with the test columns.
diff_train_test = set(train_meta.columns).difference(test_meta.columns)
x_train_df = train_meta.drop(columns=list(diff_train_test))

In [4]:
# Columns that are passed to the ordinal encoder and replaced by integer codes.
cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple", 'patient_id',
            'anatom_site_general', 'copyright_license', 'attribution', 'image_type']

encoder = ce.OrdinalEncoder(cols=cat_cols)

# Always fit from the raw `train_meta` values (restricted to the feature
# columns) instead of from `x_train_df` itself. Feeding the cell its own
# output made it non-idempotent: on a re-run the encoder would be fitted on
# already-encoded integers, and the raw strings in `test_meta` would then no
# longer match the learned mapping.
x_train_df = encoder.fit_transform(train_meta[x_train_df.columns])
x_test_df = encoder.transform(test_meta)

In [5]:
def calc_auc(solution, submission, min_tpr: float=0.80):
    '''
    Compute the partial area under the ROC curve above a minimum TPR.

    The labels and the scores are both flipped (label -> |label - 1|,
    score -> 1 - score), which turns "TPR >= min_tpr" on the original problem
    into "FPR <= 1 - min_tpr" on the flipped one. That region is what
    ``roc_auc_score(..., max_fpr=...)`` can restrict itself to.

    solution   : ground-truth binary labels (0/1); pandas Series/DataFrame or
                 any array-like accepted by ``np.asarray``.
    submission : predicted scores for the positive class, same length as
                 ``solution``; pandas Series/DataFrame or array-like.
    min_tpr    : lower TPR bound of the region that is integrated.

    Returns the un-normalised partial AUC as a float; its maximum possible
    value is ``1 - min_tpr`` (0.2 for the default).
    '''
    # np.asarray yields the same data as `.values` for pandas objects while
    # also accepting plain arrays/lists; the arithmetic is vectorised instead
    # of looping over every prediction in Python.
    v_gt = np.abs(np.asarray(solution) - 1)
    v_pred = 1.0 - np.asarray(submission)
    max_fpr = abs(1 - min_tpr)

    # With max_fpr set, roc_auc_score returns the standardised partial AUC.
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)

    # Undo that standardisation to recover the raw area under the curve over
    # FPR in [0, max_fpr]: 0.5 * max_fpr**2 is the area of a random classifier
    # in that range and max_fpr the area of a perfect one.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

In [6]:
def train_model(main_model,X,y,X_test,n_splits):
    """
    Cross-validate a classifier with stratified K-fold and predict the test set.

    For every fold an independent, unfitted clone of ``main_model`` is fitted
    on the training part, scored with ``calc_auc`` on both the training and the
    validation part, and used to predict ``X_test``.

    Previously the very same estimator object was refitted in every fold, so
    the returned ``models`` list contained ``n_splits`` references to one
    model (the last fit). Cloning gives one distinct fitted model per fold.
    As a consequence ``main_model`` itself is only used as a template and is
    no longer fitted in place; use the returned estimators instead.

    Parameters
    ----------
    main_model : estimator
        scikit-learn compatible classifier exposing ``fit`` and
        ``predict_proba``.
    X : pd.DataFrame
        Training features (rows are selected with ``.iloc``).
    y : pd.Series
        Binary training labels aligned with ``X``.
    X_test : pd.DataFrame
        Features to predict with each fold's model.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    model : estimator
        The fitted model of the last fold.
    test_predictions : list of np.ndarray
        One array of positive-class probabilities for ``X_test`` per fold.
    models : list of estimator
        The fitted model of every fold, in fold order.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    train_scores = []
    val_scores = []
    test_predictions = []
    models = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):

        x_train, x_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fresh estimator with identical hyper-parameters for this fold.
        model = clone(main_model)
        model.fit(x_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        y_train_pred_df = pd.DataFrame(model.predict_proba(x_train)[:, 1], columns=['pred'])
        train_pauc = calc_auc(y_train, y_train_pred_df, min_tpr=0.8)
        train_scores.append(train_pauc)

        y_val_pred_df = pd.DataFrame(model.predict_proba(x_val)[:, 1], columns=['pred'])
        val_pauc = calc_auc(y_val, y_val_pred_df, min_tpr=0.8)
        val_scores.append(val_pauc)

        test_predictions.append(model.predict_proba(X_test)[:, 1])
        models.append(model)

        print(f"Fold {fold}: Train pAUC = {train_pauc:.4f}, Validation pAUC = {val_pauc:.4f}")

    mean_train_pauc = np.mean(train_scores)
    mean_val_pauc = np.mean(val_scores)

    print(f"\nMean Train pAUC: {mean_train_pauc:.4f}")
    print(f"Mean Validation pAUC: {mean_val_pauc:.4f}")

    return model,test_predictions,models

    

In [7]:
# params = {
#     'objective': 'binary:logistic', 
#     'max_depth': 4 ,
#     'learning_rate': 0.09,
#     'n_estimators': 1400,
#     'device' : "cuda"  # Use GPU
#               }

# Hyper-parameters handed to the XGBoost classifier.
# Not applied (kept for reference only):
#   colsample_bytree = 0.11756728710020253
#   subsample        = 0.9589462514195692
params = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'learning_rate': 0.009393224320850784,
    'n_estimators': 1227,
    'lambda': 0.34216652262461505,
    'alpha': 1.150597512455824e-07,
    'device': "cuda",
}

# Fixed seed so repeated runs build the same model.
xgb_Model = XGBClassifier(random_state=42, **params)


In [8]:
# Number of cross-validation folds; also the number of test prediction arrays
# that come back from train_model.
k_folds = 5

trained_xgb, test_preds, all_models = train_model(
    main_model=xgb_Model,
    X=x_train_df,
    y=y_train_df,
    X_test=x_test_df,
    n_splits=k_folds,
)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fold 1: Train pAUC = 0.1938, Validation pAUC = 0.1680
Fold 2: Train pAUC = 0.1919, Validation pAUC = 0.1557
Fold 3: Train pAUC = 0.1911, Validation pAUC = 0.1592
Fold 4: Train pAUC = 0.1912, Validation pAUC = 0.1526
Fold 5: Train pAUC = 0.1926, Validation pAUC = 0.1731

Mean Train pAUC: 0.1921
Mean Validation pAUC: 0.1617


In [9]:
# xgb_Model.load_model('/kaggle/working/model_xgboost.json')

In [10]:
# # Assume `model` is your trained XGBoost model
# model_path = '/kaggle/working/model_xgboost.json'  # You can choose a different path or filename

# # Save the model
# trained_xgb.save_model(model_path)

In [11]:
# test_ouputs = xgb_Model.predict_proba(x_test_df)[:, 1]

In [12]:
# test_preds[i]

In [13]:
# test_outputs = np.zeros(len(x_test_df))

# for i in range(k_folds):
    
#     test_outputs[i] = test_preds[i].sum()

In [14]:
# One accumulator slot per test row, starting at zero.
test_outputs = np.zeros(x_test_df.shape[0])

In [15]:
# Average the per-fold test probabilities in one assignment. The previous
# `+=` / `/=` sequence mutated `test_outputs` in place, so running this cell a
# second time (without re-running the zero-initialisation) silently produced
# wrong averages; this form gives the same result on every run.
assert len(test_preds) == k_folds, "expected one test prediction array per fold"
test_outputs = np.mean(test_preds, axis=0, dtype=np.float64)

In [16]:
# Submission frame: the image ids from the untouched test metadata next to the
# fold-averaged probability.
df_sample_submission = test_meta_or[['isic_id']].assign(target=test_outputs)

In [17]:
# Preview only the first rows instead of rendering the whole submission frame.
df_sample_submission.head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.000197
1,ISIC_0015729,7.2e-05
2,ISIC_0015740,0.000297


In [18]:
# Write the submission file without the DataFrame index column.
df_sample_submission.to_csv(
    path_or_buf='/kaggle/working/submission.csv',
    index=False,
)