In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score

from tqdm import tqdm

import warnings; warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
import random
import os

def seed_everything(seed: int = 42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Returns:
        None.
    """
    # NOTE: PYTHONHASHSEED is read by the interpreter at start-up, so writing
    # it here only influences processes launched after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [3]:
from pathlib import Path

# Project directories.
# The defaults are the original local Windows locations. Set the environment
# variables KAGGLE_DATA_PATH / KAGGLE_OUTPUT_PATH to run the notebook on another
# machine without editing this cell (`os` is imported in the seeding cell above).
DATA_PATH = Path(os.environ.get("KAGGLE_DATA_PATH", "C:/ML_Projects/kaggle-workflow/data/"))
OUTPUT_PATH = Path(os.environ.get("KAGGLE_OUTPUT_PATH", "C:/ML_Projects/kaggle-workflow/output/"))

# General settings
SEED = 42      # shared random seed
N_FOLDS = 5    # number of cross-validation folds

# Metric choice placeholder
# Adjust depending on the competition
METRIC = "auc"

In [4]:
# Read the competition's training CSV from the configured data directory.
df = pd.read_csv(DATA_PATH.joinpath("Diabetes Prediction Challenge", "train.csv"))

In [5]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [6]:
# Replace the long raw column names with short aliases; the target column
# 'diagnosed_diabetes' becomes 'label'. Columns not listed keep their names.
df = df.rename(
    columns=dict(
        alcohol_consumption_per_week="alc",
        physical_activity_minutes_per_week="activity",
        diet_score="diet",
        sleep_hours_per_day="sleep",
        screen_time_hours_per_day="screen",
        education_level="edu",
        income_level="inc",
        smoking_status="smoke",
        employment_status="empl",
        family_history_diabetes="fam_his",
        hypertension_history="hyp_his",
        cardiovascular_history="card_his",
        diagnosed_diabetes="label",
    )
)

In [7]:
# Display the column labels again to confirm the short aliases are in place.
df.columns

Index(['id', 'age', 'alc', 'activity', 'diet', 'sleep', 'screen', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'edu', 'inc', 'smoke', 'empl',
       'fam_his', 'hyp_his', 'card_his', 'label'],
      dtype='object')

In [8]:
# Name of the target column, kept as a one-element list.
target_col = ['label']

# Feature columns: every column of df except 'id' and the target.
use_cols = [
    'age', 'alc', 'activity', 'diet',
    'sleep', 'screen', 'bmi', 'waist_to_hip_ratio',
    'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total',
    'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
    'gender', 'ethnicity', 'edu', 'inc', 'smoke', 'empl',
    'fam_his', 'hyp_his', 'card_his',
]

# Independent copies so later edits to X / y do not touch df.
X = df.loc[:, use_cols].copy()
# Selecting with a list yields a one-column DataFrame (not a Series), cast to int.
y = df.loc[:, target_col].astype(int).copy()

In [9]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'gender', 'ethnicity', 'edu',
    'inc', 'smoke', 'empl',
]

# Labels of the columns of X whose dtype is int64 or float64.
num_features = (
    X
    .select_dtypes(include=["int64", "float64"])
    .columns
)

In [10]:
# Keep the feature column order of X as a plain Python list and display it.
test_cols = list(X.columns)
test_cols

['age',
 'alc',
 'activity',
 'diet',
 'sleep',
 'screen',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'diastolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'gender',
 'ethnicity',
 'edu',
 'inc',
 'smoke',
 'empl',
 'fam_his',
 'hyp_his',
 'card_his']

In [11]:
%%time
param_grid = {
    "iterations": [800, 1000, 1200],
    "depth": [5, 6],
    "learning_rate": [0.03, 0.05],
    "l2_leaf_reg": [3, 5, 7],
    "min_data_in_leaf": [50, 100, 200],
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for iterations in param_grid["iterations"]:
    for depth in param_grid["depth"]:
        for lr in param_grid["learning_rate"]:
            for l2 in param_grid["l2_leaf_reg"]:
                for min_leaf in param_grid["min_data_in_leaf"]:

                    oof = np.zeros(len(X))

                    for tr_idx, va_idx in kf.split(X, y):
                        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
                        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

                        model = CatBoostClassifier(
                            task_type = "GPU",
                            iterations=iterations,
                            depth=depth,
                            learning_rate=lr,
                            l2_leaf_reg=l2,
                            min_data_in_leaf=min_leaf,
                            loss_function="Logloss",
                            eval_metric="AUC",
                            random_seed=42,
                            verbose=False
                        )

                        model.fit(X_tr, y_tr, cat_features = cat_features)
                        oof[va_idx] = model.predict_proba(X_va)[:, 1]

                    cv_auc = roc_auc_score(y, oof)

                    results.append({
                        "iterations": iterations,
                        "depth": depth,
                        "learning_rate": lr,
                        "l2_leaf_reg": l2,
                        "min_data_in_leaf": min_leaf,
                        "cv_auc": cv_auc
                    })

results_df = pd.DataFrame(results).sort_values("cv_auc", ascending=False)
results_df.head(10)

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric perio

CPU times: total: 15h 44min 39s
Wall time: 13h 44min 24s


Unnamed: 0,iterations,depth,learning_rate,l2_leaf_reg,min_data_in_leaf,cv_auc
100,1200,6,0.05,3,100,0.725076
102,1200,6,0.05,5,50,0.725073
103,1200,6,0.05,5,100,0.72506
106,1200,6,0.05,7,100,0.725046
101,1200,6,0.05,3,200,0.725046
107,1200,6,0.05,7,200,0.725043
104,1200,6,0.05,5,200,0.725013
99,1200,6,0.05,3,50,0.725009
105,1200,6,0.05,7,50,0.724992
64,1000,6,0.05,3,100,0.724577


In [12]:
# Sanity check: one row per grid combination (3 * 2 * 2 * 3 * 3 = 108)
# and six columns (five parameters plus cv_auc).
results_df.shape

(108, 6)