In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score

from tqdm import tqdm

import warnings; warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

from itertools import product


import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
from pathlib import Path
DATA_PATH = Path("C:/ML_Projects/kaggle-workflow/data/")
OUTPUT_PATH = Path("C:/ML_Projects/kaggle-workflow/output/")

# General settings
SEED = 42
N_FOLDS = 5

# Metric choice placeholder
#Adjust depending on the competition
METRIC = "auc"

In [4]:
#Load Data
df = pd.read_csv(DATA_PATH / "Diabetes Prediction Challenge" / "train.csv")

In [5]:
df.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [6]:
df = df.rename(columns = ({"alcohol_consumption_per_week":"alc",'physical_activity_minutes_per_week':"activity", 'diet_score':"diet",
                          'sleep_hours_per_day': "sleep", 'screen_time_hours_per_day':"screen", 'education_level':"edu",
                           'income_level':"inc", 'smoking_status': "smoke", 'employment_status':"empl", 'family_history_diabetes': "fam_his",
                          'hypertension_history': "hyp_his", 'cardiovascular_history': "card_his", 'diagnosed_diabetes': "label"} ))

In [7]:
df.columns

Index(['id', 'age', 'alc', 'activity', 'diet', 'sleep', 'screen', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'edu', 'inc', 'smoke', 'empl',
       'fam_his', 'hyp_his', 'card_his', 'label'],
      dtype='object')

In [8]:
target_col = ['label']
use_cols = ['age', 'activity', 'diet','screen', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'edu', 'inc', 'smoke', 'empl',
       'fam_his']
X = df[use_cols].copy()
y=df[target_col].astype(int).copy()

In [9]:
cat_features =['gender', 'ethnicity', 'edu', 'inc', 'smoke', 'empl']

num_features = X.select_dtypes(include=["int64","float64"]).columns

In [10]:
X = df[num_features]

In [11]:
final_num = X.columns.tolist()
final_num

['age',
 'activity',
 'diet',
 'screen',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'fam_his']

In [12]:
X["age*activity"] = X["age"] * X["activity"]
X.head()

Unnamed: 0,age,activity,diet,screen,bmi,waist_to_hip_ratio,systolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,fam_his,age*activity
0,31,45,7.7,6.1,33.4,0.93,112,62,199,58,114,102,0,1395
1,50,73,5.7,5.8,23.8,0.83,120,71,199,50,121,124,0,3650
2,32,158,8.5,9.1,24.1,0.83,95,73,188,59,114,108,0,5056
3,54,77,4.6,9.2,26.6,0.83,121,74,182,54,85,123,0,4158
4,54,55,5.7,5.1,28.8,0.9,108,85,206,49,131,124,0,2970


In [13]:
final_cols = X.columns.tolist()
final_cols

['age',
 'activity',
 'diet',
 'screen',
 'bmi',
 'waist_to_hip_ratio',
 'systolic_bp',
 'heart_rate',
 'cholesterol_total',
 'hdl_cholesterol',
 'ldl_cholesterol',
 'triglycerides',
 'fam_his',
 'age*activity']

In [14]:
model = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.03,
        num_leaves=31,
        min_data_in_leaf = 60,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8, 
        random_state=42,
        n_jobs=-1,
        verbosity = -1
    )

In [15]:
param_grid = {
    "lgbm__min_data_in_leaf": [20, 40, 80],
    "lgbm__lambda_l1": [0.0, 0.1, 1.0],
    "lgbm__lambda_l2": [0.0, 1.0, 5.0],
    "lgbm__feature_fraction": [0.6, 0.7, 0.8],
    "lgbm__bagging_fraction": [0.6, 0.7, 0.8],
}

In [16]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_auc = 0
best_params = None

for values in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), values))

    oof = np.zeros(len(X))

    for tr_idx, va_idx in kf.split(X, y):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        model = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.03,
        num_leaves=31,
        min_data_in_leaf = 60,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8, 
        random_state=42,
        n_jobs=-1,
        verbosity = -1
    )
        model.fit(X_tr, y_tr)
        oof[va_idx] = model.predict_proba(X_va)[:, 1]

    auc = roc_auc_score(y, oof)

    if auc > best_auc:
        best_auc = auc
        best_params = params

print("Best CV AUC:", best_auc)
print("Best params:", best_params)

Best CV AUC: 0.7246356924111362
Best params: {'lgbm__min_data_in_leaf': 20, 'lgbm__lambda_l1': 0.0, 'lgbm__lambda_l2': 0.0, 'lgbm__feature_fraction': 0.6, 'lgbm__bagging_fraction': 0.6}


In [18]:
df_test = df_test.rename(columns = ({"alcohol_consumption_per_week":"alc",'physical_activity_minutes_per_week':"activity", 'diet_score':"diet",
                          'sleep_hours_per_day': "sleep", 'screen_time_hours_per_day':"screen", 'education_level':"edu",
                           'income_level':"inc", 'smoking_status': "smoke", 'employment_status':"empl", 'family_history_diabetes': "fam_his",
                          'hypertension_history': "hyp_his", 'cardiovascular_history': "card_his", 'diagnosed_diabetes': "label"} ))