# Preparation

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


In [14]:
# Import primary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import secondary libraries
import catboost
import colorama
import lightgbm
import scipy
import sklearn
import xgboost

# Import tertiary libraries
from catboost import CatBoostClassifier
from colorama import Fore, Style
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.stats import chi2_contingency
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

# Generate warnings
import warnings
warnings.filterwarnings("ignore")

# Early analysis

In [4]:
# Load the dataset
train_df = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
sub = pd.read_csv('/kaggle/input/playground-series-s5e12/sample_submission.csv')

In [5]:
# Print the shape of the dataframe, including number of rows and columns
print(Fore.BLUE + "train_df shape: " + Style.RESET_ALL)
print(f"{train_df.shape}\n")

# Print the information about the dataframe, including column names, data types and non-null values
print(Fore.GREEN + "train_df info: " + Style.RESET_ALL)
print(f"{train_df.info}\n")

# Print the available missing values (NaN) in each column
print(Fore.YELLOW + "train_df isnull sum: " + Style.RESET_ALL)
print(f"{train_df.isnull().sum()}\n")

# Print the summary statistics for numerical columns (count, mean, standard, minimum, maximum, etc.)
print(Fore.RED + "train_df describe: " + Style.RESET_ALL)
print(f"{train_df.describe}\n")

[34mtrain_df shape: [0m
(700000, 26)

[32mtrain_df info: [0m
<bound method DataFrame.info of             id  age  alcohol_consumption_per_week  \
0            0   31                             1   
1            1   50                             2   
2            2   32                             3   
3            3   54                             3   
4            4   54                             1   
...        ...  ...                           ...   
699995  699995   29                             1   
699996  699996   46                             2   
699997  699997   35                             1   
699998  699998   49                             2   
699999  699999   42                             4   

        physical_activity_minutes_per_week  diet_score  sleep_hours_per_day  \
0                                       45         7.7                  6.8   
1                                       73         5.7                  6.5   
2                            

In [8]:
# Define numerical and categorical features
numerical_col = train_df.select_dtypes(include = ["int64", "float64"]).columns
categorical_col = train_df.select_dtypes(include = "object").columns

# Display the summary
print(f"Features available: {len(numerical_col)} numerical features {numerical_col}")
print("-"*100)
print(f"Features available: {len(categorical_col)} categorical features {categorical_col}")

Features available: 20 numerical features Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Features available: 6 categorical features Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status'],
      dtype='object')


In [9]:
# Define chi2_test function
chi2_test = []

for feature in categorical_col:
    if chi2_contingency(pd.crosstab(train_df['diagnosed_diabetes'], train_df[feature]))[1] < 0.05:
        chi2_test.append('Reject Null Hypotesis')
    else:
        chi2_test.append('Fail to Reject Null Hypotesis')

# Create a DataFrame to store the chi-squared test results
result = pd.DataFrame(data=[categorical_col, chi2_test]).T
result.columns = ['Column', 'Hypothesis Result']
result

Unnamed: 0,Column,Hypothesis Result
0,gender,Reject Null Hypotesis
1,ethnicity,Reject Null Hypotesis
2,education_level,Reject Null Hypotesis
3,income_level,Reject Null Hypotesis
4,smoking_status,Fail to Reject Null Hypotesis
5,employment_status,Reject Null Hypotesis


# Preprocessing

In [10]:
# Define preprocessing parameters
TARGET = "diagnosed_diabetes"
X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]

test_ids = test_df["id"]

cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

X[cat_cols] = enc.fit_transform(X[cat_cols])
test_df[cat_cols] = enc.fit_transform(test_df[cat_cols])

# Stratified K-Fold

In [12]:
# Perform Stratified K-Fold
KF = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

oof_lgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
oof_xgb = np.zeros(len(X))

pred_lgb = np.zeros(len(test_df))
pred_cat = np.zeros(len(test_df))
pred_xgb = np.zeros(len(test_df))

# Model training

In [15]:
# Train the model
for fold, (trn_idx, val_idx) in enumerate(KF.split(X, y)):
    print(f"\n===== FOLD {fold+1} / 5 =====")

    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # LightGBM
    lgb = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.02,
        num_leaves=64,
        colsample_bytree=0.8,
        subsample=0.8,
        random_state=42,
        class_weight="balanced"
    )
    lgb.fit(X_train, y_train)
    oof_lgb[val_idx] = lgb.predict_proba(X_valid)[:, 1]
    pred_lgb += lgb.predict_proba(test_df)[:, 1] / KF.n_splits

    # CatBoost
    cat = CatBoostClassifier(
        iterations=2000,
        depth=6,
        learning_rate=0.03,
        l2_leaf_reg=6,
        loss_function="Logloss",
        eval_metric="AUC",
        random_seed=42,
        verbose=False
    )
    cat.fit(X_train, y_train)
    oof_cat[val_idx] = cat.predict_proba(X_valid)[:, 1]
    pred_cat += cat.predict_proba(test_df)[:, 1] / KF.n_splits

    # XGBoost
    xgb = XGBClassifier(
        n_estimators=2000,
        learning_rate=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="auc",
        random_state=42,
        tree_method="hist"
    )
    xgb.fit(X_train, y_train)
    oof_xgb[val_idx] = xgb.predict_proba(X_valid)[:, 1]
    pred_xgb += xgb.predict_proba(test_df)[:, 1] / KF.n_splits


===== FOLD 1 / 5 =====
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1895
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

===== FOLD 2 / 5 =====
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1896
[LightGBM] [Info] Number of data points in the train set: 

# Model blending

In [16]:
# Merge the model statistics
oof_blend = 0.4 * oof_lgb + 0.35 * oof_cat + 0.25 * oof_xgb
pred_blend = 0.4 * pred_lgb + 0.35 * pred_cat + 0.25 * pred_xgb

print("\nLightGBM ROC: ", roc_auc_score(y, oof_lgb))
print("CatBoost ROC: ", roc_auc_score(y, oof_cat))
print("XGBoost ROC: ", roc_auc_score(y, oof_xgb))
print("Blended ROC: ", roc_auc_score(y, oof_blend))


LightGBM ROC:  0.7271809204666989
CatBoost ROC:  0.7257946453543997
XGBoost ROC:  0.7264871649112523
Blended ROC:  0.7273518723715541


# Stacked ROC

In [17]:
# Stack the ROC values
stack_train = np.vstack([oof_lgb, oof_cat, oof_xgb]).T
stack_test = np.vstack([pred_lgb, pred_cat, pred_xgb]).T

lvl2 = LogisticRegression(max_iter=2000)
lvl2.fit(stack_train, y)

pred_final = lvl2.predict_proba(stack_test)[:, 1]

print("\nFinal stacked ROC: ", roc_auc_score(y, lvl2.predict_proba(stack_train)[:, 1]))


Final stacked ROC:  0.7274491698926522


# Submission

In [18]:
# Create a submission file
submission = pd.DataFrame({
    "id": test_ids,
    "diagnosed_diabetes": pred_final
})

submission.to_csv("submission.csv", index=False)
print("Successfully saved as CSV file")

Successfully saved as CSV file
