In [340]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold


In [341]:
train_data = pd.read_csv('data/train.csv')
train_data.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [342]:
features = train_data.drop('diagnosed_diabetes', axis=1)
labels = train_data['diagnosed_diabetes'].astype(int)

features.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,102,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,124,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,108,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,123,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,124,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0


In [343]:
DROP_COLUMNS = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
features = features.drop(columns=DROP_COLUMNS, axis=1)
features.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,85,206,49,131,124,0,1,0


In [344]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 4000,
    "learning_rate": 0.1,
    "depth": 6,
    "l2_leaf_reg": 6,
    "random_strength": 1.0,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.8,
    "min_data_in_leaf": 50,
    "random_seed": 42,
    "verbose": 200,
    "task_type": "GPU"   # TURN GPU ON
}
test_data = pd.read_csv('data/test.csv').drop(DROP_COLUMNS, axis=1)
test_data.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,64,209,55,135,111,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,59,159,47,83,145,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,75,173,43,99,184,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,61,203,59,116,128,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,79,177,59,87,133,0,0,0


In [345]:
oof = np.zeros(len(features))
test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        early_stopping_rounds=100,
        use_best_model=True
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits
    


Fold 1
0:	test: 0.6754684	best: 0.6754684 (0)	total: 4.71ms	remaining: 18.8s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.7200760	best: 0.7200760 (200)	total: 743ms	remaining: 14s
400:	test: 0.7238455	best: 0.7238455 (400)	total: 1.51s	remaining: 13.6s
600:	test: 0.7250383	best: 0.7250383 (600)	total: 2.23s	remaining: 12.6s
800:	test: 0.7256433	best: 0.7256507 (799)	total: 2.96s	remaining: 11.8s
1000:	test: 0.7257885	best: 0.7258230 (992)	total: 3.68s	remaining: 11s
bestTest = 0.7258230448
bestIteration = 992
Shrink model to first 993 iterations.
Fold 2


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6745180	best: 0.6745180 (0)	total: 4.26ms	remaining: 17s
200:	test: 0.7184887	best: 0.7184887 (200)	total: 734ms	remaining: 13.9s
400:	test: 0.7220479	best: 0.7220479 (400)	total: 1.46s	remaining: 13.1s
600:	test: 0.7232659	best: 0.7232659 (600)	total: 2.18s	remaining: 12.3s
800:	test: 0.7236562	best: 0.7236562 (800)	total: 2.9s	remaining: 11.6s
1000:	test: 0.7240716	best: 0.7240880 (973)	total: 3.62s	remaining: 10.9s
1200:	test: 0.7242188	best: 0.7242188 (1200)	total: 4.35s	remaining: 10.1s
bestTest = 0.7242490649
bestIteration = 1229
Shrink model to first 1230 iterations.
Fold 3


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6803592	best: 0.6803592 (0)	total: 4.24ms	remaining: 17s
200:	test: 0.7187228	best: 0.7187228 (200)	total: 743ms	remaining: 14s
400:	test: 0.7226780	best: 0.7226780 (400)	total: 1.49s	remaining: 13.4s
600:	test: 0.7239983	best: 0.7239983 (600)	total: 2.24s	remaining: 12.7s
800:	test: 0.7245626	best: 0.7245651 (799)	total: 3s	remaining: 12s
1000:	test: 0.7249070	best: 0.7249177 (995)	total: 3.75s	remaining: 11.2s
1200:	test: 0.7251948	best: 0.7251948 (1200)	total: 4.5s	remaining: 10.5s
1400:	test: 0.7253011	best: 0.7253015 (1397)	total: 5.24s	remaining: 9.72s
1600:	test: 0.7253801	best: 0.7254019 (1587)	total: 5.98s	remaining: 8.96s
bestTest = 0.7254018784
bestIteration = 1587
Shrink model to first 1588 iterations.
Fold 4


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6759519	best: 0.6759519 (0)	total: 4.27ms	remaining: 17.1s
200:	test: 0.7204292	best: 0.7204292 (200)	total: 756ms	remaining: 14.3s
400:	test: 0.7244364	best: 0.7244364 (400)	total: 1.49s	remaining: 13.4s
600:	test: 0.7256759	best: 0.7256819 (599)	total: 2.25s	remaining: 12.7s
800:	test: 0.7261088	best: 0.7261114 (773)	total: 2.98s	remaining: 11.9s
1000:	test: 0.7264000	best: 0.7264037 (968)	total: 3.73s	remaining: 11.2s
1200:	test: 0.7265816	best: 0.7265993 (1171)	total: 4.48s	remaining: 10.4s
bestTest = 0.7265993357
bestIteration = 1171
Shrink model to first 1172 iterations.
Fold 5


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6750207	best: 0.6750207 (0)	total: 3.76ms	remaining: 15s
200:	test: 0.7196221	best: 0.7196221 (200)	total: 733ms	remaining: 13.9s
400:	test: 0.7235309	best: 0.7235309 (400)	total: 1.45s	remaining: 13s
600:	test: 0.7247284	best: 0.7247284 (600)	total: 2.15s	remaining: 12.2s
800:	test: 0.7252052	best: 0.7252471 (789)	total: 2.85s	remaining: 11.4s
1000:	test: 0.7253951	best: 0.7254029 (997)	total: 3.58s	remaining: 10.7s
1200:	test: 0.7255499	best: 0.7255670 (1185)	total: 4.31s	remaining: 10s
1400:	test: 0.7255838	best: 0.7256293 (1348)	total: 5.03s	remaining: 9.33s
bestTest = 0.7256292701
bestIteration = 1348
Shrink model to first 1349 iterations.


In [None]:
print("=== CatBoost Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f"预测的自信程度: {test_preds.mean():.4f}")

=== CatBoost Classifier Performance ===
准确率: 0.6832
精确率: 0.7057
召回率: 0.8435
F1 分数: 0.7685
ROC AUC: 0.7255
预测的自信程度: 0.6249


In [355]:
lgb_params = {
    'n_estimators': 3000,
    'learning_rate': 0.03,
    'num_leaves': 64,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': "binary",
    'random_state': 42,
}

In [359]:
oof = np.zeros(len(features))
test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_train_fold, y_train_fold,
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits

Fold 1
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1871
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Fold 2
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1872
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> init

In [360]:
print("=== LightGBM Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f"预测的自信程度: {test_preds.mean():.4f}")

=== LightGBM Classifier Performance ===
准确率: 0.6842
精确率: 0.7089
召回率: 0.8369
F1 分数: 0.7676
ROC AUC: 0.7267
预测的自信程度: 0.6244


In [361]:
test_df = pd.DataFrame({
    'id': test_data['id'],
    'diagnosed_diabetes': (test_preds > 0.6).astype(int)
})
test_df.to_csv('data/submission.csv', index=False)
test_df.head()

Unnamed: 0,id,diagnosed_diabetes
0,700000,0
1,700001,1
2,700002,1
3,700003,0
4,700004,1
