In [437]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold


In [438]:
train_data = pd.read_csv('data/train.csv', index_col='id')
train_data.columns

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [439]:
features = train_data.drop('diagnosed_diabetes', axis=1)
labels = train_data['diagnosed_diabetes'].astype(int)

features.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,102,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,124,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,108,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,...,123,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,...,124,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0


In [440]:
DROP_COLUMNS = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
features = features.drop(columns=DROP_COLUMNS, axis=1)
features.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,85,206,49,131,124,0,1,0


In [441]:
# 分箱
from sklearn.preprocessing import KBinsDiscretizer

BINARY_COLUMNS = ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
ID_COL = ['id']

discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform', random_state=42)
binned_features = discretizer.fit_transform(features.drop(columns=BINARY_COLUMNS, axis=1))
binned_features = pd.DataFrame(binned_features, columns=features.drop(columns=BINARY_COLUMNS, axis=1).columns)
features = pd.concat([binned_features, features[BINARY_COLUMNS]], axis=1)
features.head()



Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
0,0.0,0.0,0.0,3.0,2.0,1.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,0,0,0
1,2.0,0.0,0.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,0,0,0
2,0.0,1.0,1.0,4.0,3.0,2.0,1.0,2.0,0.0,3.0,2.0,2.0,2.0,2.0,1.0,0,0,0
3,2.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0,1,0
4,2.0,0.0,0.0,2.0,2.0,1.0,2.0,2.0,1.0,0.0,3.0,2.0,2.0,2.0,1.0,0,1,0


In [442]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 4000,
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 6,
    "random_strength": 1.0,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.8,
    "min_data_in_leaf": 50,
    "random_seed": 42,
    "verbose": 200,
    "task_type": "GPU"   # TURN GPU ON
}
test_data = pd.read_csv('data/test.csv', index_col='id').drop(DROP_COLUMNS, axis=1)
test_data.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,64,209,55,135,111,0,0,0
700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,59,159,47,83,145,0,0,0
700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,75,173,43,99,184,0,0,0
700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,61,203,59,116,128,0,0,0
700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,79,177,59,87,133,0,0,0


In [443]:
discretizer.transform(test_data.drop(columns=BINARY_COLUMNS, axis=1))
binned_test_features = discretizer.transform(test_data.drop(columns=BINARY_COLUMNS, axis=1))
binned_test_features = pd.DataFrame(binned_test_features, columns=test_data.drop(columns=BINARY_COLUMNS, axis=1).columns)
test_data = pd.concat([binned_test_features, test_data[BINARY_COLUMNS]], axis=1)
test_data.head()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
0,1.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,,,
1,1.0,0.0,0.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,,,
2,1.0,0.0,0.0,3.0,2.0,2.0,2.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,,,
3,2.0,0.0,0.0,3.0,3.0,1.0,2.0,3.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,,,
4,4.0,0.0,0.0,3.0,3.0,2.0,1.0,2.0,2.0,2.0,3.0,1.0,2.0,1.0,1.0,,,


In [444]:
oof = np.zeros(len(features))
cat_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        early_stopping_rounds=100,
        use_best_model=True
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    cat_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits
    


Fold 1
0:	test: 0.6676485	best: 0.6676485 (0)	total: 3.48ms	remaining: 13.9s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.6780719	best: 0.6780746 (193)	total: 709ms	remaining: 13.4s
bestTest = 0.6780854464
bestIteration = 297
Shrink model to first 298 iterations.
Fold 2


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6662667	best: 0.6662667 (0)	total: 4.53ms	remaining: 18.1s
200:	test: 0.6764141	best: 0.6764189 (198)	total: 723ms	remaining: 13.7s
bestTest = 0.6764189303
bestIteration = 198
Shrink model to first 199 iterations.
Fold 3


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6672244	best: 0.6672244 (0)	total: 3.83ms	remaining: 15.3s
200:	test: 0.6766171	best: 0.6766171 (200)	total: 738ms	remaining: 13.9s
400:	test: 0.6767280	best: 0.6767332 (394)	total: 1.46s	remaining: 13.1s
bestTest = 0.6767332256
bestIteration = 394
Shrink model to first 395 iterations.
Fold 4


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6675765	best: 0.6675765 (0)	total: 3.6ms	remaining: 14.4s
200:	test: 0.6784087	best: 0.6784102 (198)	total: 729ms	remaining: 13.8s
bestTest = 0.6784871221
bestIteration = 292
Shrink model to first 293 iterations.
Fold 5


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6684343	best: 0.6684343 (0)	total: 4.57ms	remaining: 18.3s
200:	test: 0.6794832	best: 0.6794855 (198)	total: 710ms	remaining: 13.4s
bestTest = 0.6795458496
bestIteration = 283
Shrink model to first 284 iterations.


In [445]:
print("=== CatBoost Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f"预测的自信程度: {cat_test_preds.mean():.4f}")

=== CatBoost Classifier Performance ===
准确率: 0.6533
精确率: 0.6681
召回率: 0.8816
F1 分数: 0.7602
ROC AUC: 0.6778
预测的自信程度: 0.5415


In [446]:
lgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.03,
    'num_leaves': 64,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': "binary",
    'random_state': 42,
}

In [447]:
oof = np.zeros(len(features))
lgb_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_train_fold, y_train_fold,
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    lgb_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits

Fold 1
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Fold 2
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 18
[Lig

In [448]:
print("=== LightGBM Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f"预测的自信程度: {cat_test_preds.mean():.4f}")

=== LightGBM Classifier Performance ===
准确率: 0.6530
精确率: 0.6683
召回率: 0.8798
F1 分数: 0.7596
ROC AUC: 0.6774
预测的自信程度: 0.5415


In [449]:
# 排名概率融合
from scipy.stats import rankdata
cat_test_df = pd.DataFrame(cat_test_preds, columns=["diagnosed_diabetes"])
lgb_test_df = pd.DataFrame(lgb_test_preds, columns=["diagnosed_diabetes"])
cat_test_df["rank"] = rankdata(cat_test_df["diagnosed_diabetes"])
lgb_test_df["rank"] = rankdata(lgb_test_df["diagnosed_diabetes"])

test_df = pd.DataFrame({
    'id': test_data.index
})
test_df["diagnosed_diabetes"] = 0.5 * cat_test_df["rank"] + 0.5 * lgb_test_df["rank"]
test_df["diagnosed_diabetes"] /= test_df["diagnosed_diabetes"].max()

test_df["diagnosed_diabetes"] = (test_df["diagnosed_diabetes"] > 0.5).astype(int)

test_df[["id", "diagnosed_diabetes"]].to_csv("data/submission.csv", index=False)
test_df.head()

Unnamed: 0,id,diagnosed_diabetes
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
