In [362]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold


In [363]:
train_data = pd.read_csv('data/train.csv')
train_data.columns

Index(['id', 'age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [364]:
features = train_data.drop('diagnosed_diabetes', axis=1)
labels = train_data['diagnosed_diabetes'].astype(int)

features.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,102,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,124,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,108,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,123,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,124,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0


In [365]:
DROP_COLUMNS = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
features = features.drop(columns=DROP_COLUMNS, axis=1)
features.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,85,206,49,131,124,0,1,0


In [366]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 4000,
    "learning_rate": 0.1,
    "depth": 6,
    "l2_leaf_reg": 6,
    "random_strength": 1.0,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.8,
    "min_data_in_leaf": 50,
    "random_seed": 42,
    "verbose": 200,
    "task_type": "GPU"   # TURN GPU ON
}
test_data = pd.read_csv('data/test.csv').drop(DROP_COLUMNS, axis=1)
test_data.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,64,209,55,135,111,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,59,159,47,83,145,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,75,173,43,99,184,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,61,203,59,116,128,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,79,177,59,87,133,0,0,0


In [367]:
oof = np.zeros(len(features))
cat_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        early_stopping_rounds=100,
        use_best_model=True
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    cat_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits
    


Fold 1
0:	test: 0.6754684	best: 0.6754684 (0)	total: 3.74ms	remaining: 15s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.7198502	best: 0.7198502 (200)	total: 769ms	remaining: 14.5s
400:	test: 0.7240083	best: 0.7240083 (400)	total: 1.55s	remaining: 13.9s
600:	test: 0.7252448	best: 0.7252448 (600)	total: 2.32s	remaining: 13.1s
800:	test: 0.7257369	best: 0.7257369 (800)	total: 3.09s	remaining: 12.3s
1000:	test: 0.7259387	best: 0.7259533 (991)	total: 3.86s	remaining: 11.6s
1200:	test: 0.7260693	best: 0.7260774 (1193)	total: 4.63s	remaining: 10.8s
bestTest = 0.7261635065
bestIteration = 1288
Shrink model to first 1289 iterations.
Fold 2


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6745180	best: 0.6745180 (0)	total: 4.03ms	remaining: 16.1s
200:	test: 0.7184886	best: 0.7184886 (200)	total: 757ms	remaining: 14.3s
400:	test: 0.7220796	best: 0.7220796 (400)	total: 1.52s	remaining: 13.6s
600:	test: 0.7234920	best: 0.7234960 (596)	total: 2.27s	remaining: 12.8s
800:	test: 0.7238503	best: 0.7238584 (799)	total: 3.02s	remaining: 12.1s
1000:	test: 0.7240846	best: 0.7241241 (989)	total: 3.76s	remaining: 11.3s
1200:	test: 0.7242264	best: 0.7242405 (1135)	total: 4.51s	remaining: 10.5s
bestTest = 0.724250108
bestIteration = 1232
Shrink model to first 1233 iterations.
Fold 3


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6803592	best: 0.6803592 (0)	total: 3.74ms	remaining: 15s
200:	test: 0.7190200	best: 0.7190200 (200)	total: 723ms	remaining: 13.7s
400:	test: 0.7228351	best: 0.7228351 (400)	total: 1.45s	remaining: 13s
600:	test: 0.7238551	best: 0.7238626 (599)	total: 2.19s	remaining: 12.4s
800:	test: 0.7243946	best: 0.7243946 (800)	total: 2.93s	remaining: 11.7s
1000:	test: 0.7246622	best: 0.7246829 (964)	total: 3.66s	remaining: 11s
1200:	test: 0.7247641	best: 0.7247721 (1114)	total: 4.38s	remaining: 10.2s
bestTest = 0.7247720957
bestIteration = 1114
Shrink model to first 1115 iterations.
Fold 4


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6759519	best: 0.6759519 (0)	total: 4.92ms	remaining: 19.7s
200:	test: 0.7200459	best: 0.7200459 (200)	total: 740ms	remaining: 14s
400:	test: 0.7243820	best: 0.7243868 (399)	total: 1.46s	remaining: 13.1s
600:	test: 0.7254893	best: 0.7254907 (599)	total: 2.19s	remaining: 12.4s
800:	test: 0.7261855	best: 0.7261862 (799)	total: 2.94s	remaining: 11.7s
1000:	test: 0.7263835	best: 0.7264129 (974)	total: 3.68s	remaining: 11s
1200:	test: 0.7265701	best: 0.7265718 (1185)	total: 4.43s	remaining: 10.3s
1400:	test: 0.7265377	best: 0.7266107 (1335)	total: 5.18s	remaining: 9.62s
bestTest = 0.7266107202
bestIteration = 1335
Shrink model to first 1336 iterations.
Fold 5
0:	test: 0.6750207	best: 0.6750207 (0)	total: 3.73ms	remaining: 14.9s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.7196210	best: 0.7196210 (200)	total: 736ms	remaining: 13.9s
400:	test: 0.7236236	best: 0.7236236 (400)	total: 1.48s	remaining: 13.3s
600:	test: 0.7247748	best: 0.7247864 (594)	total: 2.21s	remaining: 12.5s
800:	test: 0.7253340	best: 0.7253780 (789)	total: 2.95s	remaining: 11.8s
1000:	test: 0.7254855	best: 0.7255008 (995)	total: 3.69s	remaining: 11.1s
1200:	test: 0.7255538	best: 0.7255604 (1115)	total: 4.44s	remaining: 10.4s
bestTest = 0.7255907357
bestIteration = 1292
Shrink model to first 1293 iterations.


In [368]:
print("=== CatBoost Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f"预测的自信程度: {cat_test_preds.mean():.4f}")

=== CatBoost Classifier Performance ===
准确率: 0.6832
精确率: 0.7058
召回率: 0.8432
F1 分数: 0.7684
ROC AUC: 0.7255
预测的自信程度: 0.6248


In [369]:
lgb_params = {
    'n_estimators': 3000,
    'learning_rate': 0.03,
    'num_leaves': 64,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': "binary",
    'random_state': 42,
}

In [370]:
oof = np.zeros(len(features))
lgb_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_train_fold, y_train_fold,
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    lgb_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits

Fold 1
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1871
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Fold 2
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007985 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1872
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 19


In [371]:
print("=== LightGBM Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f"预测的自信程度: {cat_test_preds.mean():.4f}")

=== LightGBM Classifier Performance ===
准确率: 0.6842
精确率: 0.7089
召回率: 0.8369
F1 分数: 0.7676
ROC AUC: 0.7267
预测的自信程度: 0.6248


In [373]:
cat_test_preds

array([0.52501945, 0.70522766, 0.75874745, ..., 0.69105839, 0.62496566,
       0.61158939])

In [None]:
# 排名概率融合
from scipy.stats import rankdata
cat_test_df = pd.DataFrame(cat_test_preds, columns=["diagnosed_diabetes"])
lgb_test_df = pd.DataFrame(lgb_test_preds, columns=["diagnosed_diabetes"])
cat_test_df["rank"] = rankdata(cat_test_df["diagnosed_diabetes"])
lgb_test_df["rank"] = rankdata(lgb_test_df["diagnosed_diabetes"])

test_df = pd.DataFrame({
    'id': test_data['id']
})
test_df["diagnosed_diabetes"] = 0.5 * cat_test_df["rank"] + 0.5 * lgb_test_df["rank"]
test_df["diagnosed_diabetes"] /= test_df["diagnosed_diabetes"].max()

test_df["diagnosed_diabetes"] = (test_df["diagnosed_diabetes"] > 0.6).astype(int)

test_df[["id", "diagnosed_diabetes"]].to_csv("data/submission.csv", index=False)
test_df.head()

Unnamed: 0,id,diagnosed_diabetes
0,700000,0
1,700001,0
2,700002,1
3,700003,0
4,700004,1
