In [312]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold


In [313]:
ALPHA = 0.5

In [314]:
train_data = pd.read_csv('data/train.csv', index_col='id')
train_data.columns

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [315]:
features = train_data.drop('diagnosed_diabetes', axis=1)
labels = train_data['diagnosed_diabetes'].astype(int)

features.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,102,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,124,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,108,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,...,123,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,...,124,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0


In [316]:
DROP_COLUMNS = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
features = features.drop(columns=DROP_COLUMNS, axis=1)
features.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,85,206,49,131,124,0,1,0


In [317]:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 4000,
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 6,
    "random_strength": 1.0,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.8,
    "min_data_in_leaf": 50,
    "random_seed": 42,
    "verbose": 200,
    "task_type": "GPU"   # TURN GPU ON
}
test_data = pd.read_csv('data/test.csv', index_col='id').drop(DROP_COLUMNS, axis=1)
test_data.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,64,209,55,135,111,0,0,0
700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,59,159,47,83,145,0,0,0
700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,75,173,43,99,184,0,0,0
700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,61,203,59,116,128,0,0,0
700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,79,177,59,87,133,0,0,0


In [318]:
feature_train, feature_test, label_train, label_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

In [319]:
cat_model = CatBoostClassifier(**params)
cat_model.fit(
    feature_train, label_train,
    eval_set=(feature_test, label_test),
    early_stopping_rounds=100,
    use_best_model=True
)
test_preds = cat_model.predict_proba(feature_test)[:, 1]

print("=== CatBoost Classifier Performance ===")
accuracy = accuracy_score(label_test, (test_preds > ALPHA).astype(int))
precision = precision_score(label_test, (test_preds > ALPHA).astype(int))
recall = recall_score(label_test, (test_preds > ALPHA).astype(int))
f1 = f1_score(label_test, (test_preds > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, test_preds)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6811745	best: 0.6811745 (0)	total: 3.96ms	remaining: 15.8s
200:	test: 0.7120017	best: 0.7120017 (200)	total: 746ms	remaining: 14.1s
400:	test: 0.7191644	best: 0.7191644 (400)	total: 1.5s	remaining: 13.5s
600:	test: 0.7216254	best: 0.7216254 (600)	total: 2.22s	remaining: 12.6s
800:	test: 0.7229390	best: 0.7229390 (800)	total: 2.98s	remaining: 11.9s
1000:	test: 0.7236770	best: 0.7236770 (1000)	total: 3.74s	remaining: 11.2s
1200:	test: 0.7242139	best: 0.7242264 (1194)	total: 4.5s	remaining: 10.5s
1400:	test: 0.7246180	best: 0.7246180 (1400)	total: 5.26s	remaining: 9.75s
1600:	test: 0.7248430	best: 0.7248430 (1600)	total: 6s	remaining: 8.99s
1800:	test: 0.7249793	best: 0.7249801 (1793)	total: 6.75s	remaining: 8.24s
2000:	test: 0.7250689	best: 0.7250698 (1998)	total: 7.51s	remaining: 7.5s
2200:	test: 0.7251676	best: 0.7251676 (2200)	total: 8.26s	remaining: 6.75s
2400:	test: 0.7252058	best: 0.7252325 (2328)	total: 9.01s	remaining: 6s
bestTest = 0.7252619267
bestIteration = 2473
Sh

In [320]:
lgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.03,
    'num_leaves': 64,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': "binary",
    'random_state': 42,
}

In [321]:
lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(
    feature_train, label_train
)
lgb_test_preds = lgb_model.predict_proba(feature_test)[:, 1]
print("=== LightGBM Classifier Performance ===")
accuracy = accuracy_score(label_test, (lgb_test_preds > ALPHA).astype(int))
precision = precision_score(label_test, (lgb_test_preds > ALPHA).astype(int))
recall = recall_score(label_test, (lgb_test_preds > ALPHA).astype(int))
f1 = f1_score(label_test, (lgb_test_preds > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, lgb_test_preds)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1624
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623296 -> initscore=0.503564
[LightGBM] [Info] Start training from score 0.503564
=== LightGBM Classifier Performance ===
准确率: 0.6822
精确率: 0.7030
召回率: 0.8486
F1 分数: 0.7690
ROC AUC: 0.7230


In [322]:
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    random_state=42,
)
xgb_model.fit(
    feature_train, label_train
)
xgb_test_preds = xgb_model.predict_proba(feature_test)[:, 1]
print("=== XGBoost Classifier Performance ===")
accuracy = accuracy_score(label_test, (xgb_test_preds > ALPHA).astype(int))
precision = precision_score(label_test, (xgb_test_preds > ALPHA).astype(int))
recall = recall_score(label_test, (xgb_test_preds > ALPHA).astype(int))
f1 = f1_score(label_test, (xgb_test_preds > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, xgb_test_preds)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

=== XGBoost Classifier Performance ===
准确率: 0.6771
精确率: 0.6944
召回率: 0.8608
F1 分数: 0.7687
ROC AUC: 0.7155


In [323]:
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
rf.fit(
    feature_train, label_train
)
oof = rf.predict_proba(feature_test)[:, 1]
print("=== Random Forest Classifier Performance ===")
accuracy = accuracy_score(label_test, (oof > ALPHA).astype(int))
precision = precision_score(label_test, (oof > ALPHA).astype(int))
recall = recall_score(label_test, (oof > ALPHA).astype(int))
f1 = f1_score(label_test, (oof > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

=== Random Forest Classifier Performance ===
准确率: 0.6668
精确率: 0.6792
召回率: 0.8818
F1 分数: 0.7674
ROC AUC: 0.6998


In [324]:
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    max_iter=300,
    random_state=42,
    early_stopping=True,
    n_iter_no_change=20,
    verbose=True
)
mlp.fit(feature_train, label_train)
oof = mlp.predict_proba(feature_test)[:, 1]

print("=== Neural Network Classifier Performance ===")
accuracy = accuracy_score(label_test, (oof > ALPHA).astype(int))
precision = precision_score(label_test, (oof > ALPHA).astype(int))
recall = recall_score(label_test, (oof > ALPHA).astype(int))
f1 = f1_score(label_test, (oof > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

Iteration 1, loss = 0.65374391
Validation score: 0.658268
Iteration 2, loss = 0.62163321
Validation score: 0.658304
Iteration 3, loss = 0.61402951
Validation score: 0.635821
Iteration 4, loss = 0.61114231
Validation score: 0.660393
Iteration 5, loss = 0.60824560
Validation score: 0.660804
Iteration 6, loss = 0.60740800
Validation score: 0.660679
Iteration 7, loss = 0.60574904
Validation score: 0.661036
Iteration 8, loss = 0.60531453
Validation score: 0.662232
Iteration 9, loss = 0.60498401
Validation score: 0.662411
Iteration 10, loss = 0.60466407
Validation score: 0.659554
Iteration 11, loss = 0.60440393
Validation score: 0.661161




=== Neural Network Classifier Performance ===
准确率: 0.6649
精确率: 0.6806
召回率: 0.8713
F1 分数: 0.7642
ROC AUC: 0.6949


In [332]:
from scipy.stats import rankdata

cat_validation_preds = cat_model.predict_proba(feature_test)[:, 1]
lgb_validation_preds = lgb_model.predict_proba(feature_test)[:, 1]

validation_df = pd.DataFrame({
    'cat_preds': cat_validation_preds,
    'lgb_preds': lgb_validation_preds
})
validation_df["cat_rank"] = rankdata(validation_df["cat_preds"])
validation_df["lgb_rank"] = rankdata(validation_df["lgb_preds"])

k = 0.8

validation_df["final_preds"] = k * validation_df["cat_rank"] + (1 - k) * validation_df["lgb_rank"]
validation_df["final_preds"] /= validation_df["final_preds"].max()

print("=== Ensemble Model Performance ===")
accuracy = accuracy_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
precision = precision_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
recall = recall_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
f1 = f1_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, validation_df["final_preds"])
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')


=== Ensemble Model Performance ===
准确率: 0.6534
精确率: 0.7767
召回率: 0.6230
F1 分数: 0.6914
ROC AUC: 0.7254


In [336]:
validation_df = pd.DataFrame({
    'cat_preds': cat_validation_preds,
    'lgb_preds': lgb_validation_preds
})
validation_df["cat_norm"] = (validation_df["cat_preds"] - validation_df["cat_preds"].min()) / (validation_df["cat_preds"].max() - validation_df["cat_preds"].min())
validation_df["lgb_norm"] = (validation_df["lgb_preds"] - validation_df["lgb_preds"].min()) / (validation_df["lgb_preds"].max() - validation_df["lgb_preds"].min())

k = 0.2
validation_df["final_preds"] = k * validation_df["cat_norm"] + (1 - k) * validation_df["lgb_norm"]
print("=== Ensemble Model Performance (Normalized) ===")
accuracy = accuracy_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
precision = precision_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
recall = recall_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
f1 = f1_score(label_test, (validation_df["final_preds"] > ALPHA).astype(int))
roc_auc = roc_auc_score(label_test, validation_df["final_preds"])
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

=== Ensemble Model Performance (Normalized) ===
准确率: 0.6822
精确率: 0.7195
召回率: 0.8033
F1 分数: 0.7591
ROC AUC: 0.7240


In [None]:
# 排名概率融合
from scipy.stats import rankdata
cat_test_preds = cat_model.predict_proba(test_data)[:, 1]
lgb_test_preds = lgb_model.predict_proba(test_data)[:, 1]

cat_test_df = pd.DataFrame(cat_test_preds, columns=["diagnosed_diabetes"])
lgb_test_df = pd.DataFrame(lgb_test_preds, columns=["diagnosed_diabetes"])
cat_test_df["rank"] = rankdata(cat_test_df["diagnosed_diabetes"])
lgb_test_df["rank"] = rankdata(lgb_test_df["diagnosed_diabetes"])

test_df = pd.DataFrame({
    'id': test_data.index
})
test_df["diagnosed_diabetes"] = 0.5 * cat_test_df["rank"] + 0.5 * lgb_test_df["rank"]
test_df["diagnosed_diabetes"] /= test_df["diagnosed_diabetes"].max()

test_df["diagnosed_diabetes"] = (test_df["diagnosed_diabetes"] > ALPHA).astype(int)

test_df[["id", "diagnosed_diabetes"]].to_csv("data/submission.csv", index=False)
test_df.head()