In [263]:
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold


In [264]:
train_data = pd.read_csv('data/train.csv', index_col='id')
train_data.columns

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'gender', 'ethnicity', 'education_level',
       'income_level', 'smoking_status', 'employment_status',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'diagnosed_diabetes'],
      dtype='object')

In [265]:
features = train_data.drop('diagnosed_diabetes', axis=1)
labels = train_data['diagnosed_diabetes'].astype(int)

features.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,...,102,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,...,124,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,...,108,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,...,123,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,...,124,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0


In [266]:
DROP_COLUMNS = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
features = features.drop(columns=DROP_COLUMNS, axis=1)
features.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,70,62,199,58,114,102,0,0,0
1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,77,71,199,50,121,124,0,0,0
2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,89,73,188,59,114,108,0,0,0
3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,69,74,182,54,85,123,0,1,0
4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,60,85,206,49,131,124,0,1,0


In [267]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 4000,
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 6,
    "random_strength": 1.0,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.8,
    "min_data_in_leaf": 50,
    "random_seed": 42,
    "verbose": 200,
    "task_type": "GPU"   # TURN GPU ON
}
test_data = pd.read_csv('data/test.csv', index_col='id').drop(DROP_COLUMNS, axis=1)
test_data.head()

Unnamed: 0_level_0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,70,64,209,55,135,111,0,0,0
700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,74,59,159,47,83,145,0,0,0
700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,71,75,173,43,99,184,0,0,0
700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,81,61,203,59,116,128,0,0,0
700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,78,79,177,59,87,133,0,0,0


In [268]:
oof = np.zeros(len(features))
cat_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = CatBoostClassifier(**params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        early_stopping_rounds=100,
        use_best_model=True
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    cat_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits
    


Fold 1
0:	test: 0.6754684	best: 0.6754684 (0)	total: 4.2ms	remaining: 16.8s


Default metric period is 5 because AUC is/are not implemented for GPU


200:	test: 0.7125187	best: 0.7125187 (200)	total: 750ms	remaining: 14.2s
400:	test: 0.7199801	best: 0.7199801 (400)	total: 1.51s	remaining: 13.5s
600:	test: 0.7225996	best: 0.7225996 (600)	total: 2.27s	remaining: 12.9s
800:	test: 0.7237969	best: 0.7238049 (799)	total: 3.03s	remaining: 12.1s
1000:	test: 0.7246811	best: 0.7246811 (1000)	total: 3.78s	remaining: 11.3s
1200:	test: 0.7252959	best: 0.7252959 (1200)	total: 4.54s	remaining: 10.6s
1400:	test: 0.7256179	best: 0.7256179 (1400)	total: 5.32s	remaining: 9.87s
1600:	test: 0.7257763	best: 0.7257803 (1598)	total: 6.08s	remaining: 9.12s
1800:	test: 0.7259508	best: 0.7259508 (1800)	total: 6.87s	remaining: 8.38s
2000:	test: 0.7260779	best: 0.7260813 (1996)	total: 7.63s	remaining: 7.63s
2200:	test: 0.7261631	best: 0.7261717 (2173)	total: 8.39s	remaining: 6.86s
2400:	test: 0.7262314	best: 0.7262341 (2394)	total: 9.14s	remaining: 6.09s
2600:	test: 0.7262740	best: 0.7262802 (2567)	total: 9.89s	remaining: 5.32s
2800:	test: 0.7263910	best: 0.726

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6799040	best: 0.6799040 (0)	total: 4.19ms	remaining: 16.7s
200:	test: 0.7109334	best: 0.7109334 (200)	total: 753ms	remaining: 14.2s
400:	test: 0.7178888	best: 0.7178888 (400)	total: 1.51s	remaining: 13.6s
600:	test: 0.7207319	best: 0.7207319 (600)	total: 2.28s	remaining: 12.9s
800:	test: 0.7219473	best: 0.7219480 (799)	total: 3.04s	remaining: 12.1s
1000:	test: 0.7227073	best: 0.7227088 (998)	total: 3.82s	remaining: 11.4s
1200:	test: 0.7231665	best: 0.7231665 (1200)	total: 4.59s	remaining: 10.7s
1400:	test: 0.7234616	best: 0.7234685 (1397)	total: 5.34s	remaining: 9.91s
1600:	test: 0.7237249	best: 0.7237249 (1600)	total: 6.12s	remaining: 9.17s
1800:	test: 0.7238597	best: 0.7238615 (1795)	total: 6.88s	remaining: 8.4s
2000:	test: 0.7239773	best: 0.7239892 (1978)	total: 7.64s	remaining: 7.63s
2200:	test: 0.7240416	best: 0.7240580 (2182)	total: 8.41s	remaining: 6.87s
2400:	test: 0.7241383	best: 0.7241479 (2392)	total: 9.16s	remaining: 6.1s
2600:	test: 0.7242265	best: 0.7242269 (25

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6808420	best: 0.6808420 (0)	total: 4.08ms	remaining: 16.3s
200:	test: 0.7110625	best: 0.7110625 (200)	total: 747ms	remaining: 14.1s
400:	test: 0.7189298	best: 0.7189298 (400)	total: 1.49s	remaining: 13.4s
600:	test: 0.7214191	best: 0.7214191 (600)	total: 2.25s	remaining: 12.7s
800:	test: 0.7228023	best: 0.7228023 (800)	total: 3.01s	remaining: 12s
1000:	test: 0.7235882	best: 0.7235894 (999)	total: 3.76s	remaining: 11.3s
1200:	test: 0.7241095	best: 0.7241112 (1199)	total: 4.5s	remaining: 10.5s
1400:	test: 0.7245012	best: 0.7245012 (1400)	total: 5.26s	remaining: 9.77s
1600:	test: 0.7248222	best: 0.7248271 (1598)	total: 6.02s	remaining: 9.02s
1800:	test: 0.7250589	best: 0.7250589 (1800)	total: 6.8s	remaining: 8.3s
2000:	test: 0.7252201	best: 0.7252257 (1998)	total: 7.55s	remaining: 7.54s
2200:	test: 0.7253965	best: 0.7254040 (2192)	total: 8.32s	remaining: 6.8s
2400:	test: 0.7254754	best: 0.7254818 (2397)	total: 9.1s	remaining: 6.06s
bestTest = 0.7254818082
bestIteration = 2397
S

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6798570	best: 0.6798570 (0)	total: 3.82ms	remaining: 15.3s
200:	test: 0.7118572	best: 0.7118572 (200)	total: 746ms	remaining: 14.1s
400:	test: 0.7199318	best: 0.7199318 (400)	total: 1.49s	remaining: 13.3s
600:	test: 0.7228732	best: 0.7228732 (600)	total: 2.24s	remaining: 12.7s
800:	test: 0.7240799	best: 0.7240799 (800)	total: 3s	remaining: 12s
1000:	test: 0.7248726	best: 0.7248726 (1000)	total: 3.75s	remaining: 11.2s
1200:	test: 0.7253633	best: 0.7253633 (1200)	total: 4.5s	remaining: 10.5s
1400:	test: 0.7258160	best: 0.7258194 (1399)	total: 5.25s	remaining: 9.75s
1600:	test: 0.7260047	best: 0.7260056 (1599)	total: 6.01s	remaining: 9s
1800:	test: 0.7262214	best: 0.7262214 (1800)	total: 6.78s	remaining: 8.28s
2000:	test: 0.7263849	best: 0.7263851 (1995)	total: 7.53s	remaining: 7.52s
2200:	test: 0.7264466	best: 0.7264557 (2174)	total: 8.3s	remaining: 6.78s
2400:	test: 0.7265776	best: 0.7265776 (2400)	total: 9.06s	remaining: 6.04s
2600:	test: 0.7266550	best: 0.7266550 (2600)	tot

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6752357	best: 0.6752357 (0)	total: 3.86ms	remaining: 15.4s
200:	test: 0.7123778	best: 0.7123778 (200)	total: 740ms	remaining: 14s
400:	test: 0.7192063	best: 0.7192063 (400)	total: 1.49s	remaining: 13.4s
600:	test: 0.7222739	best: 0.7222739 (600)	total: 2.25s	remaining: 12.7s
800:	test: 0.7235574	best: 0.7235574 (800)	total: 2.99s	remaining: 11.9s
1000:	test: 0.7243605	best: 0.7243685 (996)	total: 3.74s	remaining: 11.2s
1200:	test: 0.7248076	best: 0.7248081 (1198)	total: 4.49s	remaining: 10.5s
1400:	test: 0.7251959	best: 0.7251959 (1400)	total: 5.24s	remaining: 9.71s
1600:	test: 0.7254881	best: 0.7254973 (1586)	total: 5.98s	remaining: 8.96s
1800:	test: 0.7256778	best: 0.7256922 (1792)	total: 6.73s	remaining: 8.21s
2000:	test: 0.7258176	best: 0.7258210 (1996)	total: 7.47s	remaining: 7.47s
2200:	test: 0.7258784	best: 0.7258877 (2153)	total: 8.24s	remaining: 6.73s
2400:	test: 0.7260129	best: 0.7260177 (2398)	total: 8.98s	remaining: 5.98s
2600:	test: 0.7260927	best: 0.7260927 (26

In [269]:
print("=== CatBoost Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

=== CatBoost Classifier Performance ===
准确率: 0.6835
精确率: 0.7063
召回率: 0.8427
F1 分数: 0.7685
ROC AUC: 0.7258


In [270]:
lgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.03,
    'num_leaves': 64,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': "binary",
    'random_state': 42,
}

In [271]:
oof = np.zeros(len(features))
lgb_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_train_fold, y_train_fold,
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    lgb_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits

Fold 1
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1616
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503556
[LightGBM] [Info] Start training from score 0.503556
Fold 2
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1617
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 18


In [272]:
print("=== LightGBM Classifier Performance ===")

accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

=== LightGBM Classifier Performance ===
准确率: 0.6818
精确率: 0.7027
召回率: 0.8485
F1 分数: 0.7687
ROC AUC: 0.7230


In [273]:
oof = np.zeros(len(features))
lgb_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    model = XGBClassifier(
        n_estimators=200,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        random_state=42,
    )
    model.fit(
        X_train_fold, y_train_fold,
    )
    oof[val_index] = model.predict_proba(X_val_fold)[:, 1]
    lgb_test_preds += model.predict_proba(test_data)[:, 1] / skf.n_splits

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [274]:
print("=== XGBoost Classifier Performance ===")
accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

=== XGBoost Classifier Performance ===
准确率: 0.6764
精确率: 0.6939
召回率: 0.8602
F1 分数: 0.7682
ROC AUC: 0.7156


In [275]:
oof = np.zeros(len(features))
rf_test_preds = np.zeros(len(test_data))
for fold, (train_index, val_index) in enumerate(skf.split(features, labels)):
    print(f"Fold {fold + 1}")
    X_train_fold, X_val_fold = features.iloc[train_index], features.iloc[val_index]
    y_train_fold, y_val_fold = labels.iloc[train_index], labels.iloc[val_index]
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train_fold, y_train_fold)
    oof[val_index] = rf.predict_proba(X_val_fold)[:, 1]
    rf_test_preds += rf.predict_proba(test_data)[:, 1] / skf.n_splits
print("=== Random Forest Classifier Performance ===")
accuracy = accuracy_score(labels, (oof > 0.5).astype(int))
precision = precision_score(labels, (oof > 0.5).astype(int))
recall = recall_score(labels, (oof > 0.5).astype(int))
f1 = f1_score(labels, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(labels, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
=== Random Forest Classifier Performance ===
准确率: 0.6661
精确率: 0.6785
召回率: 0.8823
F1 分数: 0.7671
ROC AUC: 0.6993


In [276]:
feature_train, feature_test, label_train, label_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

In [None]:
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    max_iter=300,
    random_state=42,
    early_stopping=True,
    n_iter_no_change=20,
    verbose=True
)
mlp.fit(feature_train, label_train)
oof = mlp.predict_proba(feature_test)[:, 1]

print("=== Neural Network Classifier Performance ===")
accuracy = accuracy_score(label_test, (oof > 0.5).astype(int))
precision = precision_score(label_test, (oof > 0.5).astype(int))
recall = recall_score(label_test, (oof > 0.5).astype(int))
f1 = f1_score(label_test, (oof > 0.5).astype(int))
roc_auc = roc_auc_score(label_test, oof)
print(f'准确率: {accuracy:.4f}')
print(f'精确率: {precision:.4f}')
print(f'召回率: {recall:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

Iteration 1, loss = 0.65374391
Validation score: 0.658268
Iteration 2, loss = 0.62163321
Validation score: 0.658304
Iteration 3, loss = 0.61402951
Validation score: 0.635821
Iteration 4, loss = 0.61114231
Validation score: 0.660393
Iteration 5, loss = 0.60824560
Validation score: 0.660804
Iteration 6, loss = 0.60740800
Validation score: 0.660679
Iteration 7, loss = 0.60574904
Validation score: 0.661036
Iteration 8, loss = 0.60531453
Validation score: 0.662232
Iteration 9, loss = 0.60498401
Validation score: 0.662411
Iteration 10, loss = 0.60466407
Validation score: 0.659554
Iteration 11, loss = 0.60440393
Validation score: 0.661161
Iteration 12, loss = 0.60430186
Validation score: 0.662839
Iteration 13, loss = 0.60418819
Validation score: 0.661107
Iteration 14, loss = 0.60394921
Validation score: 0.660589
Iteration 15, loss = 0.60387005
Validation score: 0.663089
Iteration 16, loss = 0.60400232
Validation score: 0.662482
Iteration 17, loss = 0.60379165
Validation score: 0.662714
Iterat

In [143]:
# 排名概率融合
from scipy.stats import rankdata
cat_test_df = pd.DataFrame(cat_test_preds, columns=["diagnosed_diabetes"])
lgb_test_df = pd.DataFrame(lgb_test_preds, columns=["diagnosed_diabetes"])
cat_test_df["rank"] = rankdata(cat_test_df["diagnosed_diabetes"])
lgb_test_df["rank"] = rankdata(lgb_test_df["diagnosed_diabetes"])

test_df = pd.DataFrame({
    'id': test_data.index
})
test_df["diagnosed_diabetes"] = 0.5 * cat_test_df["rank"] + 0.5 * lgb_test_df["rank"]
test_df["diagnosed_diabetes"] /= test_df["diagnosed_diabetes"].max()

test_df["diagnosed_diabetes"] = (test_df["diagnosed_diabetes"] > 0.5).astype(int)

test_df[["id", "diagnosed_diabetes"]].to_csv("data/submission.csv", index=False)
test_df.head()

Unnamed: 0,id,diagnosed_diabetes
0,700000,0
1,700001,1
2,700002,1
3,700003,0
4,700004,1
