In [2]:
import os

import pandas as pd

from utils.data_loading import load_dataset

In [2]:
# Run-level switches.
# NOTE(review): neither flag is referenced by any cell visible in this notebook —
# confirm they are still needed before relying on them.
use_selected_cols = True
with_gene = True

In [3]:
# CSV inputs passed to load_dataset: the internal cohort (split into train/test by the
# training cells) and the external cohort (only ever passed to eval_model).
internal_dataset = './dataset/复旦儿科_135年_特征1.csv'
external_dataset = './dataset/外院_135年_特征1.csv'

## Training

### 所有特征

In [3]:
from models.xgb import train_xgboost_classifier
from models.rf import train_rf_classifier
from models.svm import train_svm_classifier
from models.knn_classifier import train_knn_classifier
from models.ann import train_ann_classifier
from models.gbdt import train_gbdt_classifier
from models.gbm import train_gbm_classifier
from models.lightgbm import train_lightgbm_classifier
from models.adaboost import train_adaboost_classifier
from models.ngboost import train_ngboost_classifier
from models.catboost_classifier import train_catboost_classifier

from utils.eval import eval_model
from sklearn.model_selection import train_test_split
import joblib

In [4]:
# Registry of short model key -> training function.
# The key is reused in the saved-model filename and in the 'Model' field of each
# metrics record. train_gbdt_classifier, train_lightgbm_classifier and
# train_ngboost_classifier are imported but not registered here.
model_dict = {
    'xgb': train_xgboost_classifier,
    'rf': train_rf_classifier,
    'svm': train_svm_classifier,
    'knn': train_knn_classifier,
    'ann': train_ann_classifier,
    'catboost': train_catboost_classifier,
    'gbm': train_gbm_classifier,
    'adaboost': train_adaboost_classifier,
}


In [6]:
# Accumulators for per-(year, model) metric records: one list for the internal
# test split, one for the external cohort. Two distinct list objects.
internal_results, external_results = [], []

In [7]:
# Full column list passed to load_dataset: clinical/phenotype features, then the
# genetic columns, then the three outcome columns. Order is unchanged.
cols = (
    [
        'gender (1/0)',
        'family_history (1/0)',
        'preterm_birth (1/0)',
        'prenatal_phenotype (1/0)',
        'cakut_subphenotype',
        'behavioral_cognitive_abnormalities (1/0)',
        'motor_retardation (1/0)',
        'congenital_heart_disease (1/0)',
        'skeletal_anormalies (1/0)',
        'genitoreproductive (1/0)',
        'central_nervous_system (1/0)',
        'face (1/0)',
        'hearing (1/0)',
        'ocular (1/0)',
        'external_ear (1/0)',
        'gastrointestinal_tract (1/0)',
        'age_first_diagnose',
        'ckd_stage_first_diagnose',
        'short_stature (1/0)',
        'hyperuricemia(1/0)',
    ]
    + [
        'CNV',
        'Chromosomal_abnormality',
        'PAX2',
        'TNXB',
        'EYA1',
        'HNF1β',
        'GATA3',
        'SALL1',
        'COL4A1',
        'Other_gene',
    ]
    + ['esrd_1y', 'esrd_3y', 'esrd_5y']
)

In [7]:
# Columns treated as numerical; the 'gbdt' branch of the training loops treats every
# other feature column as categorical.
numerical_categories = [
    'age_first_diagnose',
]

In [9]:
# Train every model in `model_dict` on the full feature list (`cols`) for each outcome
# year, persist the fitted model, and record metrics on the internal test split and on
# the external cohort.
output_dir = './output/all'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for year in [1, 3, 5]:
    # The data and the split depend only on `year`, so load and split once per year
    # instead of once per model. With the fixed random_state every model already
    # received the same split, so results are unchanged.
    # NOTE(review): assumes the train_* helpers and eval_model do not modify their
    # inputs in place — confirm.
    internal_X, internal_y = load_dataset(internal_dataset, year, cols)
    external_X, external_y = load_dataset(external_dataset, year, cols)
    X_train, X_test, y_train, y_test = train_test_split(
        internal_X, internal_y, test_size=0.3, random_state=42, stratify=internal_y
    )

    for model_name, train_methods in model_dict.items():
        print(year)
        if model_name == 'gbdt':
            # This trainer additionally receives the positional indices of all
            # columns that are not listed in numerical_categories.
            categorical_features = [
                i for i, col in enumerate(X_train.columns)
                if col not in numerical_categories
            ]
            best_model = train_methods(X_train, y_train, categorical_features)
        else:
            best_model = train_methods(X_train, y_train)
        joblib.dump(best_model, f'{output_dir}/{model_name}_{year}yr.pkl')

        internal_metrics = eval_model(best_model, X_test, y_test)
        external_metrics = eval_model(best_model, external_X, external_y)
        # Tag both records so the rows can be told apart in the results tables.
        for metrics in (internal_metrics, external_metrics):
            metrics['Model'] = model_name
            metrics['Year'] = year
        internal_results.append(internal_metrics)
        external_results.append(external_metrics)
        print("*"*50)

1
Fitting 10 folds for each of 972 candidates, totalling 9720 fits
{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}
AUC: 0.9944
0.9944(0.9786-1.0000)
Accuracy: 0.9932
Precision: 1.0000
Recall (Sensitivity): 0.9167
Specificity: 1.0000
F1 Score: 0.9565
AUC: 0.9628
0.9628(0.9040-1.0000)
Accuracy: 0.9412
Precision: 0.9000
Recall (Sensitivity): 0.8571
Specificity: 0.9688
F1 Score: 0.8780
**************************************************
1
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
{'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
AUC: 0.9914
0.9914(0.9715-1.0000)
Accuracy: 0.9660
Precision: 0.7333
Recall (Sensitivity): 0.9167
Specificity: 0.9704
F1 Score: 0.8148
AUC: 0.9740
0.9740(0.9379-1.0000)
Accuracy: 0.9294
Precision: 0.8947
Recall (Sensitivity): 0.8095
Specificity: 0.9688
F1 Score: 0.8500
**********



AUC: 0.9821
0.9821(0.9527-0.9987)
Accuracy: 0.9592
Precision: 0.8750
Recall (Sensitivity): 0.5833
Specificity: 0.9926
F1 Score: 0.7000




AUC: 0.8958
0.8958(0.8083-0.9585)
Accuracy: 0.8824
Precision: 0.9231
Recall (Sensitivity): 0.5714
Specificity: 0.9844
F1 Score: 0.7059
**************************************************
1
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'border_count': 50, 'depth': 5, 'iterations': 50, 'l2_leaf_reg': 10, 'learning_rate': 0.1, 'subsample': 0.8}
AUC: 0.9944
0.9944(0.9813-1.0000)
Accuracy: 0.9728
Precision: 0.8333
Recall (Sensitivity): 0.8333
Specificity: 0.9852
F1 Score: 0.8333
AUC: 0.9673
0.9673(0.9169-1.0000)
Accuracy: 0.9412
Precision: 0.9000
Recall (Sensitivity): 0.8571
Specificity: 0.9688
F1 Score: 0.8780
**************************************************
1
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.1, 'max_depth': 5, 'max_features': 0.8, 'n_estimators': 50, 'subsample': 1.0}
AUC: 0.9864
0.9864(0.9613-1.0000)
Accuracy: 0.9660
Precision: 0.8182
Recall (Sensitivity): 0.7500
Specificity: 0.9852
F1 Score: 0.7826
AUC: 0.9022
0.902



{'estimator__max_depth': 1, 'learning_rate': 0.1, 'n_estimators': 100}
AUC: 0.9938
0.9938(0.9799-1.0000)
Accuracy: 0.9864
Precision: 1.0000
Recall (Sensitivity): 0.8333
Specificity: 1.0000
F1 Score: 0.9091
AUC: 0.9550
0.9550(0.8934-1.0000)
Accuracy: 0.9412
Precision: 0.9444
Recall (Sensitivity): 0.8095
Specificity: 0.9844
F1 Score: 0.8718
**************************************************
3
Fitting 10 folds for each of 972 candidates, totalling 9720 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 5, 'subsample': 1.0}
AUC: 0.9589
0.9589(0.9063-0.9975)
Accuracy: 0.9441
Precision: 0.7222
Recall (Sensitivity): 0.8125
Specificity: 0.9606
F1 Score: 0.7647
AUC: 0.8973
0.8973(0.8114-0.9699)
Accuracy: 0.8500
Precision: 0.9524
Recall (Sensitivity): 0.7143
Specificity: 0.9688
F1 Score: 0.8163
**************************************************
3
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
{'bootstrap': Tr



AUC: 0.9464
0.9464(0.9057-0.9800)
Accuracy: 0.9091
Precision: 0.6667
Recall (Sensitivity): 0.3750
Specificity: 0.9764
F1 Score: 0.4800




AUC: 0.8527
0.8527(0.7333-0.9433)
Accuracy: 0.8000
Precision: 1.0000
Recall (Sensitivity): 0.5714
Specificity: 1.0000
F1 Score: 0.7273
**************************************************
3
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'border_count': 100, 'depth': 3, 'iterations': 100, 'l2_leaf_reg': 3, 'learning_rate': 0.01, 'subsample': 0.8}
AUC: 0.9838
0.9838(0.9621-0.9987)
Accuracy: 0.9441
Precision: 0.7222
Recall (Sensitivity): 0.8125
Specificity: 0.9606
F1 Score: 0.7647
AUC: 0.8951
0.8951(0.8036-0.9725)
Accuracy: 0.8333
Precision: 0.8750
Recall (Sensitivity): 0.7500
Specificity: 0.9062
F1 Score: 0.8077
**************************************************
3
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.01, 'max_depth': 3, 'max_features': 0.8, 'n_estimators': 50, 'subsample': 0.8}
AUC: 0.9756
0.9756(0.9446-0.9982)
Accuracy: 0.9510
Precision: 0.8000
Recall (Sensitivity): 0.7500
Specificity: 0.9764
F1 Score: 0.7742
AUC: 0.9079
0.



{'estimator__max_depth': 1, 'learning_rate': 0.1, 'n_estimators': 50}
AUC: 0.9796
0.9796(0.9518-0.9990)
Accuracy: 0.9510
Precision: 1.0000
Recall (Sensitivity): 0.5625
Specificity: 1.0000
F1 Score: 0.7200
AUC: 0.8929
0.8929(0.8042-0.9717)
Accuracy: 0.8333
Precision: 1.0000
Recall (Sensitivity): 0.6429
Specificity: 1.0000
F1 Score: 0.7826
**************************************************
5
Fitting 10 folds for each of 972 candidates, totalling 9720 fits
{'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 1.0}
AUC: 0.9100
0.9100(0.8123-0.9765)
Accuracy: 0.8632
Precision: 0.5926
Recall (Sensitivity): 0.7619
Specificity: 0.8854
F1 Score: 0.6667
AUC: 0.8890
0.8890(0.7806-0.9722)
Accuracy: 0.8333
Precision: 0.8889
Recall (Sensitivity): 0.8571
Specificity: 0.7857
F1 Score: 0.8727
**************************************************
5
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
{'bootstrap': True,



AUC: 0.9142
0.9142(0.8273-0.9717)
Accuracy: 0.8889
Precision: 0.7500
Recall (Sensitivity): 0.5714
Specificity: 0.9583
F1 Score: 0.6486




AUC: 0.8673
0.8673(0.7477-0.9600)
Accuracy: 0.7857
Precision: 1.0000
Recall (Sensitivity): 0.6786
Specificity: 1.0000
F1 Score: 0.8085
**************************************************
5
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'border_count': 50, 'depth': 3, 'iterations': 50, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'subsample': 0.8}
AUC: 0.9184
0.9184(0.8252-0.9797)
Accuracy: 0.8803
Precision: 0.6296
Recall (Sensitivity): 0.8095
Specificity: 0.8958
F1 Score: 0.7083
AUC: 0.9056
0.9056(0.8070-0.9812)
Accuracy: 0.7857
Precision: 0.8800
Recall (Sensitivity): 0.7857
Specificity: 0.7857
F1 Score: 0.8302
**************************************************
5
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 1.0, 'n_estimators': 50, 'subsample': 0.8}
AUC: 0.9194
0.9194(0.8281-0.9801)
Accuracy: 0.8718
Precision: 0.6071
Recall (Sensitivity): 0.8095
Specificity: 0.8854
F1 Score: 0.6939
AUC: 0.8967
0.8967



{'estimator__max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100}
AUC: 0.9045
0.9045(0.8030-0.9720)
Accuracy: 0.8803
Precision: 0.6842
Recall (Sensitivity): 0.6190
Specificity: 0.9375
F1 Score: 0.6500
AUC: 0.8954
0.8954(0.7860-0.9766)
Accuracy: 0.8095
Precision: 0.9545
Recall (Sensitivity): 0.7500
Specificity: 0.9286
F1 Score: 0.8400
**************************************************


In [11]:
# Turn the accumulated metric records into one table per evaluation cohort.
internal_results_df, external_results_df = (
    pd.DataFrame(records) for records in (internal_results, external_results)
)

In [12]:
# Show the metrics measured on the internal test split (one row per model and year).
internal_results_df

Unnamed: 0,AUC,Accuracy,Precision,Recall,Specificity,F1,Model,Year
0,0.9944(0.9786-1.0000),0.9932,1.0,0.9167,1.0,0.9565,xgb,1
1,0.9914(0.9715-1.0000),0.966,0.7333,0.9167,0.9704,0.8148,rf,1
2,0.9889(0.9667-1.0000),0.898,0.44,0.9167,0.8963,0.5946,svm,1
3,0.9827(0.9593-0.9984),0.9252,0.6667,0.1667,0.9926,0.2667,knn,1
4,0.9821(0.9527-0.9987),0.9592,0.875,0.5833,0.9926,0.7,ann,1
5,0.9944(0.9813-1.0000),0.9728,0.8333,0.8333,0.9852,0.8333,catboost,1
6,0.9864(0.9613-1.0000),0.966,0.8182,0.75,0.9852,0.7826,gbm,1
7,0.9938(0.9799-1.0000),0.9864,1.0,0.8333,1.0,0.9091,adaboost,1
8,0.9589(0.9063-0.9975),0.9441,0.7222,0.8125,0.9606,0.7647,xgb,3
9,0.9783(0.9423-0.9993),0.958,0.8125,0.8125,0.9764,0.8125,rf,3


In [13]:
# Show the metrics measured on the external cohort (one row per model and year).
external_results_df

Unnamed: 0,AUC,Accuracy,Precision,Recall,Specificity,F1,Model,Year
0,0.9628(0.9040-1.0000),0.9412,0.9,0.8571,0.9688,0.878,xgb,1
1,0.9740(0.9379-1.0000),0.9294,0.8947,0.8095,0.9688,0.85,rf,1
2,0.9844(0.9579-1.0000),0.8941,0.7,1.0,0.8594,0.8235,svm,1
3,0.9375(0.8652-0.9962),0.8706,0.9167,0.5238,0.9844,0.6667,knn,1
4,0.8958(0.8083-0.9585),0.8824,0.9231,0.5714,0.9844,0.7059,ann,1
5,0.9673(0.9169-1.0000),0.9412,0.9,0.8571,0.9688,0.878,catboost,1
6,0.9022(0.7789-0.9966),0.9529,0.9474,0.8571,0.9844,0.9,gbm,1
7,0.9550(0.8934-1.0000),0.9412,0.9444,0.8095,0.9844,0.8718,adaboost,1
8,0.8973(0.8114-0.9699),0.85,0.9524,0.7143,0.9688,0.8163,xgb,3
9,0.9051(0.8203-0.9726),0.85,0.9524,0.7143,0.9688,0.8163,rf,3


In [14]:
# Write both metric tables to Excel in the working directory, without the row index.
for results_df, xlsx_name in (
    (internal_results_df, "internal_results_df.xlsx"),
    (external_results_df, "external_results_df.xlsx"),
):
    results_df.to_excel(xlsx_name, index=False)

### 筛选后特征

In [4]:
# Reduced feature set used in the "selected features" section, including one gene column.
selected_cols = [
    'PAX2',  # genetic variable
    'age_first_diagnose',
    'behavioral_cognitive_abnormalities (1/0)',
    'cakut_subphenotype',
    'ckd_stage_first_diagnose',
    'congenital_heart_disease (1/0)',
    'family_history (1/0)',
    'gender (1/0)',
    'ocular (1/0)',
    'prenatal_phenotype (1/0)',
    'preterm_birth (1/0)',
    'short_stature (1/0)',
]

# Outcome columns appended to the feature list before it is passed to load_dataset.
outcome_cols = [f'esrd_{horizon}y' for horizon in (1, 3, 5)]

In [5]:
# Accumulators for the selected-feature run: internal test split and external cohort.
selected_internal_results, selected_external_results = [], []

In [6]:
# Train every model in `model_dict` on the reduced feature set
# (`selected_cols + outcome_cols`) for each outcome year, persist the fitted model,
# and record metrics on the internal test split and on the external cohort.
output_dir = './output/selected_cols'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for year in [1, 3, 5]:
    # The data and the split depend only on `year`, so load and split once per year
    # instead of once per model. With the fixed random_state every model already
    # received the same split, so results are unchanged.
    # NOTE(review): assumes the train_* helpers and eval_model do not modify their
    # inputs in place — confirm.
    internal_X, internal_y = load_dataset(internal_dataset, year, selected_cols + outcome_cols)
    external_X, external_y = load_dataset(external_dataset, year, selected_cols + outcome_cols)
    X_train, X_test, y_train, y_test = train_test_split(
        internal_X, internal_y, test_size=0.3, random_state=42, stratify=internal_y
    )

    for model_name, train_methods in model_dict.items():
        print(year)
        if model_name == 'gbdt':
            # This trainer additionally receives the positional indices of all
            # columns that are not listed in numerical_categories.
            categorical_features = [
                i for i, col in enumerate(X_train.columns)
                if col not in numerical_categories
            ]
            best_model = train_methods(X_train, y_train, categorical_features)
        else:
            best_model = train_methods(X_train, y_train)
        joblib.dump(best_model, f'{output_dir}/{model_name}_{year}yr.pkl')

        internal_metrics = eval_model(best_model, X_test, y_test)
        external_metrics = eval_model(best_model, external_X, external_y)
        # Tag both records so the rows can be told apart in the results tables.
        for metrics in (internal_metrics, external_metrics):
            metrics['Model'] = model_name
            metrics['Year'] = year
        selected_internal_results.append(internal_metrics)
        selected_external_results.append(external_metrics)
        print("*"*50)

NameError: name 'model_dict' is not defined

In [10]:
# Tabulate the selected-feature metric records, then export both tables to Excel
# (working directory, no row index).
selected_internal_results_df, selected_external_results_df = (
    pd.DataFrame(records)
    for records in (selected_internal_results, selected_external_results)
)

for results_df, xlsx_name in (
    (selected_internal_results_df, "selected_internal_results_df2.xlsx"),
    (selected_external_results_df, "selected_external_results_df2.xlsx"),
):
    results_df.to_excel(xlsx_name, index=False)

NameError: name 'selected_internal_results' is not defined

### 筛选后特征（无基因版）

In [27]:
# Accumulators for the selected-feature run without the gene column:
# internal test split and external cohort.
selected_nogene_internal_results, selected_nogene_external_results = [], []

In [28]:
# Reduced feature set for the no-gene variant.
# Note: this rebinds `selected_cols`, replacing the with-gene list defined in the
# previous section; re-run that section's definition cell before re-running its loop.
selected_cols = [
    # 'PAX2' (the genetic variable) is deliberately left out in this variant.
    'age_first_diagnose',
    'behavioral_cognitive_abnormalities (1/0)',
    'cakut_subphenotype',
    'ckd_stage_first_diagnose',
    'congenital_heart_disease (1/0)',
    'family_history (1/0)',
    'gender (1/0)',
    'ocular (1/0)',
    'prenatal_phenotype (1/0)',
    'preterm_birth (1/0)',
    'short_stature (1/0)',
]

# Outcome columns appended to the feature list before it is passed to load_dataset.
outcome_cols = [f'esrd_{horizon}y' for horizon in (1, 3, 5)]

In [29]:
# Train every model in `model_dict` on the no-gene feature set
# (`selected_cols + outcome_cols`) for each outcome year, persist the fitted model,
# and record metrics on the internal test split and on the external cohort.
output_dir = './output/no_gene'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for year in [1, 3, 5]:
    # The data and the split depend only on `year`, so load and split once per year
    # instead of once per model. With the fixed random_state every model already
    # received the same split, so results are unchanged.
    # NOTE(review): assumes the train_* helpers and eval_model do not modify their
    # inputs in place — confirm.
    internal_X, internal_y = load_dataset(internal_dataset, year, selected_cols + outcome_cols)
    external_X, external_y = load_dataset(external_dataset, year, selected_cols + outcome_cols)
    X_train, X_test, y_train, y_test = train_test_split(
        internal_X, internal_y, test_size=0.3, random_state=42, stratify=internal_y
    )

    for model_name, train_methods in model_dict.items():
        print(year)
        if model_name == 'gbdt':
            # This trainer additionally receives the positional indices of all
            # columns that are not listed in numerical_categories.
            categorical_features = [
                i for i, col in enumerate(X_train.columns)
                if col not in numerical_categories
            ]
            best_model = train_methods(X_train, y_train, categorical_features)
        else:
            best_model = train_methods(X_train, y_train)
        joblib.dump(best_model, f'{output_dir}/{model_name}_{year}yr.pkl')

        internal_metrics = eval_model(best_model, X_test, y_test)
        external_metrics = eval_model(best_model, external_X, external_y)
        # Tag both records so the rows can be told apart in the results tables.
        for metrics in (internal_metrics, external_metrics):
            metrics['Model'] = model_name
            metrics['Year'] = year
        selected_nogene_internal_results.append(internal_metrics)
        selected_nogene_external_results.append(external_metrics)
        print("*"*50)

1
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'border_count': 50, 'depth': 3, 'iterations': 50, 'l2_leaf_reg': 3, 'learning_rate': 0.01, 'subsample': 1.0}
AUC: 0.9920
0.9920(0.9745-1.0000)
Accuracy: 0.9660
Precision: 0.7333
Recall (Sensitivity): 0.9167
Specificity: 0.9704
F1 Score: 0.8148
AUC: 0.9695
0.9695(0.9269-0.9965)
Accuracy: 0.9412
Precision: 0.9000
Recall (Sensitivity): 0.8571
Specificity: 0.9688
F1 Score: 0.8780
**************************************************
1
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.1, 'max_depth': 7, 'max_features': 0.8, 'n_estimators': 50, 'subsample': 0.8}
AUC: 0.9907
0.9907(0.9694-1.0000)
Accuracy: 0.9796
Precision: 0.9091
Recall (Sensitivity): 0.8333
Specificity: 0.9926
F1 Score: 0.8696
AUC: 0.9435
0.9435(0.8460-0.9985)
Accuracy: 0.9412
Precision: 0.9444
Recall (Sensitivity): 0.8095
Specificity: 0.9844
F1 Score: 0.8718
**************************************************
1
Fitting 10 fold



{'estimator__max_depth': 1, 'learning_rate': 0.1, 'n_estimators': 100}
AUC: 0.9938
0.9938(0.9799-1.0000)
Accuracy: 0.9864
Precision: 1.0000
Recall (Sensitivity): 0.8333
Specificity: 1.0000
F1 Score: 0.9091
AUC: 0.9568
0.9568(0.8974-1.0000)
Accuracy: 0.9529
Precision: 1.0000
Recall (Sensitivity): 0.8095
Specificity: 1.0000
F1 Score: 0.8947
**************************************************
3
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'border_count': 50, 'depth': 3, 'iterations': 100, 'l2_leaf_reg': 10, 'learning_rate': 0.2, 'subsample': 1.0}
AUC: 0.9616
0.9616(0.9154-0.9945)
Accuracy: 0.9301
Precision: 0.6500
Recall (Sensitivity): 0.8125
Specificity: 0.9449
F1 Score: 0.7222
AUC: 0.9208
0.9208(0.8495-0.9789)
Accuracy: 0.8500
Precision: 0.8800
Recall (Sensitivity): 0.7857
Specificity: 0.9062
F1 Score: 0.8302
**************************************************
3
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.01, 'max_depth': 3, 'ma



{'estimator__max_depth': 1, 'learning_rate': 0.1, 'n_estimators': 50}
AUC: 0.9754
0.9754(0.9446-0.9978)
Accuracy: 0.9510
Precision: 1.0000
Recall (Sensitivity): 0.5625
Specificity: 1.0000
F1 Score: 0.7200
AUC: 0.8929
0.8929(0.7977-0.9736)
Accuracy: 0.8333
Precision: 1.0000
Recall (Sensitivity): 0.6429
Specificity: 1.0000
F1 Score: 0.7826
**************************************************
5
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'border_count': 100, 'depth': 3, 'iterations': 100, 'l2_leaf_reg': 10, 'learning_rate': 0.1, 'subsample': 0.8}
AUC: 0.9288
0.9288(0.8469-0.9831)
Accuracy: 0.8718
Precision: 0.6071
Recall (Sensitivity): 0.8095
Specificity: 0.8854
F1 Score: 0.6939
AUC: 0.9133
0.9133(0.8197-0.9852)
Accuracy: 0.8095
Precision: 0.8846
Recall (Sensitivity): 0.8214
Specificity: 0.7857
F1 Score: 0.8519
**************************************************
5
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.1, 'max_depth': 3, 'max



{'estimator__max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 100}
AUC: 0.9107
0.9107(0.8302-0.9686)
Accuracy: 0.8718
Precision: 0.6500
Recall (Sensitivity): 0.6190
Specificity: 0.9271
F1 Score: 0.6341
AUC: 0.9107
0.9107(0.7988-0.9963)
Accuracy: 0.8571
Precision: 0.9583
Recall (Sensitivity): 0.8214
Specificity: 0.9286
F1 Score: 0.8846
**************************************************


In [30]:
# Tabulate the no-gene metric records, then export both tables to Excel
# (working directory, no row index).
selected_nogene_internal_results_df, selected_nogene_external_results_df = (
    pd.DataFrame(records)
    for records in (selected_nogene_internal_results, selected_nogene_external_results)
)

for results_df, xlsx_name in (
    (selected_nogene_internal_results_df, "selected_nogene_internal_results_df2.xlsx"),
    (selected_nogene_external_results_df, "selected_nogene_external_results_df2.xlsx"),
):
    results_df.to_excel(xlsx_name, index=False)

## 敏感性分析：总基因+总肾外

In [1]:
# CSV inputs for the sensitivity analysis (the "特征3" feature files): internal cohort
# and external cohort, loaded with the `cols3` column list.
integrated_internal_dataset = './dataset/复旦儿科_135年_特征3.csv'
integrated_external_dataset = './dataset/外院_135年_特征3.csv'

In [5]:
# Column list for the sensitivity analysis. Compared with `cols`, the individual gene
# and extrarenal columns are absent and the single columns 'gene_trioplp (1/0)' and
# 'extrarenal_anomalies (1/0)' appear instead (presumably aggregates — confirm against
# the dataset). Predictors first, then the three outcome columns.
cols3 = (
    [
        'gender (1/0)',
        'family_history (1/0)',
        'preterm_birth (1/0)',
        'prenatal_phenotype (1/0)',
        'cakut_subphenotype',
        'gene_trioplp (1/0)',
        'age_first_diagnose',
        'ckd_stage_first_diagnose',
        'short_stature (1/0)',
        'hyperuricemia(1/0)',
        'extrarenal_anomalies (1/0)',
    ]
    + ['esrd_1y', 'esrd_3y', 'esrd_5y']
)

In [6]:
# Reset the metric accumulators for the sensitivity analysis.
# Note: these names are the same ones the all-features section uses, so any records
# still held from that run are discarded here.
internal_results, external_results = [], []

In [9]:
# Sensitivity analysis: retrain only the 'rf' and 'gbm' entries of `model_dict` on the
# integrated column list (`cols3`) for each outcome year, persist the fitted model, and
# record metrics on the internal test split and on the external cohort.
output_dir = './output/integrated'
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
sensitivity_models = ('rf', 'gbm')

for year in [1, 3, 5]:
    # The data and the split depend only on `year`, so load and split once per year
    # instead of once per model. With the fixed random_state every model already
    # received the same split, so results are unchanged.
    # NOTE(review): assumes the train_* helpers and eval_model do not modify their
    # inputs in place — confirm.
    internal_X, internal_y = load_dataset(integrated_internal_dataset, year, cols3)
    external_X, external_y = load_dataset(integrated_external_dataset, year, cols3)
    X_train, X_test, y_train, y_test = train_test_split(
        internal_X, internal_y, test_size=0.3, random_state=42, stratify=internal_y
    )

    for model_name, train_methods in model_dict.items():
        if model_name not in sensitivity_models:
            continue

        print(year)
        best_model = train_methods(X_train, y_train)
        joblib.dump(best_model, f'{output_dir}/{model_name}_{year}yr.pkl')

        internal_metrics = eval_model(best_model, X_test, y_test)
        external_metrics = eval_model(best_model, external_X, external_y)
        # Tag both records so the rows can be told apart in the results tables.
        for metrics in (internal_metrics, external_metrics):
            metrics['Model'] = model_name
            metrics['Year'] = year
        internal_results.append(internal_metrics)
        external_results.append(external_metrics)
        print("*"*50)

1
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
**************************************************
1
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.01, 'max_depth': 7, 'max_features': 0.8, 'n_estimators': 100, 'subsample': 0.8}
**************************************************
3
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
{'bootstrap': False, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
**************************************************
3
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'learning_rate': 0.01, 'max_depth': 3, 'max_features': 0.8, 'n_estimators': 50, 'subsample': 0.8}
**************************************************
5
Fitting 10 folds for each of 216 candidates, totalling 2160 fits
{'b

In [11]:
# Tabulate the sensitivity-analysis metric records, then export both tables to Excel
# (working directory, no row index).
integrated_internal_results_df, integrated_external_results_df = (
    pd.DataFrame(records) for records in (internal_results, external_results)
)

for results_df, xlsx_name in (
    (integrated_internal_results_df, "integrated_internal_results_df.xlsx"),
    (integrated_external_results_df, "integrated_external_results_df.xlsx"),
):
    results_df.to_excel(xlsx_name, index=False)