In [23]:
import pandas as pd
import numpy as np
import pickle
from IPython.display import display
import optuna
from tqdm.notebook import tqdm

from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras import callbacks

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## CFG

In [6]:
class CFG:
    # Notebook-wide settings, read as class attributes (CFG.n_folds, CFG.models_dir, ...).

    # Switches both the models directory and the number of training epochs below.
    debug = True

    # Input locations.
    input_dir = "../input/efficientnet_output_straight/"
    dataset_dir = "../input/ranzcr-clip-catheter-line-classification/"
    if debug:
        models_dir = "./models/"
    else:
        models_dir = "../input/efficientnet-lightgbm-models/"

    # Cross-validation / training settings.
    n_folds = 4
    num_features = 100
    if debug:
        epochs = 20
    else:
        epochs = 60

    # Label columns of train.csv, in the order used for every per-target loop.
    target_cols = [
        'ETT - Abnormal',
        'ETT - Borderline',
        'ETT - Normal',
        'NGT - Abnormal',
        'NGT - Borderline',
        'NGT - Incompletely Imaged',
        'NGT - Normal',
        'CVC - Abnormal',
        'CVC - Borderline',
        'CVC - Normal',
        'Swan Ganz Catheter Present',
    ]

In [7]:
# Read the competition labels and work on a copy that additionally carries a "fold" column.
train = pd.read_csv(CFG.dataset_dir + "train.csv")
folds = train.copy()

# Grouped K-fold keyed on PatientID: all rows of one patient land in the same fold.
group_kfold = GroupKFold(n_splits=CFG.n_folds)
patient_groups = folds["PatientID"].values

for n, (train_idx, val_idx) in enumerate(group_kfold.split(folds, groups=patient_groups)):
    # Each row is tagged with the number of the split in which it is a validation row.
    folds.loc[val_idx, "fold"] = n

# The column is created by partial assignment, so it is cast back to integers afterwards.
folds["fold"] = folds["fold"].astype(int)
display(folds)

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present,PatientID,fold
0,1.2.826.0.1.3680043.8.498.26697628953273228189...,0,0,0,0,0,0,1,0,0,0,0,ec89415d1,2
1,1.2.826.0.1.3680043.8.498.46302891597398758759...,0,0,1,0,0,1,0,0,0,1,0,bf4c6da3c,2
2,1.2.826.0.1.3680043.8.498.23819260719748494858...,0,0,0,0,0,0,0,0,1,0,0,3fc1c97e5,0
3,1.2.826.0.1.3680043.8.498.68286643202323212801...,0,0,0,0,0,0,0,1,0,0,0,c31019814,0
4,1.2.826.0.1.3680043.8.498.10050203009225938259...,0,0,0,0,0,0,0,0,0,1,0,207685cd1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.2.826.0.1.3680043.8.498.74257566841157531124...,0,0,1,0,0,0,0,0,1,1,0,5b5b9ac30,2
30079,1.2.826.0.1.3680043.8.498.46510939987173529969...,0,0,0,0,0,0,0,0,0,1,0,7192404d8,2
30080,1.2.826.0.1.3680043.8.498.43173270582850645437...,0,0,1,0,0,1,0,1,0,1,0,d4d1b066d,3
30081,1.2.826.0.1.3680043.8.498.95092491950130838685...,0,0,0,0,0,0,0,0,1,0,0,01a6602b8,1


## LightGBMによる予測値の準備

In [8]:
# Load the normalized EfficientNet output table from the models directory.
# Per the displayed frame, the columns are StudyInstanceUID, PatientID, then the
# numbered feature columns 0..2559.
input_df=pd.read_csv(f"{CFG.models_dir}efficientnet_output_normalized.csv")
display(input_df)

Unnamed: 0,StudyInstanceUID,PatientID,0,1,2,3,4,5,6,7,...,2550,2551,2552,2553,2554,2555,2556,2557,2558,2559
0,1.2.826.0.1.3680043.8.498.26697628953273228189...,ec89415d1,0.169359,0.324016,0.173553,0.095382,0.063119,0.165738,0.181906,0.437814,...,0.302277,0.070963,0.354162,0.284205,0.057408,0.101549,0.200983,0.098618,0.276763,0.473490
1,1.2.826.0.1.3680043.8.498.46302891597398758759...,bf4c6da3c,0.365546,0.175262,0.286057,0.205560,0.388747,0.452293,0.021495,0.460131,...,0.232927,0.223938,0.135535,0.042117,0.169312,0.383767,0.095033,0.151809,0.048868,0.117311
2,1.2.826.0.1.3680043.8.498.23819260719748494858...,3fc1c97e5,0.211337,0.303350,0.184402,0.248883,0.202594,0.301177,0.208046,0.416033,...,0.370746,0.315548,0.384991,0.181793,0.134341,0.147335,0.313144,0.356484,0.112562,0.404958
3,1.2.826.0.1.3680043.8.498.68286643202323212801...,c31019814,0.520994,0.385111,0.367907,0.247489,0.097945,0.448468,0.377076,0.108094,...,0.355882,0.156018,0.316916,0.141020,0.041345,0.297808,0.273713,0.329452,0.203884,0.219169
4,1.2.826.0.1.3680043.8.498.10050203009225938259...,207685cd1,0.383775,0.214603,0.263212,0.429333,0.619226,0.365313,0.264323,0.201577,...,0.302169,0.318546,0.097627,0.337127,0.124034,0.266738,0.595066,0.150074,0.454537,0.071115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.2.826.0.1.3680043.8.498.74257566841157531124...,5b5b9ac30,0.152407,0.273989,0.429204,0.095627,0.143194,0.151159,0.190642,0.151711,...,0.417215,0.330231,0.257251,0.193722,0.070763,0.415085,0.388353,0.156808,0.239679,0.161568
30079,1.2.826.0.1.3680043.8.498.46510939987173529969...,7192404d8,0.303968,0.299640,0.232293,0.115276,0.309395,0.338309,0.164800,0.221858,...,0.509008,0.143263,0.214838,0.167794,0.075009,0.163606,0.415589,0.059742,0.148673,0.164074
30080,1.2.826.0.1.3680043.8.498.43173270582850645437...,d4d1b066d,0.316939,0.351076,0.589629,0.291154,0.185748,0.232997,0.113028,0.353113,...,0.542412,0.236196,0.170918,0.283692,0.270579,0.275420,0.285688,0.203561,0.118533,0.222272
30081,1.2.826.0.1.3680043.8.498.95092491950130838685...,01a6602b8,0.239194,0.393650,0.438603,0.363903,0.144577,0.259132,0.366278,0.193645,...,0.387640,0.075916,0.316283,0.119015,0.101108,0.645644,0.305596,0.319096,0.330187,0.278469


### AutoEncoderによる次元削減 

In [9]:
# Load the trained autoencoder from CFG.models_dir so the debug / non-debug switch in CFG
# applies to it as well (with debug=True this resolves to the same "./models/" path as before).
autoencoder=models.load_model(f"{CFG.models_dir}autoencoder_splits10/")

# Sub-model that maps the autoencoder input to the output of its "dense_1" layer;
# those activations are used as the reduced feature set.
layer_name="dense_1"
hidden_layer_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

# The first two columns (StudyInstanceUID, PatientID) are identifiers, the rest are features.
pred=hidden_layer_model.predict(input_df.iloc[:,2:])
# Keep the image id next to the reduced features (column 0 = StudyInstanceUID).
features=pd.concat([input_df["StudyInstanceUID"],pd.DataFrame(pred)],axis=1)

### LightGBMによる推論

In [10]:
# Start from the fold table so StudyInstanceUID / PatientID / fold stay next to the predictions;
# the label columns are overwritten with model outputs below.
lgb_pred=folds.copy()

for i,col_name in enumerate(CFG.target_cols):
    # One pickled model per target; the files are numbered from 1.
    # NOTE: pickle.load can execute arbitrary code -- only load model files you produced yourself.
    with open(f"{CFG.models_dir}autoencoder_smallLR/lgb_model_{i+1}.pickle","rb") as f:
        model=pickle.load(f)

    # Column 0 of `features` is StudyInstanceUID; the remaining columns are the model inputs.
    pred=model.predict(features.iloc[:,1:])

    # Replace the whole column rather than writing through .loc[:, col]: the existing column
    # holds integer labels, and in-place assignment of floats into it is deprecated in pandas.
    lgb_pred[col_name]=pred

display(lgb_pred)

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present,PatientID,fold
0,1.2.826.0.1.3680043.8.498.26697628953273228189...,2.851887e-06,0.014421,0.257733,0.007643,0.020949,0.003159,0.854033,0.090729,0.284922,0.473831,0.000033,ec89415d1,2
1,1.2.826.0.1.3680043.8.498.46302891597398758759...,6.990330e-05,0.079135,0.734699,0.016351,0.021119,0.960592,0.229085,0.063781,0.358774,0.788177,0.000065,bf4c6da3c,2
2,1.2.826.0.1.3680043.8.498.23819260719748494858...,4.841511e-06,0.003495,0.051492,0.003134,0.013448,0.000845,0.026849,0.070780,0.357099,0.720060,0.000022,3fc1c97e5,0
3,1.2.826.0.1.3680043.8.498.68286643202323212801...,5.313002e-07,0.002256,0.003886,0.002061,0.004724,0.000523,0.003150,0.260812,0.236471,0.612888,0.000008,c31019814,0
4,1.2.826.0.1.3680043.8.498.10050203009225938259...,3.812713e-06,0.018148,0.134492,0.015732,0.015907,0.007980,0.100083,0.059859,0.283469,0.807261,0.000075,207685cd1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.2.826.0.1.3680043.8.498.74257566841157531124...,6.560168e-06,0.029976,0.496119,0.009887,0.008641,0.005662,0.062989,0.059829,0.570539,0.764016,0.000237,5b5b9ac30,2
30079,1.2.826.0.1.3680043.8.498.46510939987173529969...,7.192546e-06,0.022047,0.110283,0.004289,0.007334,0.003377,0.046028,0.060357,0.350258,0.838486,0.000081,7192404d8,2
30080,1.2.826.0.1.3680043.8.498.43173270582850645437...,2.784264e-06,0.031739,0.556671,0.004088,0.032469,0.948585,0.088642,0.544563,0.287543,0.790466,0.000191,d4d1b066d,3
30081,1.2.826.0.1.3680043.8.498.95092491950130838685...,5.331973e-07,0.000567,0.004966,0.001784,0.002526,0.000255,0.001768,0.046502,0.191689,0.586526,0.000008,01a6602b8,1


## NNによる予測値の準備

In [11]:
# Load the (non-normalized) EfficientNet output table; this rebinds `input_df`.
# Per the displayed frame, the columns are StudyInstanceUID followed by the numbered
# feature columns 0..2559 (no PatientID column in this file).
input_df=pd.read_csv(f"{CFG.models_dir}efficientnet_output.csv")
display(input_df)

Unnamed: 0,StudyInstanceUID,0,1,2,3,4,5,6,7,8,...,2550,2551,2552,2553,2554,2555,2556,2557,2558,2559
0,1.2.826.0.1.3680043.8.498.26697628953273228189...,-0.112231,0.245513,-0.033396,-0.191451,-0.199094,-0.004325,-0.083669,0.191719,-0.123753,...,0.190715,-0.178776,0.320377,-0.041465,-0.231193,-0.159202,0.118353,-0.183021,0.112336,0.497603
1,1.2.826.0.1.3680043.8.498.46302891597398758759...,0.067357,0.011641,0.100913,-0.112439,0.140193,0.442245,-0.244318,0.214603,0.095342,...,0.089758,-0.004185,-0.029220,-0.230071,-0.154650,0.121530,-0.066817,-0.139586,-0.190869,-0.071105
2,1.2.826.0.1.3680043.8.498.23819260719748494858...,-0.073804,0.213021,-0.020444,-0.081371,-0.053769,0.206744,-0.057491,0.169386,0.163935,...,0.290390,0.100370,0.369674,-0.121252,-0.178570,-0.113657,0.314377,0.027548,-0.106127,0.388178
3,1.2.826.0.1.3680043.8.498.68286643202323212801...,0.209653,0.341568,0.198626,-0.082370,-0.162807,0.436283,0.111789,-0.146370,0.390531,...,0.268752,-0.081703,0.260819,-0.153018,-0.242180,0.036024,0.245464,0.005475,0.015372,0.091531
4,1.2.826.0.1.3680043.8.498.10050203009225938259...,0.084045,0.073493,0.073640,0.048036,0.380339,0.306694,-0.001131,-0.050514,0.304453,...,0.190559,0.103791,-0.089837,-0.000235,-0.185620,0.005118,0.807094,-0.141003,0.348858,-0.144865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.2.826.0.1.3680043.8.498.74257566841157531124...,-0.127748,0.166860,0.271803,-0.191275,-0.115660,-0.027046,-0.074921,-0.101646,0.231501,...,0.358037,0.117128,0.165411,-0.111959,-0.222058,0.152683,0.445822,-0.135504,0.062997,-0.000440
30079,1.2.826.0.1.3680043.8.498.46510939987173529969...,0.010990,0.207189,0.036729,-0.177184,0.057512,0.264610,-0.100801,-0.029718,0.151412,...,0.491667,-0.096260,0.097591,-0.132159,-0.219154,-0.097472,0.493421,-0.214767,-0.058083,0.003562
30080,1.2.826.0.1.3680043.8.498.43173270582850645437...,0.022863,0.288057,0.463321,-0.051057,-0.071321,0.100492,-0.152650,0.104869,0.151835,...,0.540294,0.009805,0.027360,-0.041865,-0.085383,0.013754,0.266392,-0.097326,-0.098183,0.096486
30081,1.2.826.0.1.3680043.8.498.95092491950130838685...,-0.048304,0.354992,0.283024,0.001114,-0.114220,0.141220,0.100976,-0.058647,0.230701,...,0.314984,-0.173124,0.259807,-0.170161,-0.201301,0.382029,0.301187,-0.002982,0.183414,0.186215


### Dense層による推論

In [12]:
# Restore the dense classification head in two steps: the architecture from its JSON
# description, then the trained weights from the checkpoint.
with open(f"{CFG.models_dir}eff_dense/model_structure","rt") as f:
    model_json_str=f.read()

dense_model=models.model_from_json(model_json_str)
dense_model.load_weights(f"{CFG.models_dir}eff_dense/checkpoint")
# Compile with binary cross-entropy and a multi-label AUC metric, then print the layer summary.
dense_model.compile(optimizer="adam",loss="binary_crossentropy",metrics=[keras.metrics.AUC(multi_label=True)])
dense_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 2560)              0         
_________________________________________________________________
dense (Dense)                (None, 11)                28171     
Total params: 28,171
Trainable params: 28,171
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Predict all targets with the dense head. Column 0 of input_df (StudyInstanceUID) is
# skipped; the model outputs are labelled with the target names in CFG order.
dense_pred=pd.DataFrame(dense_model.predict(input_df.iloc[:,1:]),columns=CFG.target_cols)
display(dense_pred)

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
0,0.004059,0.021637,0.385945,0.003532,0.025626,0.043735,0.319992,0.091947,0.262037,0.622860,0.017982
1,0.089757,0.301640,0.845686,0.053442,0.037303,0.485664,0.692929,0.144457,0.406314,0.576784,0.176354
2,0.000624,0.006306,0.023352,0.003461,0.002433,0.014470,0.012772,0.090007,0.371517,0.619742,0.002166
3,0.000147,0.000722,0.000731,0.000574,0.000388,0.003289,0.000765,0.165605,0.168221,0.626359,0.000445
4,0.004794,0.050021,0.352987,0.024274,0.021685,0.176899,0.425405,0.052734,0.280874,0.780879,0.005260
...,...,...,...,...,...,...,...,...,...,...,...
30078,0.015975,0.062557,0.365578,0.002093,0.010424,0.172909,0.103522,0.082312,0.415560,0.655318,0.051722
30079,0.002924,0.032515,0.264873,0.002657,0.008054,0.190094,0.063001,0.061255,0.295826,0.710416,0.060661
30080,0.019645,0.063237,0.545217,0.010459,0.043144,0.109558,0.402396,0.258593,0.379614,0.575094,0.019723
30081,0.000312,0.000754,0.007716,0.000451,0.001699,0.006645,0.002124,0.063714,0.107356,0.720578,0.002360


## アンサンブルさせる

NNではすぐに過学習してauc=1.00となり学習が停滞する

In [14]:
scaler = StandardScaler()

# To persist a fitted scaler, dump it right after fitting on the respective frame, e.g.
# pickle.dump(scaler.fit(lgb_pred[CFG.target_cols]),open(f"{CFG.models_dir}standardscaler_lgb.pickle","wb"))
# pickle.dump(scaler.fit(dense_pred[CFG.target_cols]),open(f"{CFG.models_dir}standardscaler_dense.pickle","wb"))


def _standardize_with_fold(pred_df):
    """Standardize the target columns of `pred_df` and append the fold column.

    The shared `scaler` is re-fit on every call, so after the call it holds the
    statistics of the frame passed last.
    """
    scaled = pd.DataFrame(
        scaler.fit_transform(pred_df[CFG.target_cols]),
        columns=CFG.target_cols,
    )
    return pd.concat([scaled, folds["fold"]], axis=1)


# LightGBM predictions first, dense predictions second (the scaler ends up fit on the latter).
lgb_pred_norm = _standardize_with_fold(lgb_pred)
dense_pred_norm = _standardize_with_fold(dense_pred)

In [15]:
# Hold out one fold for validation; every other fold forms the training split.
fold = 0
in_val_fold = folds["fold"] == fold

# The same boolean mask slices both prediction tables and the label table.
train_lgb, val_lgb = lgb_pred_norm[~in_val_fold], lgb_pred_norm[in_val_fold]
train_dense, val_dense = dense_pred_norm[~in_val_fold], dense_pred_norm[in_val_fold]
train_target, val_target = train[~in_val_fold], train[in_val_fold]

In [28]:
def create_model():
    """Build and compile a one-unit sigmoid model over two inputs.

    Returns:
        A compiled ``keras.Sequential`` consisting of a single ``Dense(1)`` layer,
        trained with Adam (learning rate 1e-3) on binary cross-entropy and
        reporting an AUC metric named ``"auc"``.
    """
    output_layer = keras.layers.Dense(1, input_shape=(2,), activation="sigmoid")
    ensemble_nn = keras.Sequential([output_layer])
    ensemble_nn.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="auc")],
    )
    return ensemble_nn

# Halve the learning rate (down to 1e-6) when validation AUC has not improved for 5 epochs.
lr_reducer=callbacks.ReduceLROnPlateau(monitor="val_auc",patience=5,verbose=1,mode="max",min_lr=1e-6,factor=0.5)

# Train one small stacking network per target on the held-out-fold split defined above.
for col_name in CFG.target_cols:
    # Two input features per sample: the LightGBM and the dense-head prediction for this target.
    train_input=pd.concat([train_lgb[col_name],train_dense[col_name]],axis=1)
    val_input=pd.concat([val_lgb[col_name],val_dense[col_name]],axis=1)
    # Fresh model per target so no weights carry over between targets.
    model=create_model()
    model.fit(x=train_input,y=train_target[col_name],epochs=CFG.epochs,
        validation_data=(val_input,val_target[col_name]),verbose=1,callbacks=[lr_reducer])
    # evaluate() returns [loss, auc]; index 1 is the validation AUC.
    score=model.evaluate(x=val_input,y=val_target[col_name],verbose=0)
    print(col_name,score[1])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 19/20
Epoch 20/20
ETT - Abnormal 0.4998667240142822
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
ETT - Borderline 0.64264

KeyboardInterrupt: 

In [13]:
def objective(trial):
    """Optuna objective: mean validation AUC of per-target logistic-regression stackers.

    For every target column a LogisticRegression is fit on two features -- the
    LightGBM and the dense-head prediction for that target -- taken from the
    notebook-level training split (``train_lgb``, ``train_dense``, ``train_target``)
    and scored with ROC-AUC on the validation split (``val_lgb``, ``val_dense``,
    ``val_target``).

    Args:
        trial: Optuna trial used to sample ``logi_c``, the ``C`` parameter of
            LogisticRegression, log-uniformly from the integers 1..10**10.

    Returns:
        float: the AUC averaged over ``CFG.target_cols`` (higher is better).
    """
    # suggest_int takes integer bounds; the former upper bound 1e10 was a float literal.
    logi_c=trial.suggest_int("logi_c",1,10**10,log=True)
    clf=LogisticRegression(random_state=0,C=logi_c)

    scores=[]
    for col_name in CFG.target_cols:
        train_input=pd.concat([train_lgb[col_name],train_dense[col_name]],axis=1)
        val_input=pd.concat([val_lgb[col_name],val_dense[col_name]],axis=1)

        # The same estimator object is re-fit for every target.
        clf.fit(train_input,train_target[col_name])

        ensemble_pred=clf.predict_proba(val_input)[:,1]
        scores.append(roc_auc_score(val_target[col_name],ensemble_pred))

    # Return a plain float; the objective previously returned a one-element pandas Series.
    return float(np.mean(scores))


# Disabled study run, kept for reference. The objective is an AUC, so the study has to
# maximize it (create_study() minimizes by default).
"""
study=optuna.create_study(direction="maximize")
study.optimize(objective,n_trials=200)
print(study.best_params,study.best_value)
"""

'\nstudy=optuna.create_study()\nstudy.optimize(objective,n_trials=200)\nprint(study.best_params,study.best_value)\n'

### 重み付け和

In [31]:
# Per-fold validation AUC for every target; rows are filled in by the fold loop of this cell.
results=pd.DataFrame(columns=CFG.target_cols)

def get_best_params(train_lgb,train_dense,train_target):
    """Grid-search, per target, the blend weight between the two prediction tables.

    For each column in ``CFG.target_cols`` the blend
    ``train_lgb[col] * w + train_dense[col] * (1 - w)`` is scored with ROC-AUC
    against ``train_target[col]`` for w = 0.00, 0.01, ..., 1.00.

    Args:
        train_lgb: table holding the first set of predictions, one column per target.
        train_dense: table holding the second set of predictions, one column per target.
        train_target: table holding the true labels, one column per target.

    Returns:
        dict mapping each target name to the weight ``w`` with the highest AUC.
        Ties keep the smallest such weight; if no score exceeds 0 the weight is 0.
    """
    best_params = {}

    for col_name in CFG.target_cols:
        lgb_col = train_lgb[col_name]
        dense_col = train_dense[col_name]
        labels = train_target[col_name]

        best_weight, best_score = 0, 0
        for step in range(101):
            weight = step * 0.01
            blended = lgb_col * weight + dense_col * (1 - weight)
            auc = roc_auc_score(labels, blended)

            # Strict comparison: only a genuinely better score replaces the current best.
            if auc > best_score:
                best_weight, best_score = weight, auc

        best_params[col_name] = best_weight

    return best_params

# Best blend weight per fold (rows) and target (columns).
oof_best_params=pd.DataFrame(columns=CFG.target_cols)

# NOTE(review): in the recorded output, validation AUC is ~0.99 for folds 1-3 but ~0.77 for
# fold 0 -- presumably the base-model predictions are in-sample for folds 1-3; confirm before
# relying on the weights averaged over all folds.
for n in range(CFG.n_folds):
    # Training split: every fold except n. This rebinds the notebook-level train_* names.
    train_lgb=lgb_pred_norm[folds["fold"]!=n]
    train_dense=dense_pred_norm[folds["fold"]!=n]
    train_target=train[folds["fold"]!=n]

    # Weights are chosen on the training split only.
    best_params=get_best_params(train_lgb,train_dense,train_target)

    for col_name,p in best_params.items():
        oof_best_params.loc[f"fold {n}",col_name]=p

    # Validation split: fold n. This rebinds the notebook-level val_* names.
    val_lgb=lgb_pred_norm[folds["fold"]==n]
    val_dense=dense_pred_norm[folds["fold"]==n]
    val_target=train[folds["fold"]==n]

    for col_name in CFG.target_cols:
        # p weights the LightGBM prediction, (1-p) the dense-head prediction.
        p=best_params[col_name]
        ensemble_pred=val_lgb[col_name]*p+val_dense[col_name]*(1-p)

        score=roc_auc_score(val_target[col_name],ensemble_pred)
        results.loc[f"fold {n}",col_name]=score

# Show per-fold AUCs, their per-fold mean, the chosen weights, and the per-target mean weight.
display(results,results.mean(axis=1),oof_best_params,oof_best_params.mean(axis=0))

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold 0,0.832852,0.818918,0.881725,0.760288,0.756089,0.854987,0.870719,0.615146,0.594002,0.558873,0.882527
fold 1,1.0,1.0,0.995519,0.992711,0.990708,1.0,0.999965,0.999809,0.881025,0.999356,1.0
fold 2,1.0,0.999985,0.995749,0.999462,0.987794,1.0,0.999829,0.999773,0.888506,0.999477,1.0
fold 3,1.0,1.0,0.997561,0.999425,0.986358,1.0,0.999979,0.999493,0.881555,0.999528,1.0


fold 0    0.766011
fold 1    0.987190
fold 2    0.988234
fold 3    0.987627
dtype: float64

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold 0,0.67,1.0,1.0,1.0,1.0,0.65,1.0,1.0,1.0,1.0,0.59
fold 1,1.0,0.93,1.0,0.73,1.0,1.0,1.0,1.0,1.0,1.0,0.99
fold 2,1.0,0.94,1.0,0.73,0.99,1.0,1.0,1.0,1.0,1.0,0.99
fold 3,0.99,0.95,1.0,0.75,1.0,1.0,1.0,1.0,1.0,1.0,0.99


ETT - Abnormal                0.9150
ETT - Borderline              0.9550
ETT - Normal                  1.0000
NGT - Abnormal                0.8025
NGT - Borderline              0.9975
NGT - Incompletely Imaged     0.9125
NGT - Normal                  1.0000
CVC - Abnormal                1.0000
CVC - Borderline              1.0000
CVC - Normal                  1.0000
Swan Ganz Catheter Present    0.8900
dtype: float64

In [32]:
# Per-target weight of the LightGBM prediction in the weighted sum (the dense-head prediction
# gets 1 - weight). The values are the per-target means over folds, as shown in the
# `oof_best_params.mean(axis=0)` output of the weighted-sum cell.
best_params_oof_ave={
    "ETT - Abnormal"                :0.9150,
    "ETT - Borderline"              :0.9550,
    "ETT - Normal"                  :1.0000,
    "NGT - Abnormal"                :0.8025,
    "NGT - Borderline"              :0.9975,
    "NGT - Incompletely Imaged"     :0.9125,
    "NGT - Normal"                  :1.0000,
    "CVC - Abnormal"                :1.0000,
    "CVC - Borderline"              :1.0000,
    "CVC - Normal"                  :1.0000,
    "Swan Ganz Catheter Present"    :0.8900
}

# Write through a context manager so the file is flushed and closed deterministically.
with open(f"{CFG.models_dir}weightedsum_params_oof_ave.pickle","wb") as f:
    pickle.dump(best_params_oof_ave,f)

### パラメータの最適化

In [14]:
%%time

from optuna.integration import lightgbm as lgb

def optimize_params():
    """Tune LightGBM parameters for every target and pickle the tuned parameter dicts.

    For each target column, the LightGBM and dense-head predictions for that target
    (taken from the notebook-level ``train_lgb`` / ``train_dense`` / ``val_lgb`` /
    ``val_dense`` splits) form a two-feature dataset. ``lgb.train`` -- ``lgb`` being
    Optuna's LightGBM integration -- is run on it with the validation split as
    ``valid_sets``, and the ``params`` of the returned booster are written to
    ``{CFG.models_dir}ensemble_lgb/lgb_params_{i}.pickle`` (``i`` = 0-based target index).

    Returns:
        None. The only result is the set of pickle files on disk.
    """
    for i,col_name in enumerate(CFG.target_cols):
        train_input=pd.concat([train_lgb[col_name],train_dense[col_name]],axis=1)
        val_input=pd.concat([val_lgb[col_name],val_dense[col_name]],axis=1)

        # Both concatenated columns carry the same name (the target); give them
        # distinct positional names 0 and 1.
        train_input.columns=range(train_input.shape[1])
        val_input.columns=range(val_input.shape[1])

        lgb_train=lgb.Dataset(train_input,label=train_target[col_name])
        lgb_test=lgb.Dataset(val_input,label=val_target[col_name],reference=lgb_train)

        params={
            "task":"train",
            "boosting_type":"gbdt",
            "objective":"binary",
            "metric":"auc",
            "learning_rate":1e-1,
            "num_iterations":500
            # "early_stopping_rounds":200, # best_iteration is not saved unless early_stopping_rounds is specified
        }

        opt=lgb.train(params,lgb_train,valid_sets=lgb_test, verbose_eval=False)

        # Context manager: the tuned parameters are flushed and the handle closed before
        # the next (long-running) tuning round starts.
        with open(f"{CFG.models_dir}ensemble_lgb/lgb_params_{i}.pickle","wb") as f:
            pickle.dump(opt.params,f)

optimize_params()

th positive gain, best gain: -inf
min_data_in_leaf, val_score: 0.850004: 100%|##########| 5/5 [00:03<00:00,  1.32it/s][32m[I 2021-02-27 19:28:10,471][0m Trial 67 finished with value: 0.8499879866015273 and parameters: {'min_child_samples': 25}. Best is trial 64 with value: 0.8500035922417002.[0m
min_data_in_leaf, val_score: 0.850004: 100%|##########| 5/5 [00:03<00:00,  1.31it/s]Stopped training because there are no more leaves that meet the split requirements
Wall time: 10min 54s



### パラメータからモデルの構築

In [15]:
%%time

import lightgbm


# Train one LightGBM stacking model per target from the previously tuned parameters
# and pickle it next to the parameter files.
for i,col_name in enumerate(CFG.target_cols):
    train_input=pd.concat([train_lgb[col_name],train_dense[col_name]],axis=1)
    val_input=pd.concat([val_lgb[col_name],val_dense[col_name]],axis=1)

    # Both concatenated columns carry the target's name; use positional names 0 and 1.
    train_input.columns=range(train_input.shape[1])
    val_input.columns=range(val_input.shape[1])

    # Use the `lightgbm` module imported by this cell for the datasets as well, instead of
    # the `lgb` alias that only exists if the tuning cell was run earlier in the session.
    lgb_train=lightgbm.Dataset(train_input,label=train_target[col_name])
    lgb_test=lightgbm.Dataset(val_input,label=val_target[col_name],reference=lgb_train)

    target_model_dir=f"{CFG.models_dir}ensemble_lgb"

    # NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.
    with open(f"{target_model_dir}/lgb_params_{i}.pickle","rb") as f:
        params=pickle.load(f)
    # Early stopping is switched on for the final fit so that best_iteration gets recorded.
    params["early_stopping_rounds"]=500

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=False)

    with open(f"{target_model_dir}/lgb_model_{i}.pickle","wb") as f:
        pickle.dump(model,f)

in: -inf
Wall time: 8.28 s


In [17]:
# Per-fold validation AUC of the pickled LightGBM stacking models, one column per target.
results=pd.DataFrame(columns=CFG.target_cols)

# Load every target's model once up front instead of re-reading it for each fold.
# NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.
ensemble_models=[]
for i in range(len(CFG.target_cols)):
    with open(f"{CFG.models_dir}ensemble_lgb/lgb_model_{i}.pickle","rb") as f:
        ensemble_models.append(pickle.load(f))

# NOTE(review): each target has a single model, fit on one particular train/validation split;
# in the recorded output only fold 0 scores well below 1.0, so the other folds were presumably
# part of that model's training data -- confirm before reading them as validation scores.
for n in range(CFG.n_folds):
    val_lgb=lgb_pred_norm[folds["fold"]==n]
    val_dense=dense_pred_norm[folds["fold"]==n]
    val_target=train[folds["fold"]==n]

    for i,col_name in enumerate(CFG.target_cols):
        val_input=pd.concat([val_lgb[col_name],val_dense[col_name]],axis=1)
        # Same positional column names (0, 1) as the frames the models were trained on;
        # without this both columns would share the target's name.
        val_input.columns=range(val_input.shape[1])

        model=ensemble_models[i]
        score=roc_auc_score(val_target[col_name],model.predict(val_input))
        results.loc[f"fold - {n}",col_name]=score

display(results,results.mean(axis=1))

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold - 0,0.880355,0.816767,0.87887,0.655587,0.743923,0.834016,0.808072,0.615765,0.594139,0.553794,0.854552
fold - 1,1.0,1.0,0.995628,0.997398,0.992063,1.0,0.999962,0.999775,0.882005,0.999421,1.0
fold - 2,1.0,0.999973,0.995906,0.999814,0.98954,1.0,0.999843,0.999826,0.889521,0.999494,1.0
fold - 3,1.0,0.999973,0.997586,0.999788,0.990172,1.0,0.999975,0.999769,0.883052,0.999556,1.0


fold - 0    0.748713
fold - 1    0.987841
fold - 2    0.988538
fold - 3    0.988170
dtype: float64

### RandomForest
{'rf_max_depth': 4, 'rf_n_estimators': 300} 0.7601442329875496

### LightGBM
lr 0.001, itr 1000: 0.752988 (23min 8s)  
lr 0.1, itr 200: 0.756288 (6min 58s)  
lr 1e-2, itr 1e3: 0.754762 (24min 22s)  
lr 1e-1, itr 500: 0.748713 (10min 54s)  

### LogisticRegression
{'logi_c': 50} 0.6739926270642816

### Weighted Sum
weight step 0.01: 0.766011