In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import pickle
from tqdm import tqdm

from tensorflow.keras import models

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

## CFG

In [2]:
class CFG:
    """Notebook-wide settings: data/model locations, fold count and label columns."""

    # Directory that contains train.csv.
    dataset_dir = "../input/ranzcr-clip-catheter-line-classification/"
    # Directory under which tuned parameters and fitted models are stored.
    models_dir = "./models/"

    # Number of GroupKFold splits.
    n_folds = 4
    # Label columns of train.csv; one LightGBM model is trained per column.
    target_cols = [
        "ETT - Abnormal",
        "ETT - Borderline",
        "ETT - Normal",
        "NGT - Abnormal",
        "NGT - Borderline",
        "NGT - Incompletely Imaged",
        "NGT - Normal",
        "CVC - Abnormal",
        "CVC - Borderline",
        "CVC - Normal",
        "Swan Ganz Catheter Present",
    ]

In [3]:
# Table read from train.csv; it provides StudyInstanceUID, PatientID and the
# target columns used further down.
train=pd.read_csv(f"{CFG.dataset_dir}train.csv")

# Look up the stored array of every study, in the row order of `train`.
# The context manager closes the .npz archive once all arrays have been read
# (the bare np.load call kept the file handle open for the whole session).
with np.load("../input/effnet_tuned_output.npz") as npz:
    features_list=[npz[uid] for uid in tqdm(train["StudyInstanceUID"])]
features=np.array(features_list)

100%|██████████| 30083/30083 [00:51<00:00, 580.93it/s]


In [4]:
def get_fold(train):
    """Return a copy of ``train`` with an integer ``folds`` column.

    Rows are split with ``GroupKFold`` (``CFG.n_folds`` splits) grouped by
    ``PatientID``; each row is labelled with the number of the split in which
    it belongs to the validation part.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a ``PatientID`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with the extra integer column ``folds``.
    """
    splitter=GroupKFold(n_splits=CFG.n_folds)

    # GroupKFold yields *positional* indices, so collect them in a positional
    # array. Writing them through label-based `.loc` is only correct when the
    # index of `train` happens to be the default 0..n-1 range.
    fold_ids=np.full(len(train),-1,dtype=int)
    for n,(_,val_idx) in enumerate(splitter.split(train,groups=train["PatientID"])):
        fold_ids[val_idx]=n

    fold=train.copy()
    fold["folds"]=fold_ids
    return fold

# Fold-annotated copy of `train`; later cells select rows through fold["folds"].
fold=get_fold(train)

### AutoEncoderで次元削減する

In [5]:
def compress_with_autoencoder(features):
    """Scale ``features`` and map them through part of a stored autoencoder.

    The pickled scaler is applied first. The autoencoder is then rebuilt from
    ``model.json`` plus the ``ckpt`` weights, truncated at the layer named
    ``dense_1``, and used to predict on the scaled input.

    Parameters
    ----------
    features : array-like
        Input accepted by the stored scaler's ``transform``.

    Returns
    -------
    pandas.DataFrame
        Output of the ``dense_1`` layer, one row per input row.
    """
    # Read the scaler through a context manager so the file handle is closed,
    # and resolve the path from CFG.models_dir like the other model artefacts.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(f"{CFG.models_dir}minmaxscaler_effnet_tuned.pickle","rb") as f:
        scaler=pickle.load(f)
    X=scaler.transform(features)

    # Rebuild the architecture from JSON, then restore the trained weights.
    autoencoder_dir=f"{CFG.models_dir}autoencoder_tuned/"
    with open(f"{autoencoder_dir}model.json","rt") as f:
        model_json=f.read()
    autoencoder=models.model_from_json(model_json)
    autoencoder.load_weights(f"{autoencoder_dir}ckpt")

    # Sub-model that stops at the chosen layer.
    # NOTE(review): presumably `dense_1` is the bottleneck layer — confirm against model.json.
    layer_name="dense_1"
    compressing_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

    ae_pred=compressing_model.predict(X)
    ae_pred_df=pd.DataFrame(ae_pred)

    return ae_pred_df

# Compressed feature table; the cells below slice it into train/validation parts.
X=compress_with_autoencoder(features)



## パラメータを最適化する

In [6]:
# Hold out one fold for validation and train on all the others.
target_fold=0
val_idx=fold["folds"].eq(target_fold)
train_idx=fold["folds"].ne(target_fold)

# The same boolean masks filter the feature table and the label table.
X_train=X[train_idx]
X_val=X[val_idx]
y_train=train[train_idx]
y_val=train[val_idx]

In [10]:
%%time

from optuna.integration import lightgbm as lgb

def optimize_params():
    """Tune LightGBM parameters for every target column and pickle them.

    Reads the module-level ``X_train``/``X_val``/``y_train``/``y_val`` split.
    ``lgb`` is ``optuna.integration.lightgbm``. For the n-th entry of
    ``CFG.target_cols`` the ``params`` of the object returned by ``lgb.train``
    are written to ``{CFG.models_dir}lgbm_effnet_tuned/params_{n}.pickle``.
    """
    for n,col_name in enumerate(CFG.target_cols):
        y_train_col,y_val_col=y_train[col_name],y_val[col_name]

        lgb_train=lgb.Dataset(X_train,label=y_train_col)
        lgb_val=lgb.Dataset(X_val,label=y_val_col,reference=lgb_train)

        params={
            "task":"train",
            "boosting_type":"gbdt",
            "objective":"binary",
            "metric":"auc",
            "learning_rate":0.1,
            "num_iterations":100
            # "early_stopping_rounds":200, # best_iteration is not saved unless early_stopping_rounds is specified
        }

        opt=lgb.train(params,lgb_train,valid_sets=lgb_val, verbose_eval=100)

        # Write through a context manager so the file is flushed and closed
        # before the next column is processed.
        with open(f"{CFG.models_dir}lgbm_effnet_tuned/params_{n}.pickle","wb") as f:
            pickle.dump(opt.params,f)

# Run the tuning for all target columns; the %%time magic of this cell reports the duration.
optimize_params()

es: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026460 -> initscore=-3.605288
[LightGBM] [Info] Start training from score -3.605288
[100]	valid_0's auc: 0.997153
regularization_factors, val_score: 0.997154:  20%|##        | 4/20 [00:01<00:07,  2.19it/s][32m[I 2021-03-10 00:08:01,672][0m Trial 46 finished with value: 0.9971530012296067 and parameters: {'lambda_l1': 0.023274875560142644, 'lambda_l2': 1.4526482716131758e-06}. Best is trial 43 with value: 0.9971541790137707.[0m
regularization_factors, val_score: 0.997154:  20%|##        | 4/20 [00:01<00:07,  2.19it/s][LightGBM] [Info] Number of positive: 597, number of negative: 21965
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18931
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026460 -> initscore=-3.605288
[LightGBM] [Info] Start training from score -3.605288
[100]	valid_0's auc: 0.99698


### 最適化に要した時間
`learning_rate`, `num_iterations` at defaults: 19min 49s  
`learning_rate` ×0.1, `num_iterations` ×10: 2h 16min 1s  
`learning_rate`, `num_iterations` at defaults: 14min 26s

## 得られたパラメータで予測する

In [12]:
import lightgbm

# NOTE(review): not referenced by any visible cell — confirm it is still needed.
num_features=100

def get_pred(train,val,col_idx:int):
    """Train one LightGBM model for a target column and score it on ``val``.

    Parameters
    ----------
    train : tuple
        ``(X_train, y_train)``; ``y_train`` is indexed by target column name.
    val : tuple
        ``(X_val, y_val)`` in the same layout as ``train``.
    col_idx : int
        Position of the target in ``CFG.target_cols``; also selects the
        ``params_{col_idx}.pickle`` file holding the tuned parameters.

    Returns
    -------
    tuple
        ``(pred, auc)``: the model's predictions for ``X_val`` and their ROC AUC
        against the validation labels.
    """
    X_train,y_train=train
    X_val,y_val=val
    col_name=CFG.target_cols[col_idx]
    y_train_col,y_val_col=y_train[col_name],y_val[col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train_col)
    lgb_test=lightgbm.Dataset(X_val,label=y_val_col,reference=lgb_train)

    # Load the tuned parameters through a context manager so the file handle
    # is closed instead of being left to the garbage collector.
    with open(f"{CFG.models_dir}lgbm_effnet_tuned/params_{col_idx}.pickle","rb") as f:
        params=pickle.load(f)
    # NOTE(review): early stopping monitors the same set that is scored below,
    # so the reported AUC may be optimistic — confirm this is intended.
    params["early_stopping_rounds"]=1000

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=False)
    pred=model.predict(X_val)
    auc=roc_auc_score(y_val_col,pred)

    return pred,auc


# Cross-validated AUC: one row per fold, one column per target.
fold_labels=[]
auc_rows=[]

for n in range(CFG.n_folds):
    print(f"\nfold - {n}")
    train_idx=(fold["folds"]!=n)
    val_idx=(fold["folds"]==n)
    X_train,X_val=X[train_idx],X[val_idx]
    y_train,y_val=train[train_idx],train[val_idx]

    fold_aucs={}
    for col_idx,col_name in enumerate(CFG.target_cols):
        _,auc=get_pred(train=(X_train,y_train),val=(X_val,y_val),col_idx=col_idx)
        fold_aucs[col_name]=auc

    fold_labels.append(f"fold - {n}")
    auc_rows.append(fold_aucs)

# Build the table once from the collected rows, with an explicit float dtype,
# instead of enlarging an empty frame one cell at a time.
results=pd.DataFrame(auc_rows,index=fold_labels,columns=CFG.target_cols,dtype=float)

# Mean AUC per fold, followed by the mean over folds.
fold_means=results.mean(axis=1)

display(results)
display(fold_means,fold_means.mean())

lits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 903, number of negative: 21660
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18933
[LightGBM] [Info] Number of data points in the train set: 22563, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040021 -> initscore=-3.177500
[LightGBM] [Info] Start training from score -3.177500
[LightGBM] [Info] Number of positive: 5434, number of negative: 17129
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18933
[LightGBM] [Info] Number of data points in the train set: 22563, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240837 -> initscore=-1.148097
[LightGBM] [Info] Start training from score -1.148097
[LightGBM] [Info] Number of positive: 225, number of negative: 22338
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18933
[LightGBM] [Info] 

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold - 0,0.954811,0.956827,0.991526,0.91077,0.89723,0.971671,0.966175,0.808987,0.791711,0.833054,0.997171
fold - 1,0.92339,0.955895,0.989013,0.887442,0.884119,0.964834,0.96687,0.830941,0.774067,0.839155,0.998866
fold - 2,0.967235,0.959108,0.991139,0.90401,0.888132,0.974249,0.967909,0.817232,0.792888,0.829088,0.9993
fold - 3,0.953951,0.956494,0.992525,0.872001,0.878282,0.970925,0.964722,0.811299,0.775994,0.83158,0.998721


fold - 0    0.916358
fold - 1    0.910418
fold - 2    0.917299
fold - 3    0.909681
dtype: float64

0.913438849473597

## モデルを保存する

In [9]:
import lightgbm

# Fit one model per target column on every fold except `target_fold`, which is
# passed as the validation set, and pickle each fitted model.
target_fold=0
train_idx=(fold["folds"]!=target_fold)
val_idx=(fold["folds"]==target_fold)

X_train,X_val=X[train_idx],X[val_idx]
y_train,y_val=train[train_idx],train[val_idx]

for i,col_name in enumerate(CFG.target_cols):
    y_train_col=y_train[col_name]
    y_val_col=y_val[col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train_col)
    lgb_val=lightgbm.Dataset(X_val,label=y_val_col,reference=lgb_train)

    # Context managers close both pickle files deterministically; the write in
    # particular is flushed before the next model is trained.
    with open(f"{CFG.models_dir}lgbm_effnet_tuned/params_{i}.pickle","rb") as f:
        params=pickle.load(f)
    params["early_stopping_rounds"]=500

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_val,verbose_eval=False)
    with open(f"{CFG.models_dir}lgbm_effnet_tuned/model_{i}.pickle","wb") as f:
        pickle.dump(model,f)

[LightGBM] [Info] Number of positive: 61, number of negative: 22501
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20151
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002704 -> initscore=-5.910441
[LightGBM] [Info] Start training from score -5.910441
[LightGBM] [Info] Number of positive: 875, number of negative: 21687
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20151
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038782 -> initscore=-3.210244
[LightGBM] [Info] Start training from score -3.210244
[LightGBM] [Info] Number of positive: 5351, number of negative: 17211
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20151
[LightGBM] [Info] Number of data points in the train set: 22