In [6]:
import numpy as np
import pandas as pd
from IPython.display import display
import pickle
from tqdm import tqdm

from tensorflow.keras import models

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

## CFG

In [4]:
class CFG:
    # Notebook-wide settings: input/output locations, CV fold count and label columns.

    # Number of splits passed to GroupKFold.
    n_folds = 4

    # Directory train.csv is read from.
    dataset_dir = "../input/ranzcr-clip-catheter-line-classification/"
    # Directory the autoencoder / LightGBM artifacts are read from and written to.
    models_dir = "./models/"

    # Label columns; the notebook fits one binary model per entry.
    target_cols = [
        'ETT - Abnormal',
        'ETT - Borderline',
        'ETT - Normal',
        'NGT - Abnormal',
        'NGT - Borderline',
        'NGT - Incompletely Imaged',
        'NGT - Normal',
        'CVC - Abnormal',
        'CVC - Borderline',
        'CVC - Normal',
        'Swan Ganz Catheter Present',
    ]

In [7]:
# Label / metadata table; also provides the StudyInstanceUID order used for the features.
train=pd.read_csv(f"{CFG.dataset_dir}train.csv")

# The archive holds one array per StudyInstanceUID (presumably EfficientNet outputs,
# judging by the file name -- TODO confirm). Opening it as a context manager makes
# sure the underlying file handle is released once every row has been read.
with np.load("../input/effnet_tuned_output.npz") as npz:
    features_list=[npz[uid] for uid in tqdm(train["StudyInstanceUID"])]
# Row i of features corresponds to row i of train.
features=np.array(features_list)

100%|██████████| 30083/30083 [01:09<00:00, 429.85it/s]


In [8]:
def get_fold(train):
    """Return a copy of ``train`` with an integer ``folds`` column.

    Rows are split with ``GroupKFold`` (``CFG.n_folds`` splits) grouped by
    ``PatientID``, and each row is labelled with the number of the split in
    which it belongs to the validation part.

    Args:
        train: DataFrame containing a ``PatientID`` column.

    Returns:
        A new DataFrame (``train`` itself is not modified) with the extra
        ``folds`` column.
    """
    fold=train.copy()
    splitter=GroupKFold(n_splits=CFG.n_folds)
    # split() yields *positional* indices. Collect them in a positional array
    # instead of writing through label-based .loc, so the assignment is also
    # correct when train does not carry a default RangeIndex, and the column
    # is integer from the start (no NaN-float detour).
    fold_ids=np.zeros(len(train),dtype=int)
    for n,(_,val_idx) in enumerate(splitter.split(train,groups=train["PatientID"])):
        fold_ids[val_idx]=n
    fold["folds"]=fold_ids
    return fold

fold=get_fold(train)

### AutoEncoderで次元削減する

In [9]:
def compress_with_autoencoder(features):
    """Scale ``features`` and return the autoencoder's ``dense_1`` activations.

    The stored scaler is applied first, then the autoencoder is rebuilt from
    its JSON architecture and checkpoint weights, and a sub-model that ends at
    layer ``dense_1`` is used for prediction (presumably the bottleneck layer
    -- TODO confirm against the autoencoder definition).

    Args:
        features: array accepted by the pickled scaler's ``transform``.

    Returns:
        DataFrame holding the ``dense_1`` output, one row per input row.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load scaler
    # files produced by this project.
    # The with-block closes the file handle, which the bare open() call leaked.
    with open(f"{CFG.models_dir}minmaxscaler_effnet_tuned.pickle","rb") as f:
        scaler=pickle.load(f)
    X=scaler.transform(features)

    autoencoder_dir=f"{CFG.models_dir}autoencoder_tuned/"
    with open(f"{autoencoder_dir}model.json","rt") as f:
        model_json=f.read()
    autoencoder=models.model_from_json(model_json)
    autoencoder.load_weights(f"{autoencoder_dir}ckpt")

    # Cut the network at the chosen layer so predict() returns its activations.
    layer_name="dense_1"
    compressing_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

    ae_pred=compressing_model.predict(X)
    ae_pred_df=pd.DataFrame(ae_pred)

    return ae_pred_df

X=compress_with_autoencoder(features)



## パラメータを最適化する

In [11]:
# Fold held out as validation data during tuning; every other fold is used for fitting.
target_fold=0
fold_ids=fold["folds"]
train_idx=fold_ids.ne(target_fold)
val_idx=fold_ids.eq(target_fold)

# Compressed features, split by the boolean masks above.
X_train=X[train_idx]
X_val=X[val_idx]
# y_* keep every column of train; a single target column is selected later by name.
y_train=train[train_idx]
y_val=train[val_idx]

In [10]:
%%time

from optuna.integration import lightgbm as lgb

def optimize_params():
    """Tune LightGBM parameters per target column and pickle the result.

    For each column in ``CFG.target_cols`` a binary model is trained on
    ``X_train`` and validated on ``X_val`` through ``lgb.train`` (``lgb`` is
    ``optuna.integration.lightgbm`` in this cell), and ``opt.params`` of the
    returned object is written to
    ``{CFG.models_dir}lgbm_effnet_tuned/params_{n}.pickle``, where ``n`` is
    the 0-based position of the column in ``CFG.target_cols``.

    Reads the module-level ``X_train``, ``X_val``, ``y_train`` and ``y_val``.
    """
    for n,col_name in enumerate(CFG.target_cols):
        y_train_col,y_val_col=y_train[col_name],y_val[col_name]

        # Use the single target column as label; passing the whole label
        # frame (y_train / y_val) is not a valid binary label.
        lgb_train=lgb.Dataset(X_train,label=y_train_col)
        lgb_val=lgb.Dataset(X_val,label=y_val_col,reference=lgb_train)

        params={
            "task":"train",
            "boosting_type":"gbdt",
            "objective":"binary",
            "metric":"auc",
            "learning_rate":0.01,
            "num_iterations":1000
            # "early_stopping_rounds":200, # best_iteration is not recorded unless early_stopping_rounds is set
        }

        # Validate on the dataset built above (the original referenced an undefined lgb_test).
        opt=lgb.train(params,lgb_train,valid_sets=lgb_val, verbose_eval=100)
        # models_dir only exists as a CFG attribute; the with-block closes the output file.
        with open(f"{CFG.models_dir}lgbm_effnet_tuned/params_{n}.pickle","wb") as f:
            pickle.dump(opt.params,f)

optimize_params()

 further splits with positive gain, best gain: -inf
[600]	valid_0's auc: 0.844244
[700]	valid_0's auc: 0.844451
[800]	valid_0's auc: 0.845469
[900]	valid_0's auc: 0.84572
[1000]	valid_0's auc: 0.845459
min_data_in_leaf, val_score: 0.849742: 100%|##########| 5/5 [01:48<00:00, 22.13s/it][32m[I 2021-02-25 05:45:36,320][0m Trial 64 finished with value: 0.845458817598922 and parameters: {'min_child_samples': 50}. Best is trial 62 with value: 0.8479038975233555.[0m
min_data_in_leaf, val_score: 0.849742: 100%|##########| 5/5 [01:48<00:00, 21.79s/it]Wall time: 2h 16min 1s



### 最適化に要した時間
`learning_rate` and `num_iterations` at their defaults: 19min 49s  
`learning_rate` \*0.1, `num_iterations` \*10: 2h 16min 1s

## 得られたパラメータで予測する

In [16]:
import lightgbm


def get_pred(train,test,col_name:str):
    """Fit one LightGBM model for ``col_name`` and score it on ``test``.

    Args:
        train: DataFrame used for fitting. Its last ``CFG.num_features``
            columns are taken as features and ``train[col_name]`` as label.
        test: DataFrame with the same layout, used both as the validation set
            during training and for the reported score.
        col_name: label column to model.

    Returns:
        ``(pred, auc)``: the model's predictions for ``test`` and their
        ROC AUC against ``test[col_name]``.
    """
    # NOTE(review): CFG.num_features is not defined in the CFG class visible in
    # this notebook -- confirm it is set before this cell runs.
    X_train=train.iloc[:,-CFG.num_features:]
    X_test=test.iloc[:,-CFG.num_features:]
    y_train=train[col_name]
    y_test=test[col_name]

    # The params file is chosen by the column position of the target in test.
    col_index=test.columns.get_loc(col_name)

    lgb_train=lightgbm.Dataset(X_train,label=y_train)
    lgb_test=lightgbm.Dataset(X_test,label=y_test,reference=lgb_train)

    # with-block so the params file handle is closed (the bare open() leaked it).
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(f"{CFG.models_dir}autoencoder_smallLR/lgb_params_{col_index}.pickle","rb") as f:
        params=pickle.load(f)
    params["early_stopping_rounds"]=500

    # NOTE(review): the test fold also drives early stopping, so the AUC below
    # is measured on data the stopping point was selected on.
    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=False)
    pred=model.predict(X_test)
    auc=roc_auc_score(y_test,pred)

    return pred,auc


# AUC table: one row per fold ("fold - n"), one column per target.
results=pd.DataFrame(columns=CFG.target_cols)

for n in range(CFG.n_folds):
    print(f"\nfold - {n}")
    # Rows of fold n are held out; all remaining rows are used for fitting.
    fold_column=dataset["fold"]
    train_n=dataset[fold_column!=n]
    test_n=dataset[fold_column==n]

    row_label=f"fold - {n}"
    for col_name in CFG.target_cols:
        pred,auc=get_pred(train=train_n,test=test_n,col_name=col_name)
        results.loc[row_label,col_name]=auc

display(results)
# Mean AUC per fold, followed by the mean of those per-fold means.
fold_means=results.mean(axis=1)
display(fold_means,fold_means.mean())

 testing was 0.002528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20152
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.156679 -> initscore=-1.683146
[LightGBM] [Info] Start training from score -1.683146
[LightGBM] [Info] Number of positive: 2434, number of negative: 20128
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20152
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.107881 -> initscore=-2.112576
[LightGBM] [Info] Start training from score -2.112576
[LightGBM] [Info] Number of positive: 6405, number of negative: 16157
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20152
[LightGBM] [Info] Number of data

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold - 0,0.838635,0.818918,0.881725,0.760288,0.756089,0.840574,0.870719,0.615146,0.594002,0.558873,0.849742
fold - 1,0.780939,0.845924,0.869797,0.729937,0.755171,0.83639,0.872877,0.612742,0.569618,0.56103,0.852369
fold - 2,0.694071,0.814474,0.881718,0.761195,0.7463,0.856017,0.854002,0.605667,0.584258,0.545961,0.853164
fold - 3,0.783859,0.834552,0.899503,0.68053,0.750972,0.857201,0.872215,0.603365,0.589795,0.555042,0.858685


fold - 0    0.762246
fold - 1    0.753345
fold - 2    0.745166
fold - 3    0.753247
dtype: float64

0.7535011187053615

## モデルを保存する

In [9]:
import lightgbm


# Train one LightGBM model per target column with the stored parameters and pickle it.
# NOTE(review): dataset_train, dataset_test and X_test are not defined in the visible
# part of this notebook -- confirm they exist before running this cell.
for i,col_name in enumerate(CFG.target_cols):
    y_train=dataset_train.loc[:,col_name]
    y_test=dataset_test.loc[:,col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train)
    lgb_test=lightgbm.Dataset(X_test,label=y_test,reference=lgb_train)

    # Parameter and model files are numbered from 1 (i+1).
    # with-blocks close both files; the bare open() calls leaked the handles and
    # left flushing of the written model to garbage collection.
    with open(f"{CFG.models_dir}autoencoder_smallLR/lgb_params_{i+1}.pickle","rb") as f:
        params=pickle.load(f)
    params["early_stopping_rounds"]=500

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=False)
    with open(f"{CFG.models_dir}autoencoder_smallLR/lgb_model_{i+1}.pickle","wb") as f:
        pickle.dump(model,f)

[LightGBM] [Info] Number of positive: 61, number of negative: 22501
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20151
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002704 -> initscore=-5.910441
[LightGBM] [Info] Start training from score -5.910441
[LightGBM] [Info] Number of positive: 875, number of negative: 21687
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20151
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038782 -> initscore=-3.210244
[LightGBM] [Info] Start training from score -3.210244
[LightGBM] [Info] Number of positive: 5351, number of negative: 17211
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20151
[LightGBM] [Info] Number of data points in the train set: 22