In [38]:
import numpy as np
import pandas as pd
from IPython.display import display
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt

from tensorflow.keras import models

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

## CFG

In [2]:
class CFG:
    dataset_dir="../input/ranzcr-clip-catheter-line-classification/"
    models_dir="./models/"

    n_folds=4
    target_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal', 'NGT - Abnormal', 'NGT - Borderline', 'NGT - Incompletely Imaged',
               'NGT - Normal', 'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal', 'Swan Ganz Catheter Present']

In [3]:
train=pd.read_csv(f"{CFG.dataset_dir}train.csv")

npz=np.load("../input/effnet_tuned_output.npz")
features_list=[npz[uid] for uid in tqdm(train["StudyInstanceUID"])]
features=np.array(features_list)

100%|██████████| 30083/30083 [00:51<00:00, 580.93it/s]


In [4]:
def get_fold(train):
    fold=train.copy()
    splitter=GroupKFold(n_splits=CFG.n_folds)
    for n,(train_idx,val_idx) in enumerate(splitter.split(train,groups=train["PatientID"])):
        fold.loc[val_idx,"folds"]=n
    fold["folds"]=fold["folds"].astype(int)
    return fold

fold=get_fold(train)

### AutoEncoderで次元削減する

In [5]:
def compress_with_autoencoder(features):
    scaler=pickle.load(open("./models/minmaxscaler_effnet_tuned.pickle","rb"))
    X=scaler.transform(features)

    autoencoder_dir=f"{CFG.models_dir}autoencoder_tuned/"
    with open(f"{autoencoder_dir}model.json","rt") as f:
        model_json=f.read()
    autoencoder=models.model_from_json(model_json)
    autoencoder.load_weights(f"{autoencoder_dir}ckpt")

    layer_name="dense_1"
    compressing_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

    ae_pred=compressing_model.predict(X)
    ae_pred_df=pd.DataFrame(ae_pred)

    return ae_pred_df

X=compress_with_autoencoder(features)



In [67]:
valuless_columns=[4,  7,  9, 12, 13, 20, 25, 27, 30, 31, 36, 44, 47, 48, 51, 54, 64, 65, 71, 73, 74, 75, 89, 92, 97]
X_dropped=X.drop(columns=valuless_columns)
display(X_dropped)

Unnamed: 0,0,1,2,3,5,6,8,10,11,14,...,87,88,90,91,93,94,95,96,98,99
0,1.843573,0.762214,0.869982,1.386810,1.369750,0.943624,0.868235,0.0,2.362543,1.963967,...,0.908345,1.251791,1.660741,0.820461,1.360656,2.405944,1.016294,0.000000,1.393855,0.974065
1,1.316493,1.157231,3.745368,2.056508,1.526756,1.638721,2.746008,0.0,0.728871,0.248490,...,1.174902,2.501284,1.452303,1.189284,2.883156,1.588075,0.725331,0.163878,1.702431,2.194259
2,0.846063,1.605237,0.805049,0.902661,1.622584,1.147530,1.275458,0.0,2.153320,0.926055,...,1.184017,0.880394,1.806954,1.442235,0.801363,1.626994,0.568042,0.000000,1.896320,0.976791
3,1.093408,1.223410,1.874324,2.219434,0.879066,2.298780,1.477697,0.0,2.164646,2.043216,...,1.126441,1.279361,2.040087,1.496110,1.981401,1.202085,0.585374,0.113964,1.038824,0.465566
4,1.628255,1.978409,1.783667,0.579625,1.048298,1.604363,1.627460,0.0,1.761667,0.639846,...,1.444271,1.143819,1.492713,1.309319,0.426623,2.758231,0.724342,0.000000,1.823411,1.396208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.228411,1.137532,1.340785,1.927307,2.104715,0.949438,2.384785,0.0,1.818742,1.342715,...,1.269412,2.555556,2.009609,1.011214,1.135506,1.322723,0.672355,0.000000,1.523899,0.802592
30079,1.395945,1.438516,2.053834,1.840734,1.080493,2.201542,1.104149,0.0,2.985705,1.766386,...,1.134073,1.053623,1.217309,1.292022,1.524583,0.577702,0.827970,0.000000,1.983905,1.054130
30080,0.937448,1.242203,2.101459,1.849006,1.106512,1.358821,2.112649,0.0,1.165469,0.969044,...,2.323654,3.102413,2.430577,1.500344,2.230500,2.033732,0.599059,0.000000,1.299392,1.479758
30081,1.151241,1.584572,0.763552,0.840122,1.344241,0.730238,1.048365,0.0,1.512467,1.206768,...,1.418048,1.720290,1.908568,1.478042,0.804574,1.925837,0.350558,0.270158,2.035101,1.448286


## パラメータを最適化する

In [68]:
target_fold=0
train_idx=(fold["folds"]!=target_fold)
val_idx=(fold["folds"]==target_fold)

X_train,X_val=X_dropped[train_idx],X_dropped[val_idx]
y_train,y_val=train[train_idx],train[val_idx]

In [69]:
%%time

from optuna.integration import lightgbm as lgb

def optimize_params():
    for n,col_name in enumerate(CFG.target_cols):
        y_train_col,y_val_col=y_train[col_name],y_val[col_name]

        lgb_train=lgb.Dataset(X_train,label=y_train_col)
        lgb_val=lgb.Dataset(X_val,label=y_val_col,reference=lgb_train)
        
        params={
            "task":"train",
            "boosting_type":"gbdt",
            "objective":"binary",
            "metric":"auc",
            "learning_rate":0.1,
            "num_iterations":100
            # "early_stopping_rounds":200, #early_stopping_roundsを指定しないとbest_iterationは保存されない
        }

        opt=lgb.train(params,lgb_train,valid_sets=lgb_val, verbose_eval=100)
        pickle.dump(opt.params,open(f"{CFG.models_dir}lgbm_effnet_tuned_dropped/params_{n}.pickle","wb"))

optimize_params()

ith positive gain, best gain: -inf
[100]	valid_0's auc: 0.995529
regularization_factors, val_score: 0.997737:  95%|#########5| 19/20 [00:15<00:00,  1.29it/s][32m[I 2021-03-10 01:14:19,303][0m Trial 58 finished with value: 0.9955291313135121 and parameters: {'lambda_l1': 9.785642380466738, 'lambda_l2': 1.0087645488154309e-08}. Best is trial 42 with value: 0.9977368877289023.[0m
regularization_factors, val_score: 0.997737:  95%|#########5| 19/20 [00:15<00:00,  1.29it/s][LightGBM] [Info] Number of positive: 597, number of negative: 21965
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18919
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026460 -> initscore=-3.605288
[LightGBM] [Info] Start training from score -3.605288
regularization_factors, val_score: 0.997737: 100%|##########| 20/20 [00:16<00:00,  1.30it/s][32m[I 2021-03-10 01:14:20,063][0m Trial 5

### 最適化に要した時間
lr num_it, default: 19min 49s  
lr \*0.1, num_it \*10: 2h 16min 1s  
lr num_it,default: 14min 26s  
lr num_it,default: 13mins 1s

## 得られたパラメータで予測する

In [70]:
import lightgbm

num_features=100

def get_pred(train,val,col_idx:int):
    X_train,y_train=train
    X_val,y_val=val
    col_name=CFG.target_cols[col_idx]
    y_train_col,y_val_col=y_train[col_name],y_val[col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train_col)
    lgb_test=lightgbm.Dataset(X_val,label=y_val_col,reference=lgb_train)

    params=pickle.load(open(f"{CFG.models_dir}lgbm_effnet_tuned_dropped/params_{col_idx}.pickle","rb"))
    params["early_stopping_rounds"]=1000

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=False)
    pred=model.predict(X_val)
    auc=roc_auc_score(y_val_col,pred)

    return pred,auc


results=pd.DataFrame(columns=CFG.target_cols)

for n in range(CFG.n_folds):
    print(f"\nfold - {n}")
    train_idx=(fold["folds"]!=n)
    val_idx=(fold["folds"]==n)
    X_train,X_val=X[train_idx],X[val_idx]
    y_train,y_val=train[train_idx],train[val_idx] 

    for col_idx,col_name in enumerate(CFG.target_cols):
        _,auc=get_pred(train=(X_train,y_train),val=(X_val,y_val),col_idx=col_idx)
        results.loc[f"fold - {n}",col_name]=auc


display(results,results.mean(axis=1).mean())

nt
[LightGBM] [Info] Total Bins 18933
[LightGBM] [Info] Number of data points in the train set: 22563, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002881 -> initscore=-5.846794
[LightGBM] [Info] Start training from score -5.846794
[LightGBM] [Info] Number of positive: 903, number of negative: 21660
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18933
[LightGBM] [Info] Number of data points in the train set: 22563, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.040021 -> initscore=-3.177500
[LightGBM] [Info] Start training from score -3.177500
[LightGBM] [Info] Number of positive: 5434, number of negative: 17129
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18933
[LightGBM] [Info] Number of data points in the train set: 22563, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240837 -> initscore=-1.148097
[LightGBM]

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold - 0,0.955981,0.952795,0.991333,0.911096,0.888468,0.971002,0.964617,0.806005,0.790664,0.832617,0.996692
fold - 1,0.929976,0.955209,0.988983,0.885601,0.888437,0.964323,0.966927,0.830175,0.772471,0.839705,0.998796
fold - 2,0.969886,0.961212,0.991324,0.905789,0.889011,0.973936,0.967433,0.813609,0.792473,0.829784,0.999524
fold - 3,0.966684,0.95461,0.992587,0.877,0.879676,0.970966,0.963667,0.810756,0.775176,0.831134,0.998383


0.9135566549048545

## モデルを保存する

In [71]:
import lightgbm

target_fold=0
train_idx=(fold["folds"]!=target_fold)
val_idx=(fold["folds"]==target_fold)

X_train,X_val=X_dropped[train_idx],X_dropped[val_idx]
y_train,y_val=train[train_idx],train[val_idx]

for i,col_name in enumerate(CFG.target_cols):
    y_train_col=y_train[col_name]
    y_val_col=y_val[col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train_col)
    lgb_val=lightgbm.Dataset(X_val,label=y_val_col,reference=lgb_train)
    
    params=pickle.load(open(f"{CFG.models_dir}lgbm_effnet_tuned_dropped/params_{i}.pickle","rb"))    
    params["early_stopping_rounds"]=500

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_val,verbose_eval=False)
    pickle.dump(model,open(f"{CFG.models_dir}lgbm_effnet_tuned_dropped/model_{i}.pickle","wb"))

[LightGBM] [Info] Number of positive: 61, number of negative: 22501
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18931
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002704 -> initscore=-5.910441
[LightGBM] [Info] Start training from score -5.910441
[LightGBM] [Info] Number of positive: 875, number of negative: 21687
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18931
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038782 -> initscore=-3.210244
[LightGBM] [Info] Start training from score -3.210244
[LightGBM] [Info] Number of positive: 5351, number of negative: 17211
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18931
[LightGBM] [Info] Number of data points in the train set: 22

In [76]:
importances=pd.DataFrame(columns=CFG.target_cols)

for i,col_name in enumerate(CFG.target_cols):
    model=pickle.load(open(f"./models/lgbm_effnet_tuned_dropped/model_{i}.pickle","rb"))
    importances[col_name]=pd.Series(model.feature_importance())

valueless_rows=importances.where(importances.sum(axis=1)==0).dropna(how="all")
display(valueless_rows.index)

Int64Index([ 4,  7,  9, 10, 12, 13, 20, 25, 27, 30, 31, 36, 44, 47, 48, 51, 54,
            64, 65, 71, 73, 74, 75, 89, 92, 97],
           dtype='int64')