In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt

from tensorflow.keras import models

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

## CFG

In [2]:
class CFG:
    # Directory holding the competition data (train.csv is read from here)
    # and directory holding the saved scaler / autoencoder / LightGBM artifacts.
    dataset_dir = "../input/ranzcr-clip-catheter-line-classification/"
    models_dir = "./models/"

    # Number of splits handed to GroupKFold.
    n_folds = 10

    # Label columns; one binary LightGBM model is trained per entry.
    target_cols = [
        "ETT - Abnormal",
        "ETT - Borderline",
        "ETT - Normal",
        "NGT - Abnormal",
        "NGT - Borderline",
        "NGT - Incompletely Imaged",
        "NGT - Normal",
        "CVC - Abnormal",
        "CVC - Borderline",
        "CVC - Normal",
        "Swan Ganz Catheter Present",
    ]

In [3]:
# Training table of the competition; the cells below read its StudyInstanceUID,
# PatientID and label columns.
train=pd.read_csv(f"{CFG.dataset_dir}train.csv")

# Pre-computed feature arrays stored in an .npz archive keyed by StudyInstanceUID
# (presumably EfficientNet outputs, judging by the file name — confirm).
# The archive is opened in a `with` block so its file handle is released as soon
# as every array has been read into memory; `npz` is closed after this cell.
with np.load("../input/effnet_tuned_output.npz") as npz:
    # Read in the row order of `train` so features[i] belongs to train.iloc[i].
    features_list=[npz[uid] for uid in tqdm(train["StudyInstanceUID"])]
features=np.array(features_list)

100%|██████████| 30083/30083 [00:56<00:00, 530.07it/s]


In [4]:
def get_fold(train):
    """Return a copy of ``train`` with an integer ``folds`` column.

    Rows are split with ``GroupKFold`` (``CFG.n_folds`` splits) grouped by
    ``PatientID``, so all rows of one patient land in the same fold. The
    ``folds`` value of a row is the number of the split in which that row
    is part of the validation set.

    Parameters
    ----------
    train : pandas.DataFrame
        Table with a ``PatientID`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with the extra ``folds`` column.
    """
    fold=train.copy()
    splitter=GroupKFold(n_splits=CFG.n_folds)

    # `split` yields *positional* indices. Collect them in a positional array
    # instead of using them as `.loc` labels, so the assignment is also correct
    # when `train` does not carry a default RangeIndex.
    fold_ids=np.full(len(train),-1,dtype=int)
    for n,(_,val_pos) in enumerate(splitter.split(train,groups=train["PatientID"])):
        fold_ids[val_pos]=n

    fold["folds"]=fold_ids
    return fold

fold=get_fold(train)

### AutoEncoderで次元削減する

In [5]:
def compress_with_autoencoder(features):
    """Scale ``features`` and project them through part of a saved autoencoder.

    The scaler is unpickled from ``CFG.models_dir``; the autoencoder
    architecture (``model.json``) and weights (``ckpt``) are read from
    ``CFG.models_dir + "autoencoder_best/"``. The returned values are the
    output of the layer named ``dense_1``
    (presumably the bottleneck layer — confirm against the autoencoder definition).

    Parameters
    ----------
    features : array-like
        Input accepted by the saved scaler's ``transform``.

    Returns
    -------
    pandas.DataFrame
        One row per input sample, one column per unit of ``dense_1``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load artifacts
    # produced by this project.
    # `with` closes the file handle, which the bare open() call never did.
    with open(f"{CFG.models_dir}minmaxscaler_effnet_best.pickle","rb") as f:
        scaler=pickle.load(f)
    X=scaler.transform(features)

    autoencoder_dir=f"{CFG.models_dir}autoencoder_best/"
    with open(f"{autoencoder_dir}model.json","rt") as f:
        model_json=f.read()
    autoencoder=models.model_from_json(model_json)
    autoencoder.load_weights(f"{autoencoder_dir}ckpt")

    # Sub-model that stops at the chosen intermediate layer.
    layer_name="dense_1"
    compressing_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

    ae_pred=compressing_model.predict(X)
    return pd.DataFrame(ae_pred)

X=compress_with_autoencoder(features)



In [6]:
# Column labels of the compressed features that are removed before training.
# NOTE(review): presumably the features whose LightGBM importance was zero for
# every target in an earlier run — confirm against the importance cell.
valuless_columns = [
     3,  4,  6,  8,  9, 13, 14, 17, 18, 23,
    27, 35, 36, 37, 38, 44, 45, 47, 50, 51,
    52, 57, 58, 61, 62, 67, 68, 72, 73, 74,
    76, 85, 86, 87, 90, 91, 92, 98, 99,
]
X_dropped = X.drop(valuless_columns, axis=1)
display(X_dropped)

Unnamed: 0,0,1,2,5,7,10,11,12,15,16,...,82,83,84,88,89,93,94,95,96,97
0,15.975632,2.598794,28.477423,10.108122,17.627970,31.613720,13.680086,29.980919,43.547512,35.329071,...,12.405697,13.718209,0.000000,18.162479,2.853658,11.579735,33.015427,0.000000,1.013134,0.000000
1,18.253782,1.896787,29.749559,10.739809,13.350595,31.290251,17.574186,31.453568,43.391998,34.259590,...,5.430474,13.750910,4.117984,19.033838,1.866350,20.918510,33.531281,1.492328,0.169154,0.000000
2,21.113445,6.490278,37.285770,14.575913,21.804926,35.283077,19.106745,31.540510,46.289322,41.946312,...,15.540805,21.033930,3.303513,14.330857,6.227699,19.798576,36.662624,0.000000,3.031176,0.000000
3,20.578976,4.586390,35.011791,13.960997,21.497570,32.601837,17.303205,29.510359,43.780560,40.917683,...,13.879373,19.960590,5.534728,11.691888,6.688281,18.260839,35.191917,0.000000,4.315998,0.000000
4,21.259962,1.174299,32.234509,14.236063,16.974590,39.146740,21.910166,35.963516,46.976074,42.572781,...,8.802354,17.031981,0.000000,19.087912,3.070636,23.272400,36.912308,0.000000,3.029698,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,16.151951,3.921811,26.502596,8.552945,14.540611,26.849852,13.892633,25.795586,40.366196,31.494781,...,10.371680,13.132877,3.722044,15.343826,3.803681,14.631236,31.437992,0.000000,1.348760,0.000000
30079,23.679729,4.203311,36.897343,17.462229,23.211433,39.100208,22.401089,35.309669,48.646877,47.263458,...,12.679943,20.948107,1.561532,15.763021,6.220365,23.946432,36.339615,0.000000,4.819793,0.000000
30080,16.755604,4.080050,27.205835,8.939780,14.894520,25.983810,14.067039,24.994057,38.945438,30.419767,...,10.008484,13.232534,4.731323,14.609595,3.877387,14.661573,30.467600,0.774718,1.180949,0.000000
30081,22.814905,7.761188,39.397442,15.905684,23.822273,37.408810,21.016159,32.534973,49.597984,44.645538,...,16.606625,23.253817,5.279211,14.986032,7.915530,23.222389,38.238205,0.220387,4.085468,0.000000


## パラメータを最適化する

In [8]:
# Hold out a single fold as validation data for the parameter search;
# every other fold is used for training.
target_fold = 0
val_idx = (fold["folds"] == target_fold)
train_idx = ~val_idx

X_train = X_dropped.loc[train_idx]
X_val = X_dropped.loc[val_idx]

# y_train / y_val keep the full rows of `train`; the label column is picked
# by name where the models are fitted.
y_train = train.loc[train_idx]
y_val = train.loc[val_idx]

In [9]:
%%time

from optuna.integration import lightgbm as lgb

def optimize_params():
    """Tune LightGBM parameters for every target column and pickle them.

    Uses the module-level ``X_train`` / ``X_val`` / ``y_train`` / ``y_val``
    split. ``lgb`` is Optuna's LightGBM integration, so ``lgb.train`` runs the
    parameter search; the parameters of the returned booster are written to
    ``{CFG.models_dir}lgbm_effnet_best/params_{n}.pickle`` where ``n`` is the
    position of the column in ``CFG.target_cols``.
    """
    for n,col_name in enumerate(CFG.target_cols):
        y_train_col,y_val_col=y_train[col_name],y_val[col_name]

        lgb_train=lgb.Dataset(X_train,label=y_train_col)
        lgb_val=lgb.Dataset(X_val,label=y_val_col,reference=lgb_train)

        params={
            "task":"train",
            "boosting_type":"gbdt",
            "objective":"binary",
            "metric":"auc",
            "learning_rate":0.1, #0.01
            "num_iterations":100 #1000
            # "early_stopping_rounds":200, # best_iteration is not saved unless early_stopping_rounds is specified
        }

        opt=lgb.train(params,lgb_train,valid_sets=lgb_val, verbose_eval=100)

        # `with` guarantees the file is flushed and closed; the bare open()
        # call left the handle dangling.
        with open(f"{CFG.models_dir}lgbm_effnet_best/params_{n}.pickle","wb") as f:
            pickle.dump(opt.params,f)

optimize_params()

ue: 0.9992211377304545 and parameters: {'lambda_l1': 0.010968265726788731, 'lambda_l2': 0.00018513724320121008}. Best is trial 43 with value: 0.9993640934634723.[0m
regularization_factors, val_score: 0.999527:  20%|##        | 4/20 [00:02<00:10,  1.54it/s][100]	valid_0's auc: 0.999221
[LightGBM] [Info] Number of positive: 761, number of negative: 26313
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15563
[LightGBM] [Info] Number of data points in the train set: 27074, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028108 -> initscore=-3.543185
[LightGBM] [Info] Start training from score -3.543185
regularization_factors, val_score: 0.999527:  25%|##5       | 5/20 [00:03<00:09,  1.55it/s][32m[I 2021-03-16 18:55:49,157][0m Trial 47 finished with value: 0.9994035295277531 and parameters: {'lambda_l1': 2.6638198961999587e-07, 'lambda_l2': 2.5257328883401513}. Best is trial 47 with value: 0.9994035295277531.[0m
regula

### 最適化に要した時間
lr num_it, default: 19min 49s  
lr \*0.1, num_it \*10: 2h 16min 1s  
lr num_it, default: 14min 26s  
lr num_it, default: 13min 1s  
lr num_it, default: 14min 38s  
lr \*0.1, num_it \*10: 1h 42min 3s  
best, dropped: 1h 38min 35s  
lr num_it, default: 13min 26s

## 得られたパラメータで予測する

In [10]:
import lightgbm

# NOTE(review): not referenced by any code visible in this notebook.
num_features=100

def get_pred(train,val,col_idx:int):
    """Fit one LightGBM model for a single target column and score it.

    Parameters
    ----------
    train : tuple
        ``(X_train, y_train)``; ``y_train`` must contain the target column.
    val : tuple
        ``(X_val, y_val)`` with the same layout as ``train``.
    col_idx : int
        Position of the target in ``CFG.target_cols``. It also selects the
        pickled parameter file ``params_{col_idx}.pickle``.

    Returns
    -------
    tuple
        ``(pred, auc)``: the model's predictions for ``X_val`` and their
        ROC AUC against the validation labels.

    Notes
    -----
    The validation data is used both as the early-stopping set and for the
    reported AUC.
    """
    X_train,y_train=train
    X_val,y_val=val
    col_name=CFG.target_cols[col_idx]
    y_train_col,y_val_col=y_train[col_name],y_val[col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train_col)
    lgb_test=lightgbm.Dataset(X_val,label=y_val_col,reference=lgb_train)

    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    # `with` closes the handle that the bare open() call used to leak.
    with open(f"{CFG.models_dir}lgbm_effnet_best/params_{col_idx}.pickle","rb") as f:
        params=pickle.load(f)
    params["early_stopping_rounds"]=1000

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=False)
    pred=model.predict(X_val)
    auc=roc_auc_score(y_val_col,pred)

    return pred,auc


# One row per evaluated fold, one column per target; cells hold the validation AUC.
results = pd.DataFrame(columns=CFG.target_cols)

# Only folds 0-3 are evaluated here, not all CFG.n_folds folds.
for n in range(4):
    print(f"\nfold - {n}")
    val_idx = (fold["folds"] == n)
    train_idx = ~val_idx
    X_train, y_train = X_dropped.loc[train_idx], train.loc[train_idx]
    X_val, y_val = X_dropped.loc[val_idx], train.loc[val_idx]

    for col_idx, col_name in enumerate(CFG.target_cols):
        # get_pred returns (pred, auc); only the score is kept.
        auc = get_pred(train=(X_train, y_train), val=(X_val, y_val), col_idx=col_idx)[1]
        results.loc[f"fold - {n}", col_name] = auc


# Full table plus the mean of the per-fold mean AUCs.
display(results, results.mean(axis=1).mean())

sitive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 494, number of negative: 26581
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15555
[LightGBM] [Info] Number of data points in the train set: 27075, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018246 -> initscore=-3.985416
[LightGBM] [Info] Start training from score -3.985416
[LightGBM] [Info] Number of positive: 2437, number of negative: 24638
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15555
[LightGBM] [Info] Number of data points in the train set: 27075, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090009 -> initscore=-2.313522
[LightGBM] [Info] Start training from score -2.313522
[LightGBM] [Info] Number of positive: 4350, number of negative: 22725
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15555
[LightGBM] [Info] Number of d

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold - 0,0.980692,0.969549,0.995741,0.901422,0.920396,0.976525,0.975013,0.846312,0.786867,0.857186,0.999443
fold - 1,0.878253,0.963632,0.992435,0.904422,0.895277,0.978161,0.976253,0.858665,0.823853,0.868736,0.999914
fold - 2,0.981322,0.961637,0.991482,0.866276,0.895654,0.976734,0.970884,0.858952,0.823529,0.864922,0.999623
fold - 3,0.967665,0.965229,0.995102,0.9186,0.912445,0.982528,0.980811,0.838656,0.822641,0.875814,0.997596


0.9272011609223824

In [9]:
# AUC table together with the mean AUC of each fold across all targets.
fold_mean_auc = results.mean(axis=1)
display(results, fold_mean_auc)

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
fold - 0,0.973369,0.973097,0.995879,0.907734,0.917653,0.978197,0.975421,0.847992,0.791439,0.85961,0.999596
fold - 1,0.958959,0.96286,0.992628,0.918469,0.897845,0.979059,0.976285,0.858765,0.81947,0.868902,0.999873
fold - 2,0.983535,0.962376,0.99177,0.879977,0.900918,0.977728,0.970469,0.8574,0.818245,0.868019,0.998987
fold - 3,0.938969,0.965897,0.994806,0.927158,0.918764,0.982168,0.980094,0.836751,0.818852,0.876296,0.995032
fold - 4,0.881985,0.969985,0.993558,0.904481,0.910657,0.978309,0.981756,0.856186,0.821953,0.886719,0.998934


fold - 0    0.929090
fold - 1    0.930283
fold - 2    0.928129
fold - 3    0.930435
fold - 4    0.925866
dtype: float64

lgbm_effnet_tuned: 0.91343  
lgbm_effnet_tuned_dropped: 0.914  
lgbm_effnet_tuned_theta: 0.9099249788193785  
lgbm_effnet_tuned_smallLR (dropped): 0.9155745942231474  
lgbm_effnet_best_dropped: 0.9287606420966457  
lgbm_effnet_best: 0.9272011609223824

## モデルを保存する

In [9]:
import lightgbm

# Train one model per target on every fold except `target_fold` and pickle it.
target_fold=0
train_idx=(fold["folds"]!=target_fold)
val_idx=(fold["folds"]==target_fold)

X_train,X_val=X_dropped[train_idx],X_dropped[val_idx]
y_train,y_val=train[train_idx],train[val_idx]

for i,col_name in enumerate(CFG.target_cols):
    y_train_col=y_train[col_name]
    y_val_col=y_val[col_name]

    lgb_train=lightgbm.Dataset(X_train,label=y_train_col)
    lgb_val=lightgbm.Dataset(X_val,label=y_val_col,reference=lgb_train)

    # NOTE(review): parameters are read from lgbm_effnet_best_dropped/, while
    # optimize_params as written saves them under lgbm_effnet_best/ — confirm
    # the intended experiment directory.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    # Both files are handled with `with` so the handles are closed and the
    # model file is fully flushed to disk.
    with open(f"{CFG.models_dir}lgbm_effnet_best_dropped/params_{i}.pickle","rb") as f:
        params=pickle.load(f)
    params["early_stopping_rounds"]=1000

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_val,verbose_eval=False)
    with open(f"{CFG.models_dir}lgbm_effnet_best_dropped/model_{i}.pickle","wb") as f:
        pickle.dump(model,f)

: -inf
[LightGBM] [Info] Number of positive: 6477, number of negative: 20597
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15555
[LightGBM] [Info] Number of data points in the train set: 27074, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.239233 -> initscore=-1.156888
[LightGBM] [Info] Start training from score -1.156888
[LightGBM] [Info] Number of positive: 244, number of negative: 26830
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15555
[LightGBM] [Info] Number of data points in the train set: 27074, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009012 -> initscore=-4.700108
[LightGBM] [Info] Start training from score -4.700108
[LightGBM] [Info] Number of positive: 464, number of negative: 26610
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15555
[LightGBM] [Info] Number of data points in the train

In [15]:
# Feature importance of every saved model: one column per target, one row per
# feature position as reported by the model.
importances=pd.DataFrame(columns=CFG.target_cols)

for i,col_name in enumerate(CFG.target_cols):
    # Path is built from CFG.models_dir like the other cells (same location as
    # the previously hard-coded "./models/"); `with` closes the file handle.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    # NOTE(review): the saving cell writes models to lgbm_effnet_best_dropped/,
    # this cell reads lgbm_effnet_best/ — confirm the intended directory.
    with open(f"{CFG.models_dir}lgbm_effnet_best/model_{i}.pickle","rb") as f:
        model=pickle.load(f)
    importances[col_name]=pd.Series(model.feature_importance())

# Keep only the features whose importance sums to zero over all targets:
# `where` blanks every other row to NaN and `dropna(how="all")` removes them.
valueless_rows=importances.where(importances.sum(axis=1)==0).dropna(how="all")
display(valueless_rows.index)

Int64Index([ 3,  4,  6,  8,  9, 13, 14, 17, 18, 23, 27, 35, 36, 37, 38, 44, 45,
            47, 50, 51, 52, 57, 58, 61, 62, 68, 72, 73, 74, 76, 85, 86, 87, 90,
            91, 92, 98, 99],
           dtype='int64')