In [11]:
import pandas as pd
import numpy as np
from IPython.display import display
import pickle
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from tensorflow.keras import models

In [14]:
class CFG:
    """Notebook-wide settings: input/model locations, fold count and label column names."""

    # Directory the training CSV is read from.
    dataset_dir = "../input/ranzcr-clip-catheter-line-classification/"
    # Directory holding the pickled scaler, the autoencoder and the LightGBM parameter files.
    models_dir = "./models/"
    # Number of splits passed to StratifiedKFold.
    n_folds = 4
    # Target label column names; the ones containing "CVC" are combined into "CVC Present".
    target_cols = [
        "ETT - Abnormal",
        "ETT - Borderline",
        "ETT - Normal",
        "NGT - Abnormal",
        "NGT - Borderline",
        "NGT - Incompletely Imaged",
        "NGT - Normal",
        "CVC - Abnormal",
        "CVC - Borderline",
        "CVC - Normal",
        "Swan Ganz Catheter Present",
    ]

In [3]:
# Read the training table (labels and StudyInstanceUID) from the dataset directory.
train_csv_path = f"{CFG.dataset_dir}train.csv"
train = pd.read_csv(train_csv_path)

In [4]:
# Collapse the CVC label columns into a single binary "CVC Present" flag:
# 0 when the CVC labels of a row sum to zero, 1 otherwise.
cvc_cols = [name for name in CFG.target_cols if "CVC" in name]

# Rows whose CVC labels sum to zero.
false_idx = train[cvc_cols].sum(axis=1).eq(0)
train["CVC Present"] = (~false_idx).astype("int64")
display(train["CVC Present"].value_counts())

1    29333
0      750
Name: CVC Present, dtype: int64

In [6]:
def get_fold(train):
    """Return a copy of ``train`` with an integer ``folds`` column.

    Rows are split with ``StratifiedKFold(n_splits=CFG.n_folds)``, stratified on the
    ``"CVC Present"`` column. Each row's ``folds`` value is the number of the split in
    which that row belongs to the validation part.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing a ``"CVC Present"`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` with the extra ``folds`` column; ``train`` itself is not modified.
    """
    fold = train.copy()
    splitter = StratifiedKFold(n_splits=CFG.n_folds)

    # split() yields *positional* row indices, so the fold numbers are written into a
    # plain array by position. Label-based `.loc` would only be correct for a default
    # RangeIndex and would hit the wrong rows (or fail) for any other index.
    fold_ids = np.full(len(train), -1, dtype=int)
    for n, (_, val_idx) in enumerate(splitter.split(train, train["CVC Present"])):
        fold_ids[val_idx] = n

    fold["folds"] = fold_ids
    return fold

fold=get_fold(train)

In [12]:
# Collect one array per training row from the .npz archive, keyed by StudyInstanceUID
# and kept in the row order of `train`.
# The archive is opened in a `with` block so its file handle is closed once every
# array has been read; `npz` is therefore closed after this cell.
with np.load("../input/effnet_tuned_output.npz") as npz:
    features_list=[npz[uid] for uid in tqdm(train["StudyInstanceUID"])]
features=np.array(features_list)

100%|██████████| 30083/30083 [01:01<00:00, 492.60it/s]


In [16]:
def compress_with_autoencoder(features):
    """Scale ``features`` and return the autoencoder's ``dense_1`` activations.

    The pickled scaler and the autoencoder (architecture from ``model.json``, weights
    from ``ckpt``) are loaded from ``CFG.models_dir`` on every call.

    Parameters
    ----------
    features : array-like
        Input passed to the pickled scaler's ``transform``.

    Returns
    -------
    pandas.DataFrame
        Output of the ``dense_1`` layer for the scaled input, with default integer
        row and column labels.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    # `with` closes the file handle that the bare open() call used to leak.
    with open(f"{CFG.models_dir}minmaxscaler_effnet_tuned.pickle","rb") as f:
        scaler=pickle.load(f)
    X=scaler.transform(features)

    autoencoder_dir=f"{CFG.models_dir}autoencoder_tuned/"
    with open(f"{autoencoder_dir}model.json","rt") as f:
        model_json=f.read()
    autoencoder=models.model_from_json(model_json)
    autoencoder.load_weights(f"{autoencoder_dir}ckpt")

    # Sub-model that stops at the layer whose output is used as the compressed representation.
    layer_name="dense_1"
    compressing_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

    ae_pred=compressing_model.predict(X)
    ae_pred_df=pd.DataFrame(ae_pred)

    return ae_pred_df

# Column labels of the compressed representation that are removed before modelling
# (presumably they carry no useful signal — TODO confirm how this list was derived).
valuless_columns = [
    4, 7, 9, 12, 13, 20, 25, 27, 30, 31, 36, 44, 47,
    48, 51, 54, 64, 65, 71, 73, 74, 75, 89, 92, 97,
]

# Compress the features with the autoencoder, then drop the listed columns.
X = compress_with_autoencoder(features).drop(columns=valuless_columns)
display(X)



Unnamed: 0,0,1,2,3,5,6,8,10,11,14,...,87,88,90,91,93,94,95,96,98,99
0,1.843573,0.762214,0.869982,1.386810,1.369750,0.943624,0.868235,0.0,2.362543,1.963967,...,0.908345,1.251791,1.660741,0.820461,1.360656,2.405944,1.016294,0.000000,1.393855,0.974065
1,1.316493,1.157231,3.745368,2.056508,1.526756,1.638721,2.746008,0.0,0.728871,0.248490,...,1.174902,2.501284,1.452303,1.189284,2.883156,1.588075,0.725331,0.163878,1.702431,2.194259
2,0.846063,1.605237,0.805049,0.902661,1.622584,1.147530,1.275458,0.0,2.153320,0.926055,...,1.184017,0.880394,1.806954,1.442235,0.801363,1.626994,0.568042,0.000000,1.896320,0.976791
3,1.093408,1.223410,1.874324,2.219434,0.879066,2.298780,1.477697,0.0,2.164646,2.043216,...,1.126441,1.279361,2.040087,1.496110,1.981401,1.202085,0.585374,0.113964,1.038824,0.465566
4,1.628255,1.978409,1.783667,0.579625,1.048298,1.604363,1.627460,0.0,1.761667,0.639846,...,1.444271,1.143819,1.492713,1.309319,0.426623,2.758231,0.724342,0.000000,1.823411,1.396208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.228411,1.137532,1.340785,1.927307,2.104715,0.949438,2.384785,0.0,1.818742,1.342715,...,1.269412,2.555556,2.009609,1.011214,1.135506,1.322723,0.672355,0.000000,1.523899,0.802592
30079,1.395945,1.438516,2.053834,1.840734,1.080493,2.201542,1.104149,0.0,2.985705,1.766386,...,1.134073,1.053623,1.217309,1.292022,1.524583,0.577702,0.827970,0.000000,1.983905,1.054130
30080,0.937448,1.242203,2.101459,1.849006,1.106512,1.358821,2.112649,0.0,1.165469,0.969044,...,2.323654,3.102413,2.430577,1.500344,2.230500,2.033732,0.599059,0.000000,1.299392,1.479758
30081,1.151241,1.584572,0.763552,0.840122,1.344241,0.730238,1.048365,0.0,1.512467,1.206768,...,1.418048,1.720290,1.908568,1.478042,0.804574,1.925837,0.350558,0.270158,2.035101,1.448286


In [19]:
%%time

from optuna.integration import lightgbm as lgb

# Fold held out as the validation set while tuning; all other folds are used for fitting.
n=0
train_idx=(fold["folds"]!=n)
val_idx=(fold["folds"]==n)

def optimize_params():
    """Tune LightGBM parameters on fold ``n`` and pickle the resulting parameter dict.

    Reads the module-level ``X``, ``train``, ``train_idx``, ``val_idx`` and ``n``.
    ``lgb`` is ``optuna.integration.lightgbm`` (imported in this cell), so ``lgb.train``
    is Optuna's replacement for ``lightgbm.train``.

    The parameters are written to
    ``{CFG.models_dir}lgbm_cvc_presence/params_{n}.pickle``, which is the location the
    cross-validation cell loads them from. They were previously written to ``lgbm_cvc/``,
    so the tuned values never reached that cell.
    """
    import os  # local import: only needed to make sure the output directory exists

    X_train=X[train_idx]
    X_val=X[val_idx]
    y_train=train["CVC Present"][train_idx]
    y_val=train["CVC Present"][val_idx]

    lgb_train=lgb.Dataset(X_train,label=y_train)
    lgb_test=lgb.Dataset(X_val,label=y_val,reference=lgb_train)
    
    params={
        "task":"train",
        "boosting_type":"gbdt",
        "objective":"binary",
        "metric":"auc",
        "learning_rate":0.1,
        "num_iterations":100
        # "early_stopping_rounds":200, # best_iteration is not saved unless early_stopping_rounds is specified
    }

    opt=lgb.train(params,lgb_train,valid_sets=lgb_test, verbose_eval=50)

    params_dir=f"{CFG.models_dir}lgbm_cvc_presence/"
    os.makedirs(params_dir,exist_ok=True)
    # `with` closes the file, so the pickle is fully flushed before anything reads it.
    with open(f"{params_dir}params_{n}.pickle","wb") as f:
        pickle.dump(opt.params,f)

optimize_params()

0]	valid_0's auc: 0.931295
regularization_factors, val_score: 0.943094:  55%|#####5    | 11/20 [00:38<00:33,  3.75s/it][32m[I 2021-03-10 05:09:55,390][0m Trial 53 finished with value: 0.9360360190453532 and parameters: {'lambda_l1': 1.057180282765046e-08, 'lambda_l2': 0.00476326831230927}. Best is trial 46 with value: 0.9430938833776777.[0m
regularization_factors, val_score: 0.943094:  55%|#####5    | 11/20 [00:38<00:33,  3.75s/it][100]	valid_0's auc: 0.936036
[LightGBM] [Info] Number of positive: 22000, number of negative: 562
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18916
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.975091 -> initscore=3.667296
[LightGBM] [Info] Start training from score 3.667296
[50]	valid_0's auc: 0.936909
regularization_factors, val_score: 0.943104:  60%|######    | 12/20 [00:42<00:30,  3.78s/it][32m[I 2021-03-10 05:09

In [21]:
import lightgbm


def get_pred(train,val):
    """Fit LightGBM on one split and score it on the held-out part.

    Parameters
    ----------
    train : tuple
        ``(X_train, y_train)`` used for fitting.
    val : tuple
        ``(X_val, y_val)`` used as the validation set during training and for scoring.

    Returns
    -------
    tuple
        ``(pred, auc)``: the model's predictions for ``X_val`` and their ROC AUC
        against ``y_val``.
    """
    X_train,y_train=train
    X_val,y_val=val

    lgb_train=lightgbm.Dataset(X_train,label=y_train)
    lgb_test=lightgbm.Dataset(X_val,label=y_val,reference=lgb_train)

    # The parameter file saved for fold 0 is reused for whichever split is passed in.
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    # `with` closes the file handle that the bare open() call used to leak.
    with open(f"{CFG.models_dir}lgbm_cvc_presence/params_0.pickle","rb") as f:
        params=pickle.load(f)
    params["early_stopping_rounds"]=50

    lgbm_model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=10)
    pred=lgbm_model.predict(X_val)
    auc=roc_auc_score(y_val,pred)

    return pred,auc


# Cross-validate: each fold in turn is the validation set, the remaining folds are used for fitting.
# Scores are collected in a dict and turned into a frame once, so the column is float
# rather than the object dtype produced by growing an empty DataFrame row by row.
fold_aucs={}

for n in range(CFG.n_folds):
    print(f"\nfold - {n}")
    train_idx=(fold["folds"]!=n)
    val_idx=(fold["folds"]==n)

    train_data=X[train_idx],train["CVC Present"][train_idx]
    val_data=X[val_idx],train["CVC Present"][val_idx]

    _,auc=get_pred(train=train_data,val=val_data)

    fold_aucs[f"fold_{n}"]=auc


results=pd.DataFrame({"CVC Present":pd.Series(fold_aucs,dtype=float)})

# Per-fold AUC table followed by the mean over folds.
display(results,results.mean(axis=0))


fold - 0
[LightGBM] [Info] Number of positive: 22000, number of negative: 562
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18916
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.975091 -> initscore=3.667296
[LightGBM] [Info] Start training from score 3.667296
Training until validation scores don't improve for 50 rounds
[10]	valid_0's auc: 0.92831
[20]	valid_0's auc: 0.933375
[30]	valid_0's auc: 0.936976
[40]	valid_0's auc: 0.939663
[50]	valid_0's auc: 0.940203
[60]	valid_0's auc: 0.941834
[70]	valid_0's auc: 0.943371
[80]	valid_0's auc: 0.943742
[90]	valid_0's auc: 0.943961
[100]	valid_0's auc: 0.944287
Did not meet early stopping. Best iteration is:
[92]	valid_0's auc: 0.945379

fold - 1
[LightGBM] [Info] Number of positive: 22000, number of negative: 562
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18921
[Li

Unnamed: 0,CVC Present
fold_0,0.945379
fold_1,0.932026
fold_2,0.936827
fold_3,0.937909


CVC Present    0.938035
dtype: float64