In [13]:
import pandas as pd
import numpy as np
from IPython.display import display
import pickle
from tqdm import tqdm

from tensorflow.keras import models
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

In [11]:
class CFG:
    dataset_dir="../input/ranzcr-clip-catheter-line-classification/"
    models_dir="./models/"
    n_folds=3
    target_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal', 'NGT - Abnormal', 'NGT - Borderline',
       'NGT - Incompletely Imaged', 'NGT - Normal', 'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal', 'Swan Ganz Catheter Present']
    cvc_cols=['CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal']
    non_cvc_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal', 'NGT - Abnormal', 'NGT - Borderline',
        'NGT - Incompletely Imaged', 'NGT - Normal', 'Swan Ganz Catheter Present']

In [3]:
train=pd.read_csv(f"{CFG.dataset_dir}train.csv")

In [4]:
material_dir="../input/ensemble_material/"
cvc=pd.read_csv(material_dir+"cvc_effnet.csv")
lgbm=pd.read_csv(material_dir+"effnet_lgbm.csv")
dense=pd.read_csv(material_dir+"effnet_dense.csv")

cvc=cvc[CFG.cvc_cols]
lgbm=lgbm[CFG.target_cols]
dense=dense[CFG.target_cols]

In [8]:
npz=np.load("../input/effnet_best_output.npz")
features=[npz[uid] for uid in tqdm(train["StudyInstanceUID"])]
features=np.array(features)

100%|██████████| 30083/30083 [01:02<00:00, 483.57it/s]


In [14]:
def compress_with_autoencoder(features):
    scaler=pickle.load(open(f"{CFG.models_dir}minmaxscaler_effnet_best.pickle","rb"))
    X=scaler.transform(features)

    autoencoder_dir=f"{CFG.models_dir}autoencoder_best/"
    with open(f"{autoencoder_dir}model.json","rt") as f:
        model_json=f.read()
    autoencoder=models.model_from_json(model_json)
    autoencoder.load_weights(f"{autoencoder_dir}ckpt")

    layer_name="dense_1"
    compressing_model=models.Model(inputs=autoencoder.input,outputs=autoencoder.get_layer(layer_name).output)

    ae_pred=compressing_model.predict(X)
    ae_pred_df=pd.DataFrame(ae_pred)

    return ae_pred_df

X=compress_with_autoencoder(features)



In [15]:
valuless_columns=[3, 4, 6, 8, 9, 13, 14, 17, 18, 23, 27, 35, 36, 37, 38, 44, 45,
            47, 50, 51, 52, 57, 58, 61, 62, 67, 68, 72, 73, 74, 76, 85, 86, 87,
            90, 91, 92, 98, 99]
X_dropped=X.drop(columns=valuless_columns)
display(X_dropped)

Unnamed: 0,0,1,2,5,7,10,11,12,15,16,...,82,83,84,88,89,93,94,95,96,97
0,2.673167,2.893720,0.450785,1.060405,2.383530,0.602804,0.247274,4.546686,2.601023,2.746938,...,2.845101,1.222309,2.439303,0.743159,5.553790,1.139015,2.191016,1.756449,1.531280,5.458642
1,2.044053,1.257797,4.117229,1.095967,2.815526,2.502600,2.922100,1.949331,4.941265,0.661706,...,1.585460,3.151552,4.557871,2.256856,1.380924,2.510919,1.794880,1.713382,1.276317,0.395079
2,0.957066,2.805675,2.284648,0.570206,1.604300,1.607768,1.134656,3.969714,2.496226,1.417868,...,1.166087,0.489565,1.096341,0.000000,1.794171,1.730868,2.893358,1.455244,1.804047,2.715654
3,0.934222,4.424170,3.021049,0.521623,2.399590,1.063175,0.865115,3.438043,2.433294,3.138260,...,1.842543,1.764457,2.348570,0.000000,3.490625,2.038467,2.977322,1.248150,1.353670,2.733515
4,1.122390,1.620926,0.552111,0.531280,1.199930,0.820968,1.542056,1.122361,2.030983,1.158927,...,0.644790,0.362131,0.758877,0.000000,1.675901,2.432779,0.477025,3.001572,1.154578,2.067586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,1.525124,1.713859,2.593170,1.078913,2.174997,2.792061,3.900610,1.244418,5.658386,0.520775,...,2.301839,3.729995,2.448350,1.218580,0.727766,3.751309,1.893991,1.452567,0.570655,1.322180
30079,0.471860,2.453489,0.344327,0.325728,1.755586,0.819340,0.414095,1.660657,2.043858,1.444347,...,0.008733,1.224563,1.617114,0.000000,2.865466,2.107560,0.623374,2.476524,2.001624,1.053715
30080,1.988157,2.386232,2.376373,1.295946,2.409492,3.035921,3.528997,6.007380,4.470945,0.981653,...,2.296206,4.804539,3.810356,0.576834,1.762171,4.163200,2.853935,1.027527,1.148571,2.443722
30081,1.163135,3.123357,0.631640,0.325775,1.266896,1.538262,0.813791,4.718347,2.864160,2.886163,...,0.812288,0.145438,0.807039,0.000000,2.420322,0.766864,2.288523,1.675684,2.482395,3.231958


In [17]:
for i,col_name in enumerate(CFG.target_cols):
    lgbm_model=pickle.load(open(f"./models/lgbm_effnet_best_dropped/model_{i}.pickle","rb"))
    pred=lgbm_model.predict(X_dropped)
    score=roc_auc_score(train[col_name],lgbm[col_name])
    print(score)

0.417351737067969
0.46531619722393325
0.7237285719019306
0.6194439874563997
0.4587642459741439
0.49724948872805447
0.6239400619019727
0.7498967548222989
0.41365159122110695
0.6771418530114289
0.6225676781580223


In [10]:
scaler=StandardScaler()

cvc_norm=pd.DataFrame(scaler.fit_transform(cvc),columns=CFG.cvc_cols)
lgbm_norm=pd.DataFrame(scaler.fit_transform(lgbm),columns=CFG.target_cols)
dense_norm=pd.DataFrame(scaler.fit_transform(dense),columns=CFG.target_cols)

In [39]:
def get_fold(train):
    fold=train.copy()
    splitter=GroupKFold(n_splits=CFG.n_folds)
    for n,(train_idx,val_idx) in enumerate(splitter.split(train,groups=train["PatientID"])):
        fold.loc[val_idx,"folds"]=n
    fold["folds"]=fold["folds"].astype(int)
    return fold

fold=get_fold(train)

In [35]:
%%time

def get_param_results_2(train_idx,col_name):
    results_list=[]
    for p in [i*0.01 for i in range(101)]:
        ensemble_pred=p*lgbm_norm[col_name][train_idx]+(1-p)*dense_norm[col_name][train_idx]
        score=roc_auc_score(train[col_name][train_idx],ensemble_pred)
        results_list.append([p,score])
    results=pd.DataFrame(results_list,columns=["p","score"])
    return results

def get_param_results_3(train_idx,col_name):
    results_list=[]
    for p in [i*0.01 for i in range(101)]:
        rest=(1-p)*100
        for q in [i*0.01 for i in range(int(rest)+1)]:
            ensemble_pred=p*lgbm_norm[col_name]+q*dense_norm[col_name]+(1-p-q)*cvc_norm[col_name]
            score=roc_auc_score(train[col_name],ensemble_pred)
            results_list.append([p,q,score])
    results=pd.DataFrame(results_list,columns=["p","q","score"])
    return results

CVC - Abnormal 0.0 0.1
CVC - Borderline 0.0 0.11
CVC - Normal 0.0 0.16
Wall time: 2min 47s


'\nfor col_name in CFG.non_cvc_cols:\n    results=get_param_results_2(col_name)\n    best_p=results.at[results["score"].idxmax(),"p"]\n    print(col_name,best_p)\n'

In [48]:
%%time

best_params_list=[]

for n in range(CFG.n_folds):
    train_idx=(fold["folds"]!=n)
    best_params=pd.DataFrame(index=["p","q"],columns=CFG.target_cols)
    for col_name in CFG.non_cvc_cols:
        results=get_param_results_2(col_name)
        best_p=results.at[results["score"].idxmax(),"p"]
        best_params.at["p",col_name]=best_p

    for col_name in CFG.cvc_cols:
        results=get_param_results_3(col_name)
        best_p=results.at[results["score"].idxmax(),"p"]
        best_q=results.at[results["score"].idxmax(),"q"]
        best_params.at["p",col_name]=best_p
        best_params.at["q",col_name]=best_q
    
    best_params_list.append(best_params)
    
for df in best_params_list:
    display(df)

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
p,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
q,,,,,,,,0.1,0.11,0.16,


Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
p,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
q,,,,,,,,0.1,0.11,0.16,


Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
p,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
q,,,,,,,,0.1,0.11,0.16,


Wall time: 11min 43s


In [1]:
best_params={
    'ETT - Abnormal':0,
    'ETT - Borderline':0,
    'ETT - Normal':0,
    'NGT - Abnormal':0,
    'NGT - Borderline':0.01,
    'NGT - Incompletely Imaged':0.01,
    'NGT - Normal':0,
    'CVC - Abnormal':[0,0.1],
    'CVC - Borderline':[0,0.11],
    'CVC - Normal':[0,0.16],
    'Swan Ganz Catheter Present':0
}

In [5]:
%%time

def get_param_results_2(col_name):
    p=best_params[col_name]
    ensemble_pred=p*lgbm_norm[col_name]+(1-p)*dense_norm[col_name]
    score=roc_auc_score(train[col_name],ensemble_pred)
    return score

def get_param_results_3(col_name):
    p=best_params[col_name][0]
    q=best_params[col_name][1]
    ensemble_pred=p*lgbm_norm[col_name]+q*dense_norm[col_name]+(1-p-q)*cvc_norm[col_name]
    score=roc_auc_score(train[col_name],ensemble_pred)
    return score

Wall time: 0 ns


In [17]:
auc=pd.DataFrame(columns=CFG.target_cols)

for col_name in CFG.target_cols:
    auc.loc["dense",col_name]=roc_auc_score(train[col_name],dense_norm[col_name])
    auc.loc["lgbm",col_name]=roc_auc_score(train[col_name],lgbm_norm[col_name])

for col_name in CFG.cvc_cols:
    auc.loc["ensemble",col_name]=get_param_results_3(col_name)
for col_name in CFG.non_cvc_cols:
    auc.loc["ensemble",col_name]=get_param_results_2(col_name)
display(auc,auc.mean(axis=1))

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
dense,0.986835,0.984116,0.996592,0.978768,0.970776,0.991322,0.991293,0.943553,0.904939,0.952546,0.999313
lgbm,0.417352,0.465316,0.723729,0.619444,0.458764,0.497249,0.62394,0.749897,0.413652,0.677142,0.622568
ensemble,0.986835,0.984116,0.996592,0.978768,0.971097,0.991408,0.991293,0.97648,0.950332,0.97773,0.999313


dense       0.972732
lgbm        0.569914
ensemble    0.982179
dtype: float64

get_param_results_3: **55s** for one column