In [6]:
import pandas as pd
import numpy as np
from IPython.display import display
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

In [7]:
class CFG:
    dataset_dir="../input/ranzcr-clip-catheter-line-classification/"
    n_folds=3
    target_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal', 'NGT - Abnormal', 'NGT - Borderline',
       'NGT - Incompletely Imaged', 'NGT - Normal', 'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal', 'Swan Ganz Catheter Present']
    cvc_cols=['CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal']
    non_cvc_cols=['ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal', 'NGT - Abnormal', 'NGT - Borderline',
        'NGT - Incompletely Imaged', 'NGT - Normal', 'Swan Ganz Catheter Present']

In [8]:
train=pd.read_csv(f"{CFG.dataset_dir}train.csv")

In [9]:
material_dir="../input/ensemble_material/"
cvc=pd.read_csv(material_dir+"cvc_effnet.csv")
lgbm=pd.read_csv(material_dir+"effnet_lgbm.csv")
dense=pd.read_csv(material_dir+"effnet_dense.csv")

cvc=cvc[CFG.cvc_cols]
lgbm=lgbm[CFG.target_cols]
dense=dense[CFG.target_cols]

In [10]:
scaler=StandardScaler()

cvc_norm=pd.DataFrame(scaler.fit_transform(cvc),columns=CFG.cvc_cols)
lgbm_norm=pd.DataFrame(scaler.fit_transform(lgbm),columns=CFG.target_cols)
dense_norm=pd.DataFrame(scaler.fit_transform(dense),columns=CFG.target_cols)

In [39]:
def get_fold(train):
    fold=train.copy()
    splitter=GroupKFold(n_splits=CFG.n_folds)
    for n,(train_idx,val_idx) in enumerate(splitter.split(train,groups=train["PatientID"])):
        fold.loc[val_idx,"folds"]=n
    fold["folds"]=fold["folds"].astype(int)
    return fold

fold=get_fold(train)

In [35]:
%%time

def get_param_results_2(train_idx,col_name):
    results_list=[]
    for p in [i*0.01 for i in range(101)]:
        ensemble_pred=p*lgbm_norm[col_name][train_idx]+(1-p)*dense_norm[col_name][train_idx]
        score=roc_auc_score(train[col_name][train_idx],ensemble_pred)
        results_list.append([p,score])
    results=pd.DataFrame(results_list,columns=["p","score"])
    return results

def get_param_results_3(train_idx,col_name):
    results_list=[]
    for p in [i*0.01 for i in range(101)]:
        rest=(1-p)*100
        for q in [i*0.01 for i in range(int(rest)+1)]:
            ensemble_pred=p*lgbm_norm[col_name]+q*dense_norm[col_name]+(1-p-q)*cvc_norm[col_name]
            score=roc_auc_score(train[col_name],ensemble_pred)
            results_list.append([p,q,score])
    results=pd.DataFrame(results_list,columns=["p","q","score"])
    return results

CVC - Abnormal 0.0 0.1
CVC - Borderline 0.0 0.11
CVC - Normal 0.0 0.16
Wall time: 2min 47s


'\nfor col_name in CFG.non_cvc_cols:\n    results=get_param_results_2(col_name)\n    best_p=results.at[results["score"].idxmax(),"p"]\n    print(col_name,best_p)\n'

In [48]:
%%time

best_params_list=[]

for n in range(CFG.n_folds):
    train_idx=(fold["folds"]!=n)
    best_params=pd.DataFrame(index=["p","q"],columns=CFG.target_cols)
    for col_name in CFG.non_cvc_cols:
        results=get_param_results_2(col_name)
        best_p=results.at[results["score"].idxmax(),"p"]
        best_params.at["p",col_name]=best_p

    for col_name in CFG.cvc_cols:
        results=get_param_results_3(col_name)
        best_p=results.at[results["score"].idxmax(),"p"]
        best_q=results.at[results["score"].idxmax(),"q"]
        best_params.at["p",col_name]=best_p
        best_params.at["q",col_name]=best_q
    
    best_params_list.append(best_params)
    
for df in best_params_list:
    display(df)

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
p,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
q,,,,,,,,0.1,0.11,0.16,


Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
p,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
q,,,,,,,,0.1,0.11,0.16,


Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
p,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
q,,,,,,,,0.1,0.11,0.16,


Wall time: 11min 43s


In [1]:
best_params={
    'ETT - Abnormal':0,
    'ETT - Borderline':0,
    'ETT - Normal':0,
    'NGT - Abnormal':0,
    'NGT - Borderline':0.01,
    'NGT - Incompletely Imaged':0.01,
    'NGT - Normal':0,
    'CVC - Abnormal':[0,0.1],
    'CVC - Borderline':[0,0.11],
    'CVC - Normal':[0,0.16],
    'Swan Ganz Catheter Present':0
}

In [5]:
%%time

def get_param_results_2(col_name):
    p=best_params[col_name]
    ensemble_pred=p*lgbm_norm[col_name]+(1-p)*dense_norm[col_name]
    score=roc_auc_score(train[col_name],ensemble_pred)
    return score

def get_param_results_3(col_name):
    p=best_params[col_name][0]
    q=best_params[col_name][1]
    ensemble_pred=p*lgbm_norm[col_name]+q*dense_norm[col_name]+(1-p-q)*cvc_norm[col_name]
    score=roc_auc_score(train[col_name],ensemble_pred)
    return score

Wall time: 0 ns


In [17]:
auc=pd.DataFrame(columns=CFG.target_cols)

for col_name in CFG.target_cols:
    auc.loc["dense",col_name]=roc_auc_score(train[col_name],dense_norm[col_name])
    auc.loc["lgbm",col_name]=roc_auc_score(train[col_name],lgbm_norm[col_name])

for col_name in CFG.cvc_cols:
    auc.loc["ensemble",col_name]=get_param_results_3(col_name)
for col_name in CFG.non_cvc_cols:
    auc.loc["ensemble",col_name]=get_param_results_2(col_name)
display(auc,auc.mean(axis=1))

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
dense,0.986835,0.984116,0.996592,0.978768,0.970776,0.991322,0.991293,0.943553,0.904939,0.952546,0.999313
lgbm,0.417352,0.465316,0.723729,0.619444,0.458764,0.497249,0.62394,0.749897,0.413652,0.677142,0.622568
ensemble,0.986835,0.984116,0.996592,0.978768,0.971097,0.991408,0.991293,0.97648,0.950332,0.97773,0.999313


dense       0.972732
lgbm        0.569914
ensemble    0.982179
dtype: float64

get_param_results_3: **55s** for one column