In [1]:
import numpy as np
import pandas as pd
import os
from optuna.integration import lightgbm as lgb
import sys
from pprint import pprint
from tqdm import tqdm
import pickle
from IPython.display import display
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

## CFG

In [2]:
# --- Paths (all relative to the notebook's working directory) ---
# Directory containing one saved feature array per image.
numpy_input_dir = "../input/efficientnet_output_straight/"
# Competition dataset root (train.csv is read from here).
dataset_dir = "../input/ranzcr-clip-catheter-line-classification/"
# Destination for the fitted PCA, reduced features and LightGBM artifacts.
models_dir = "./models/"

# --- Cross-validation ---
n_folds = 4

# --- Target label columns, one binary classifier is trained per column ---
target_cols = [
    'ETT - Abnormal',
    'ETT - Borderline',
    'ETT - Normal',
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
    'CVC - Abnormal',
    'CVC - Borderline',
    'CVC - Normal',
    'Swan Ganz Catheter Present',
]


## PCAを行う

In [3]:
# Load every saved feature array and remember which study it belongs to.
# array_list[k] and uid_list[k] always describe the same file.
array_list=[]
uid_list=[]

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# (and therefore every artifact derived from it) reproducible between runs.
for file_name in tqdm(sorted(os.listdir(numpy_input_dir))):
    # Each file stores its feature vector as the first element of the saved array.
    features=np.load(numpy_input_dir+file_name)[0]
    array_list.append(features)
    # The file name without its extension is used as the StudyInstanceUID.
    uid_list.append(os.path.splitext(file_name)[0])

# Report the shape of the last loaded vector; guard against an empty directory,
# in which case `features` would never have been assigned.
if array_list:
    print("\n",array_list[-1].shape)
else:
    print("\n","no feature files found in",numpy_input_dir)

100%|██████████| 30083/30083 [00:28<00:00, 1062.18it/s]
 (2560,)



np.array()を介してDataFrameにすることで高速になる

In [4]:
# Stack the per-image vectors into one 2-D array first: building the DataFrame
# from a single ndarray is much faster than from a list of arrays.
df = pd.DataFrame(np.array(array_list))

# One-column frame of study identifiers, row-aligned with the feature frame.
df_uid = pd.DataFrame(np.array(uid_list), columns=["StudyInstanceUID"])

# Independent copy of the feature matrix used as PCA input.
df_features = df.copy()
df_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2550,2551,2552,2553,2554,2555,2556,2557,2558,2559
0,0.396515,-0.054081,-0.015864,-0.106942,-0.060530,0.121100,-0.227624,0.119775,-0.090709,-0.079812,...,0.168812,-0.003200,0.125735,-0.217831,-0.186626,0.168541,-0.043564,-0.115363,-0.184646,0.245675
1,0.164006,0.140693,0.032105,-0.176430,-0.083020,0.047258,-0.017787,-0.090380,0.048646,0.035545,...,0.261479,-0.145750,-0.110239,-0.054668,-0.216384,-0.007575,0.334953,-0.193806,0.160989,-0.130775
2,0.002677,0.376625,-0.225340,-0.070885,-0.051425,0.377899,-0.063307,-0.041903,-0.083185,-0.138833,...,0.111567,-0.160592,0.377506,0.044940,-0.054765,-0.200121,0.141178,-0.182268,-0.091767,0.397546
3,-0.060532,0.177028,0.072606,-0.004313,-0.055582,0.114975,0.007457,0.102090,0.064961,0.071155,...,0.569302,-0.009915,0.210871,-0.088671,-0.218210,-0.147902,0.417571,-0.120849,0.276294,0.028954
4,0.183061,0.271571,0.215534,-0.051617,-0.083200,0.140492,-0.045176,0.281804,0.024921,0.037777,...,0.553023,-0.149797,0.425991,-0.147488,-0.218458,0.299213,0.466869,-0.127596,0.423303,0.073130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30078,-0.057766,0.534434,0.078688,0.108312,0.194719,0.133313,0.076421,0.149076,0.127926,-0.080206,...,0.381842,-0.090594,0.357778,-0.138506,-0.131539,0.060329,0.879204,0.100835,0.299556,0.220035
30079,-0.116204,-0.021042,0.059986,-0.098724,-0.132803,0.080919,-0.158249,0.273661,0.007932,-0.158452,...,0.152788,-0.164298,0.494543,-0.111861,-0.191273,0.219035,0.041929,-0.065143,0.029123,0.231337
30080,-0.064648,0.428484,-0.025759,-0.100176,-0.231685,0.123197,0.082175,0.046571,0.167706,-0.004723,...,0.221434,-0.137481,0.269945,0.005036,-0.252102,0.196983,-0.048806,-0.153831,0.412776,0.585521
30081,0.335530,0.154176,0.348478,0.058384,-0.039007,0.251613,-0.002622,0.030806,0.233821,0.127677,...,0.550035,0.038085,0.252633,-0.123222,-0.154719,0.016147,0.176843,-0.006309,0.308135,0.190591


In [5]:
%%time
# Reduce the feature vectors to `n_components` principal components, then
# persist both the fitted PCA model and the reduced features.
n_components=75

# The artifacts below are written into models_dir; create it if it is missing.
os.makedirs(models_dir,exist_ok=True)

# Fix the seed: with this many samples/features scikit-learn may pick the
# randomized SVD solver, which otherwise gives slightly different components
# on every run and silently invalidates previously saved downstream models.
pca=PCA(n_components=n_components,random_state=0)
pca.fit(df_features)

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(f"{models_dir}pca_model_{n_components}.pickle","wb") as pca_file:
    pickle.dump(pca,pca_file)

features_pca=pca.transform(df_features)

# Store the reduced features next to their StudyInstanceUID so they can be
# merged with the labels later.
pd.concat([df_uid,pd.DataFrame(features_pca)],axis=1).to_csv(f"{models_dir}features_pca_{n_components}.csv",index=False)

Wall time: 6.65 s


累積寄与率を出力する

In [21]:
# Cumulative explained-variance ratio, with a leading 0 so that index k holds
# the variance explained by the first k components.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
ev_ratio = np.concatenate(([0.0], cumulative_ratio))

# Written to the current working directory.
pd.DataFrame(ev_ratio).to_csv("df_pca.csv")

## PCAで得られた特徴量をLightGBMにかける

### CV

In [3]:
# Read the labels and assign each row to a validation fold. Splitting is
# grouped by PatientID so that one patient never appears in two folds.
train = pd.read_csv(dataset_dir + "train.csv")

group_kfold = GroupKFold(n_splits=n_folds)
folds = train.copy()

patient_ids = train["PatientID"].values
for fold_number, (_, holdout_index) in enumerate(group_kfold.split(train, groups=patient_ids)):
    # Rows in the held-out part of split `fold_number` get that fold id.
    folds.loc[holdout_index, "fold"] = fold_number

# The column is created as float by the .loc assignment above; store it as int.
folds["fold"] = folds["fold"].astype(int)

### Optunaでハイパーパラメータを最適化する

In [9]:
# Number of PCA components to use and the fold held out for validation.
pca_count = 75
fold = 0

# Load the reduced features and attach them to the labels / fold assignment.
features_pca = (
    pd.read_csv(f"{models_dir}features_pca_{pca_count}.csv")
    .sort_values("StudyInstanceUID")
)
dataset = folds.merge(features_pca, on="StudyInstanceUID")

# Rows of the chosen fold are the validation set; everything else is training.
is_holdout = dataset["fold"] == fold
dataset_train = dataset[~is_holdout]
dataset_test = dataset[is_holdout]

# After the merge the PCA components are the last `pca_count` columns.
feature_slice = slice(-pca_count, None)
X_train = dataset_train.iloc[:, feature_slice]
X_test = dataset_test.iloc[:, feature_slice]

In [12]:
%%time

def optimize_params():
    """Tune LightGBM hyperparameters for every target column and save them.

    Uses the module-level ``dataset_train`` / ``dataset_test`` / ``X_train`` /
    ``X_test`` split. ``lgb`` here is Optuna's LightGBM integration, so
    ``lgb.train`` runs a hyperparameter search. The resulting ``params`` dict of
    each target is pickled to
    ``{models_dir}pca_{pca_count}/groupkfold/lgb_params_{col_index}.pickle``,
    where ``col_index`` is the target's column position in ``dataset_train``
    (the same key the prediction code uses to load it back).
    """
    output_dir=f"{models_dir}pca_{pca_count}/groupkfold/"
    # The nested output directory is not created anywhere else.
    os.makedirs(output_dir,exist_ok=True)

    for col_name in target_cols:
        # Look the column up by name instead of relying on hard-coded positions,
        # while keeping the positional index in the file name for compatibility.
        col_index=dataset_train.columns.get_loc(col_name)
        y_train=dataset_train[col_name]
        y_test=dataset_test[col_name]

        lgb_train=lgb.Dataset(X_train,label=y_train)
        lgb_test=lgb.Dataset(X_test,label=y_test,reference=lgb_train)
        
        params={
            "task":"train",
            "boosting_type":"gbdt",
            "objective":"binary",
            "metric":"auc",
            # "early_stopping_rounds":200, # best_iteration is not saved unless early_stopping_rounds is specified
        }

        opt=lgb.train(params,lgb_train,valid_sets=lgb_test,num_boost_round=100,verbose_eval=50)

        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(f"{output_dir}lgb_params_{col_index}.pickle","wb") as params_file:
            pickle.dump(opt.params,params_file)

optimize_params()

verhead of testing was 0.012525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19125
[LightGBM] [Info] Number of data points in the train set: 22562, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026460 -> initscore=-3.605288
[LightGBM] [Info] Start training from score -3.605288
[50]	valid_0's auc: 0.831762
regularization_factors, val_score: 0.852567:  25%|##5       | 5/20 [00:05<00:16,  1.08s/it][32m[I 2021-02-19 19:19:34,268][0m Trial 47 finished with value: 0.8323948356519978 and parameters: {'lambda_l1': 0.0009363080505649153, 'lambda_l2': 4.224612968862878}. Best is trial 45 with value: 0.852566745028573.[0m
regularization_factors, val_score: 0.852567:  25%|##5       | 5/20 [00:05<00:16,  1.08s/it][100]	valid_0's auc: 0.832395
[LightGBM] [Info] Number of positive: 597, number of negative: 21965
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19125
[LightGBM] [

### PCA - 200
最適化に55min 54s

### PCA - 100
最適化に26min 18s  
31min 51s

### PCA - 75
最適化に20min 2s  
23min 9s
16min 27s

### PCA - 50
最適化に13min 58s

### PCA - 5
最適化に3min 54s

### 得られたパラメータを用いて予測する

In [13]:
import lightgbm

def get_pred(train,test,col_name:str,pca_count:int):
    """Train one LightGBM model for ``col_name`` and score it on ``test``.

    Parameters
    ----------
    train, test : DataFrames holding the label columns and, as their last
        ``pca_count`` columns, the PCA features.
    col_name : name of the target column to predict.
    pca_count : number of trailing feature columns; also selects the directory
        the tuned parameters are loaded from.

    Returns
    -------
    (pred, auc) : the model's predictions for ``test`` and their ROC AUC
        against ``test[col_name]``.

    Note: ``test`` is used both as the early-stopping validation set and as the
    set the AUC is computed on.
    """
    X_train=train.iloc[:,-pca_count:]
    X_test=test.iloc[:,-pca_count:]
    y_train=train[col_name]
    y_test=test[col_name]

    # Tuned parameters are stored under the target's column position.
    col_index=test.columns.get_loc(col_name)

    lgb_train=lgb.Dataset(X_train,label=y_train)
    lgb_test=lgb.Dataset(X_test,label=y_test,reference=lgb_train)

    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
    # The context manager closes the file handle instead of leaking it.
    with open(f"{models_dir}pca_{pca_count}/groupkfold/lgb_params_{col_index}.pickle","rb") as params_file:
        params=pickle.load(params_file)
    params["early_stopping_rounds"]=100
    params["verbose"]=-1

    # Plain lightgbm (not the Optuna wrapper): train once with the tuned params.
    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=200)
    pred=model.predict(X_test)
    auc=roc_auc_score(y_test,pred)

    return pred,auc


# Cross-validated AUC for every (fold, target) pair using the tuned parameters.
features_pca=pd.read_csv(f"{models_dir}features_pca_{pca_count}.csv")
dataset=pd.merge(folds,features_pca,on="StudyInstanceUID")

# Collect one {target: auc} record per fold and build the frame once at the
# end; filling an empty frame cell by cell leaves it with object dtype.
auc_rows=[]
row_labels=[]

for n in range(n_folds):
    train_n=dataset[dataset["fold"]!=n]
    test_n=dataset[dataset["fold"]==n]

    fold_aucs={}
    for col_name in target_cols:
        pred,auc=get_pred(train=train_n,test=test_n,col_name=col_name,pca_count=pca_count)
        fold_aucs[col_name]=auc

    auc_rows.append(fold_aucs)
    row_labels.append(f"pca_{pca_count} - {n}")

results=pd.DataFrame(auc_rows,index=row_labels,columns=target_cols,dtype=float)

# Mean AUC per fold, then the mean over folds; computed once and reused.
fold_means=results.mean(axis=1)

display(results)
display(fold_means,fold_means.mean())

Unnamed: 0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
pca_75 - 0,0.808277,0.806389,0.874581,0.744426,0.699056,0.824968,0.858151,0.596288,0.586559,0.553155,0.852567
pca_75 - 1,0.6724,0.83563,0.858072,0.666539,0.693132,0.824185,0.866158,0.573148,0.562599,0.546986,0.83957
pca_75 - 2,0.733085,0.814242,0.872941,0.747333,0.72347,0.845974,0.842066,0.571967,0.580694,0.541147,0.832271
pca_75 - 3,0.618248,0.809592,0.885922,0.701075,0.73238,0.842144,0.857316,0.572106,0.583416,0.546906,0.838534


pca_75 - 0    0.745856
pca_75 - 1    0.721675
pca_75 - 2    0.736836
pca_75 - 3    0.726149
dtype: float64

0.7326286997407299

## モデルを保存する

In [13]:
import lightgbm


# Train and persist one model per target on the module-level
# dataset_train / dataset_test split (X_train / X_test are its feature columns).
for col_name in target_cols:
    y_train=dataset_train.loc[:,col_name]
    y_test=dataset_test.loc[:,col_name]

    # Params and models are stored under the target's column position.
    col_index=dataset_train.columns.get_loc(col_name)

    lgb_train=lgb.Dataset(X_train,label=y_train)
    lgb_test=lgb.Dataset(X_test,label=y_test,reference=lgb_train)
    
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
    # Context managers close the handles instead of leaking one per target.
    with open(f"{models_dir}pca_{pca_count}/groupkfold/lgb_params_{col_index}.pickle","rb") as params_file:
        params=pickle.load(params_file)
    params["early_stopping_rounds"]=100
    params["verbose"]=-1

    model=lightgbm.train(params,lgb_train,valid_sets=lgb_test,verbose_eval=200)

    with open(f"{models_dir}pca_{pca_count}/groupkfold/lgb_model_{col_index}.pickle","wb") as model_file:
        pickle.dump(model,model_file)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.808758
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[95]	valid_0's auc: 0.807283
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[89]	valid_0's auc: 0.874637
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[95]	valid_0's auc: 0.746204
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.699056
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[99]	valid_0's auc: 0.825209
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.858151
Training until validation