In [1]:
from datetime import datetime
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import optuna

In [2]:
train = pd.read_csv('preprocessed_train.csv', encoding='euc-kr')
test = pd.read_csv('preprocessed_test.csv', encoding='euc-kr')

In [3]:
train['std'] = train.std(axis=1)
train['min'] = train.min(axis=1)
train['max'] = train.max(axis=1)

test['std'] = test.std(axis=1)
test['min'] = test.min(axis=1)
test['max'] = test.max(axis=1)

In [4]:
train.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95,std,min,max
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,9,4,0,1,0,0,0,2082.997408,0.0,18387.0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,78,47,36,40,45,27,36,15881.121207,0.0,201546.0
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,286,199,148,154,37,48,36,1142.536245,0.0,28083.0
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,245,76,0,26,702,1,5,4666.921928,0.0,56851.0
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,1010,322,64,327,84,75,244,12901.337801,0.0,208806.0


In [5]:
test.head()

Unnamed: 0,분석대상,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95,std,min,max
0,1,5063,9.419514,47691,5.630504,6,1,1,11,68704,...,312,128,159,135,283,101,277,8369.698786,0.0,167918.0
1,2,5347,15.560875,83204,5.773314,0,2,14,4,4026,...,574,90,25,47,22,26,27,5760.84015,0.0,83204.0
2,3,4523,11.875083,53711,6.146246,0,37,0,1,65732,...,289,115,90,248,95,118,77,5623.903019,0.0,94208.0
3,4,6174,7.378037,45552,6.473256,0,1,17,13,30028,...,469,331,272,457,311,320,291,2815.812293,0.0,45552.0
4,5,22,7.090909,156,5.32463,0,0,0,1,22922,...,0,0,0,0,0,0,0,2590.244982,0.0,36864.0


In [6]:
y = train['label']
train.drop(columns = ['분석데이터', 'label'], inplace = True)
test.drop(columns = '분석대상', inplace = True)

In [7]:
def Stacking_Data_Loader(model, model_name, train, y, test, fold):
    '''
    Put your train, test datasets and fold value!
    This function returns train, test datasets for stacking ensemble :)
    '''

    stk = StratifiedKFold(n_splits = fold, random_state = 42, shuffle = True)
    
    # Declaration Pred Datasets
    train_fold_pred = np.zeros((train.shape[0], 1))
    test_pred = np.zeros((test.shape[0], fold))
    
    for counter, (train_index, valid_index) in enumerate(stk.split(train, y)):
        x_train, y_train = train.iloc[train_index], y[train_index]
        x_valid, y_valid = train.iloc[valid_index], y[valid_index]

        print('------------ Fold', counter+1, 'Start! ------------')
        if model_name == 'cat':
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
        elif model_name == 'xgb':
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric = 'auc', verbose = 500, early_stopping_rounds = 200)
        else:
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric = 'auc', verbose = 500, early_stopping_rounds = 200)
            
        print('------------ Fold', counter+1, 'Done! ------------')
        
        train_fold_pred[valid_index, :] = model.predict_proba(x_valid)[:, 1].reshape(-1, 1)
        test_pred[:, counter] = model.predict_proba(test)[:, 1]
        
        del x_train, y_train, x_valid, y_valid
        gc.collect()
        
    test_pred_mean = np.mean(test_pred, axis = 1).reshape(-1, 1)
    
    del test_pred
    gc.collect()
    
    print('Done!')
    
    return train_fold_pred, test_pred_mean

In [19]:
xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'tree_method': 'gpu_hist',
               'gpu_id': 0,
               'predictor': 'gpu_predictor',
               'random_state': 42}

lgb_params = {
    
}

cat_params = {
    
}

cnn_params


In [20]:
lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

cat = CatBoostClassifier(**cat_params)

In [21]:
xgb_train, xgb_test = Stacking_Data_Loader(xgb, 'xgb', train, y, test, 5)

------------ Fold 1 Start! ------------
[0]	validation_0-auc:0.84537
[500]	validation_0-auc:0.97508
[1000]	validation_0-auc:0.97808
[1500]	validation_0-auc:0.97879
[2000]	validation_0-auc:0.97901
[2306]	validation_0-auc:0.97908
------------ Fold 1 Done! ------------
------------ Fold 2 Start! ------------
[0]	validation_0-auc:0.83578
[500]	validation_0-auc:0.96850
[1000]	validation_0-auc:0.97155
[1500]	validation_0-auc:0.97234
[2000]	validation_0-auc:0.97268
[2419]	validation_0-auc:0.97280
------------ Fold 2 Done! ------------
------------ Fold 3 Start! ------------
[0]	validation_0-auc:0.85046
[500]	validation_0-auc:0.96904
[1000]	validation_0-auc:0.97337
[1500]	validation_0-auc:0.97469
[2000]	validation_0-auc:0.97534
[2500]	validation_0-auc:0.97562
[3000]	validation_0-auc:0.97587
[3500]	validation_0-auc:0.97610
[4000]	validation_0-auc:0.97628
[4243]	validation_0-auc:0.97626
------------ Fold 3 Done! ------------
------------ Fold 4 Start! ------------
[0]	validation_0-auc:0.87484
[5