In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed training table; the CSV is decoded with the EUC-KR codec.
train = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
# Discard the '분석데이터' column; every other column is kept as-is.
train = train.drop(columns=['분석데이터'])

In [4]:
# Show the loaded table (pandas abbreviates long/wide frames in the rendered output).
train

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,144,12.298611,1771,5.356616,0,0,0,1,2399,...,10,4,10,9,4,0,1,0,0,0
1,1,804,9.580846,7703,6.063542,0,0,0,6,183376,...,43,121,84,78,47,36,40,45,27,36
2,0,2205,12.736054,28083,6.107050,9,0,0,6,1178,...,326,268,239,286,199,148,154,37,48,36
3,0,2602,10.288240,26770,5.373013,8,0,0,1,56851,...,336,230,206,245,76,0,26,702,1,5
4,1,8980,23.252339,208806,5.775223,0,28,16,3,124274,...,731,882,1171,1010,322,64,327,84,75,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,2018,13.938057,28127,5.940442,0,70,0,11,255044,...,246,186,206,235,88,33,81,58,61,72
9996,0,1105,16.437104,18163,5.766962,0,11,0,3,181296,...,199,57,134,123,20,25,28,25,41,13
9997,0,4,58.500000,234,3.811827,0,0,0,1,68736,...,0,0,0,0,0,0,0,0,0,0
9998,1,3312,24.939312,82599,5.834730,0,39,0,8,90648,...,438,985,806,851,113,123,181,100,75,86


In [5]:
# Feature matrix: every column except the target.
# NOTE(review): the rendered table above lists an 'Unnamed: 0' column, which looks
# like a saved row index that would reach the models as a feature — confirm, and
# drop it if that is the case.
train_df_x = train.drop(columns=['label'])

In [6]:
# Target vector: the 'label' column (0/1 in the rows shown above).
train_df_y = train['label']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the
# two partitions — confirm this is acceptable for the label balance of this data.
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [8]:
# XGBoost hyper-parameters (the result notes further down mention an "optuna" run,
# so these presumably came from tuning — TODO confirm).
# NOTE: a later cell rebinds `xgb_params` to a smaller dict before XGBClassifier is
# built, so on a top-to-bottom run these values are not the ones the model receives.
# NOTE(review): 'use_label_encoder', 'gpu_hist', 'gpu_id' and 'predictor' are reported
# as deprecated in recent XGBoost releases — verify against the installed version.
xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'tree_method': 'gpu_hist',
               'gpu_id': 0,
               'predictor': 'gpu_predictor',
               'random_state': 42 }

# LightGBM hyper-parameters (presumably from the "optuna" run mentioned in the
# result notes — TODO confirm).
# NOTE: a later cell rebinds `lgb_params` to a two-key dict before LGBMClassifier is
# built, so on a top-to-bottom run these values are not the ones the model receives.
# NOTE(review): LightGBM documents `alpha` as a Huber/quantile-loss parameter; if L1
# regularisation was intended this should probably be `reg_alpha` — confirm.
# NOTE(review): with 'bagging_freq': 0 bagging appears to be disabled, which would make
# 'bagging_fraction' a no-op — confirm against the LightGBM parameter docs.
lgb_params = {'n_estimators': 10000,
              'learning_rate':0.09416659111369403,
              'max_depth':43,
              'boosting':'gbdt',
              'objective': 'binary',
              'metric': 'binary_logloss',
              'is_training_metric': True,
              'num_leaves':41,
              'min_data_in_leaf':10,
              'feature_fraction':0.8,
              'bagging_fraction':0.9,
              'bagging_freq':0,
              'alpha': 0.019782149081578264 }

# CatBoost hyper-parameters, passed to CatBoostClassifier further down.
# NOTE(review): the classifier built from this dict is never fitted in the cells shown
# here, so these settings are currently unexercised.
# NOTE(review): 'od_type' configures an overfitting detector, which presumably needs an
# eval set to have any effect; none is supplied in this notebook — confirm.
cat_params = {'objective': 'CrossEntropy',
              'colsample_bylevel': 0.043529438827711514,
              'depth': 12,
              'boosting_type': 'Ordered',
              'bootstrap_type': 'Bernoulli',
              'learning_rate': 0.19719860541901787,
              'iterations': 205,
              'random_strength': 34,
              'od_type': 'IncToDec',
              'subsample': 0.9558805603499683
             }

In [14]:
# Baseline settings: many boosting rounds at a small learning rate.
# These rebind the names defined in the previous cell, so the models constructed
# below are configured from these dicts.
lgb_params = dict(n_estimators=10000, learning_rate=0.01)

xgb_params = dict(n_estimators=10000, learning_rate=0.01, max_depth=12)

In [15]:
# Instantiate one classifier per boosting library from its parameter dict.
lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

# NOTE(review): `cat` is created here but no later cell shown fits or evaluates it.
cat = CatBoostClassifier(**cat_params)

In [None]:
# Random-forest baseline with 1000 trees. The seed (same value as the train/test
# split) makes the forest, and any ensemble that contains it, repeatable across runs.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)

In [16]:
# Train LightGBM on the training partition, then label the held-out rows.
lgbm.fit(x_train, y_train)
pred_lgbm = lgbm.predict(x_test)

In [None]:
# Train XGBoost on the training partition, then label the held-out rows.
xgb.fit(x_train, y_train)
pred_xgb = xgb.predict(x_test)

In [None]:
# Train the random forest on the training partition, then label the held-out rows.
rf.fit(x_train, y_train)
pred_rf = rf.predict(x_test)

In [17]:
# Fraction of held-out rows where the LightGBM prediction equals the true label.
accuracy_score(y_test, pred_lgbm)

0.9285

LGBM
1. optuna -> 0.925  
2. 'n_estimators': 10000, 'learning_rate': 0.01 -> 0.9285  
3. 'n_estimators': 20000, 'learning_rate': 0.01 -> 0.9285

In [None]:
# Fraction of held-out rows where the XGBoost prediction equals the true label.
accuracy_score(y_test, pred_xgb)

XGBoost
1. optuna -> 0.9205 
2. 'n_estimators': 10000, 'learning_rate': 0.01, 'max_depth': 12 -> (not yet evaluated)  

In [None]:
# Fraction of held-out rows where the random-forest prediction equals the true label.
accuracy_score(y_test, pred_rf)

## Soft Voting (LGBM + XGB + RF)

from sklearn.ensemble import VotingClassifier

In [None]:
# (name, estimator) pairs that make up the voting ensemble.
voting_estimators = [
    ('LGBM', lgbm),
    ('XGB', xgb),
    ('RandomForest', rf),
]

In [None]:
# Soft voting averages the members' predicted class probabilities, which is what
# this section's heading describes. The result notes below record both the soft
# and the hard variant; switch to voting='hard' to reproduce the majority-vote score.
voting_model = VotingClassifier(estimators=voting_estimators, voting='soft')

In [None]:
# Fit the ensemble on the training partition and label the held-out rows in one chain.
pred_voting = voting_model.fit(x_train, y_train).predict(x_test)

In [None]:
# Fraction of held-out rows where the ensemble prediction equals the true label.
accuracy_score(y_test, pred_voting)

Result  
1. Optuna params & voting -> 0.929  
2. Baseline params & soft voting -> 0.9285  
3. Baseline params & hard voting -> 0.9265

## Remove_Outlier (LGBM)

In [None]:
# Rank the features by the fitted LightGBM model's importance scores and chart
# the 20 highest.
ft_importance_values = lgbm.feature_importances_
ft_series = pd.Series(ft_importance_values, index=x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False).head(20)

plt.figure(figsize=(8, 6))
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.title('Top 20 Feature Importances (LGBM)')
plt.show()

In [None]:
# Start again from the file on disk so the outlier experiment works on an
# untouched copy of the data.
train = pd.read_csv("preprocessed_train.csv", encoding='euc-kr').drop(columns=['분석데이터'])
train_df_y = train['label']
train_df_x = train.drop(columns=['label'])

In [None]:
# Same 80/20 split and seed as the first split in this notebook, so the held-out
# rows match the ones used for the earlier scores.
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [None]:
# Inspect the training features before any rows are removed.
x_train

In [None]:
# Put features and label side by side so that dropping a row removes both together.
outlier_data = pd.concat((x_train, y_train), axis='columns')

In [None]:
# (rows, columns) of the combined frame before outlier removal.
outlier_data.shape

In [None]:
# Inspect the combined features + label frame.
outlier_data

In [None]:
# Per-class box plots of the two columns that are screened for outliers below.
# One loop replaces the duplicated plotting code, and plt.show() keeps the cell
# from echoing the Text object returned by the last set_title call.
f, axes = plt.subplots(ncols=2, figsize=(15,8))

for ax, column in zip(axes, ['b_0', 'a_89']):
    sns.boxplot(x='label', y=column, data=outlier_data, ax=ax)
    ax.set_title(f'{column} vs Label')

plt.show()

In [None]:
import numpy as np

def get_outlier(df=None, column=None, weight=1.5):
    """Return the index labels of rows whose ``column`` value is an IQR outlier.

    A value counts as an outlier when it lies below ``Q1 - weight * IQR`` or
    above ``Q3 + weight * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Missing values are skipped when the percentiles are computed and are never
    reported as outliers. (With plain ``np.percentile`` a single NaN turns both
    fences into NaN, so no row would be flagged at all.)

    Args:
        df: DataFrame that holds the column to screen.
        column: Name of the column to screen.
        weight: Multiplier applied to the IQR to set the fence distance.

    Returns:
        pandas Index with the labels of the outlier rows (empty if there are none).
    """
    # Usage hint: screen the columns most strongly correlated with the target first.
    series = df[column]
    quantile_25, quantile_75 = np.nanpercentile(series.values, [25, 75])

    IQR_weight = (quantile_75 - quantile_25) * weight
    lowest = quantile_25 - IQR_weight
    highest = quantile_75 + IQR_weight

    # Comparisons against NaN are False, so rows with missing values are kept.
    return series[(series < lowest) | (series > highest)].index

In [None]:
# Index labels of the rows whose 'a_89' value falls outside the 1.5 * IQR fences.
outlier_idx = get_outlier(df=outlier_data, column='a_89', weight=1.5)

In [None]:
# Remove the flagged rows. The frame is rebound instead of mutated in place, and
# labels that are already gone are ignored, so re-running this cell is a no-op
# rather than a KeyError.
outlier_data = outlier_data.drop(index=outlier_idx, errors='ignore')

In [None]:
# Index labels of the rows whose 'b_0' value falls outside the 1.5 * IQR fences.
# The fences are computed on `outlier_data` as it stands when this cell runs.
outlier_idx = get_outlier(df=outlier_data, column='b_0', weight=1.5)

In [None]:
# Remove the flagged rows. The frame is rebound instead of mutated in place, and
# labels that are already gone are ignored, so re-running this cell is a no-op
# rather than a KeyError.
outlier_data = outlier_data.drop(index=outlier_idx, errors='ignore')

In [None]:
# Training features after outlier removal: everything except the target column.
x_train = outlier_data.drop(columns=['label'])

In [None]:
# Training target after outlier removal (same surviving rows as x_train).
y_train = outlier_data['label']

In [None]:
# Check that features and target have the same number of rows after filtering.
x_train.shape, y_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted transform
# to the held-out rows so no test-set statistics leak into training.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Inspect the scaled training features.
x_train_scaled

In [None]:
# Inspect the scaled held-out features.
x_test_scaled

In [None]:
# Train a separate LightGBM instance on the filtered, scaled data. Refitting the
# shared `lgbm` object here would silently change what the earlier
# feature-importance cell reads if cells are re-run.
lgbm_outlier = LGBMClassifier(**lgb_params)
pred_outlier = lgbm_outlier.fit(x_train_scaled, y_train).predict(x_test_scaled)

In [None]:
# Fraction of held-out rows predicted correctly after outlier removal and scaling.
accuracy_score(y_test, pred_outlier)