In [36]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [68]:
robustScaler = RobustScaler()

In [2]:
train_df = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train_df.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244


In [4]:
train_df.dtypes

분석데이터           int64
label           int64
numstrings      int64
avlength      float64
printables      int64
               ...   
dist_91         int64
dist_92         int64
dist_93         int64
dist_94         int64
dist_95         int64
Length: 618, dtype: object

In [5]:
train_df2 = train_df.drop(columns=['분석데이터'], axis=1)

In [6]:
corr_df = train_df2.corr()

In [7]:
corr_df = corr_df.apply(lambda x: round(x,3))

In [8]:
corr_df

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
label,1.000,-0.006,-0.013,-0.021,0.006,0.008,-0.014,-0.008,0.000,0.096,...,-0.015,-0.011,-0.018,-0.018,-0.009,-0.006,0.005,0.001,0.010,0.011
numstrings,-0.006,1.000,-0.015,0.454,0.248,0.087,0.124,0.108,0.419,-0.002,...,0.208,0.167,0.244,0.296,0.059,0.518,0.816,0.580,0.790,0.751
avlength,-0.013,-0.015,1.000,0.633,-0.038,-0.001,-0.004,-0.005,0.694,-0.001,...,0.728,0.670,0.633,0.562,0.327,-0.009,-0.013,-0.010,-0.015,-0.014
printables,-0.021,0.454,0.633,1.000,-0.040,0.068,0.119,0.077,0.572,0.003,...,0.929,0.829,0.936,0.943,0.752,0.167,0.311,0.193,0.245,0.234
entropy,0.006,0.248,-0.038,-0.040,1.000,-0.000,0.018,0.012,0.169,-0.002,...,-0.049,-0.101,-0.066,-0.082,-0.126,0.159,0.255,0.199,0.310,0.281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dist_91,-0.006,0.518,-0.009,0.167,0.159,0.016,0.010,0.014,0.301,-0.007,...,0.092,0.082,0.091,0.088,0.017,1.000,0.516,0.437,0.588,0.571
dist_92,0.005,0.816,-0.013,0.311,0.255,0.036,0.044,0.044,0.391,-0.006,...,0.155,0.137,0.173,0.194,0.039,0.516,1.000,0.593,0.822,0.769
dist_93,0.001,0.580,-0.010,0.193,0.199,0.020,0.022,0.018,0.329,-0.007,...,0.112,0.103,0.109,0.113,0.028,0.437,0.593,1.000,0.684,0.664
dist_94,0.010,0.790,-0.015,0.245,0.310,0.018,0.017,0.018,0.450,-0.008,...,0.146,0.135,0.144,0.141,0.034,0.588,0.822,0.684,1.000,0.926


In [9]:
corr_df.unstack()

label    label         1.000
         numstrings   -0.006
         avlength     -0.013
         printables   -0.021
         entropy       0.006
                       ...  
dist_95  dist_91       0.571
         dist_92       0.769
         dist_93       0.664
         dist_94       0.926
         dist_95       1.000
Length: 380689, dtype: float64

In [10]:
unstacked = corr_df.unstack()

In [11]:
corr_df2 = corr_df.nsmallest(5, 'label')

In [12]:
corr_df2

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
b_40,-0.045,-0.01,-0.001,-0.006,-0.004,-0.001,-0.001,-0.007,-0.007,0.024,...,-0.004,-0.004,-0.004,-0.004,-0.003,-0.008,-0.01,-0.009,-0.01,-0.01
b_12,-0.044,-0.003,-0.002,-0.006,0.013,-0.001,-0.009,-0.007,-0.004,0.024,...,-0.006,-0.007,-0.005,-0.005,-0.006,-0.004,-0.004,-0.004,-0.005,-0.005
b_60,-0.039,-0.0,-0.002,-0.006,0.019,-0.001,-0.006,-0.003,-0.002,0.025,...,-0.004,-0.005,-0.005,-0.007,-0.006,0.003,0.002,-0.001,0.003,0.0
b_56,-0.038,0.015,-0.0,0.016,-0.005,-0.001,-0.003,-0.006,-0.002,0.025,...,0.009,0.007,0.014,0.019,0.011,-0.001,0.008,-0.001,0.002,-0.001
b_55,-0.03,-0.004,-0.001,0.005,0.002,-0.001,-0.002,-0.002,-0.004,0.047,...,0.008,0.007,0.009,0.007,0.014,-0.005,-0.005,-0.005,-0.002,-0.003


In [13]:
corr_df2[list(corr_df2.index)]

Unnamed: 0,b_40,b_12,b_60,b_56,b_55
b_40,1.0,0.018,0.052,0.586,0.256
b_12,0.018,1.0,0.193,0.005,0.007
b_60,0.052,0.193,1.0,0.022,0.049
b_56,0.586,0.005,0.022,1.0,0.164
b_55,0.256,0.007,0.049,0.164,1.0


In [14]:
corr_df3 = corr_df.nlargest(10, 'label')

In [15]:
corr_df3

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
label,1.0,-0.006,-0.013,-0.021,0.006,0.008,-0.014,-0.008,0.0,0.096,...,-0.015,-0.011,-0.018,-0.018,-0.009,-0.006,0.005,0.001,0.01,0.011
a_236,0.246,0.0,-0.005,-0.003,-0.017,0.002,-0.015,-0.012,0.002,0.066,...,-0.002,-0.007,-0.001,-0.0,0.002,-0.001,0.001,-0.002,0.002,0.003
a_216,0.241,-0.003,-0.005,-0.005,-0.016,-0.003,-0.015,-0.012,0.001,0.049,...,-0.003,-0.007,-0.003,-0.002,0.001,-0.003,-0.002,-0.006,-0.001,0.0
a_233,0.236,-0.003,-0.005,-0.002,-0.017,-0.001,-0.016,-0.014,-0.001,0.069,...,0.0,-0.004,0.001,0.002,0.006,-0.004,-0.003,-0.006,-0.002,-0.001
b_94,0.22,-0.011,-0.0,-0.007,-0.01,-0.003,-0.012,-0.007,0.003,0.009,...,-0.003,-0.006,-0.003,-0.004,0.001,-0.007,-0.011,-0.013,-0.01,-0.011
a_70,0.219,-0.001,-0.007,-0.008,-0.014,0.002,-0.009,-0.004,-0.004,0.056,...,-0.009,-0.008,-0.009,-0.009,-0.008,-0.002,0.003,-0.003,-0.0,0.006
b_93,0.216,-0.01,-0.0,-0.008,-0.01,-0.003,-0.013,-0.007,0.003,0.009,...,-0.004,-0.007,-0.005,-0.005,-0.002,-0.007,-0.01,-0.013,-0.009,-0.01
a_67,0.215,0.003,-0.007,-0.005,-0.012,-0.001,-0.01,-0.006,-0.003,0.082,...,-0.007,-0.008,-0.006,-0.006,-0.006,-0.0,0.005,0.001,0.004,0.009
a_86,0.214,0.001,-0.008,-0.007,-0.013,0.004,-0.009,-0.004,-0.005,0.065,...,-0.009,-0.009,-0.008,-0.008,-0.009,-0.003,0.003,-0.003,0.0,0.007
a_50,0.213,0.008,-0.007,-0.005,-0.012,0.001,-0.007,-0.002,-0.002,0.029,...,-0.01,-0.009,-0.009,-0.009,-0.012,0.001,0.008,0.001,0.004,0.012


In [16]:
cp_train_df2 = train_df2.copy()

In [17]:
def get_outlier1(df, column):
    fraud_column_data = df[df['label']==0][column]
    quantile_25 = np.percentile(fraud_column_data.values, 25)
    quantile_75 = np.percentile(fraud_column_data.values, 75)
    
    IQR = quantile_75 - quantile_25
    IQR_weight = IQR * 1.5
    
    lowest = quantile_25 - IQR_weight
    highest = quantile_75 + IQR_weight
    
    outlier_idx = fraud_column_data[ (fraud_column_data < lowest) | (fraud_column_data > highest) ].index
    return outlier_idx

In [18]:
def get_outlier2(df, column):
    fraud_column_data = df[df['label']==1][column]
    quantile_25 = np.percentile(fraud_column_data.values, 25)
    quantile_75 = np.percentile(fraud_column_data.values, 75)
    
    IQR = quantile_75 - quantile_25
    IQR_weight = IQR * 1.5
    
    lowest = quantile_25 - IQR_weight
    highest = quantile_75 + IQR_weight
    
    outlier_idx = fraud_column_data[ (fraud_column_data < lowest) | (fraud_column_data > highest) ].index
    return outlier_idx

In [19]:
# outlier_idx = get_outlier1(df=cp_train_df2, column='b_40')

In [20]:
# len(outlier_idx)

In [21]:
# cp_train_df2.shape

In [22]:
# cp_train_df2.drop(outlier_idx, axis = 0, inplace=True)

In [23]:
# cp_train_df2.shape

In [24]:
# cp_train_df2_x = cp_train_df2.drop(['label'], axis=1)

In [25]:
# cp_train_df2_y = cp_train_df2['label']

In [26]:
# x_train, x_test, y_train, y_test = train_test_split(cp_train_df2_x, cp_train_df2_y, test_size=0.2, random_state=42)

In [27]:
train_df_x = train_df2.drop(['label'], axis=1)

In [28]:
train_df_y = train_df2['label']

In [29]:
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [42]:
xgb_final = xgb.XGBClassifier(silent=False,
                              n_estimators = 1000,
                              booster='gbtree',
                              tree_method='gpu_hist',
                              preidctor= 'gpu_predictor',
                              scale_pos_weight=1,
                              learning_rate=0.01,
                              objective='binary:logistic',
                              max_depth = 10,
                              subsample = 0.8,
                              colsample_bytree = 0.8,
                              min_child_weight = 1,
                              gamma = 0.5,
                              seed=42)

In [43]:
xgb_final

XGBClassifier(base_score=None, booster='gbtree', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, gamma=0.5,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=None, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              preidctor='gpu_predictor', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=42, silent=False,
              subsample=0.8, tree_method='gpu_hist', validate_parameters=None,
              verbosity=None)

In [32]:
final_pred = xgb_final.fit(x_train, y_train).predict(x_test)

Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [33]:
print(classification_report(y_test, final_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       900
           1       0.92      0.94      0.93      1100

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



In [34]:
accuracy_score(y_test, final_pred)

0.924

In [35]:
confusion_matrix(y_test, final_pred)

array([[ 816,   84],
       [  68, 1032]], dtype=int64)

In [52]:
from vecstack import StackingTransformer

In [61]:
xgb = xgb_final
knn = KNeighborsClassifier(n_neighbors=3)
forest = RandomForestClassifier(n_estimators=1000, random_state=42)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators=100)

In [62]:
estimators = [
    ('KNN' , knn ),
    ('RandomForest' , forest),
    ('DecisionTree' , dt_clf),
    ('XGBoost' , xgb),
    ('AdaBoost', ada_clf)
]

In [63]:
stack = StackingTransformer(estimators, 
                            regression = False, 
                            metric = accuracy_score, 
                            n_folds = 5, stratified = True, shuffle = True, 
                            random_state = 0) 

In [64]:
stack = stack.fit(x_train, y_train)

Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters a

In [65]:
s_train = stack.transform(x_train)
s_test = stack.transform(x_test)

In [67]:
s_model = xgb.fit(s_train, y_train)
s_model_prediction =  s_model.predict(s_test)
accuracy = round(accuracy_score(y_test, s_model_prediction) * 100, 5) 
print("Accuracy : ", accuracy, "%") 

Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy :  92.5 %
