In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train = train.drop(columns=['분석데이터'], axis=1)

In [4]:
train

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,144,12.298611,1771,5.356616,0,0,0,1,2399,...,10,4,10,9,4,0,1,0,0,0
1,1,804,9.580846,7703,6.063542,0,0,0,6,183376,...,43,121,84,78,47,36,40,45,27,36
2,0,2205,12.736054,28083,6.107050,9,0,0,6,1178,...,326,268,239,286,199,148,154,37,48,36
3,0,2602,10.288240,26770,5.373013,8,0,0,1,56851,...,336,230,206,245,76,0,26,702,1,5
4,1,8980,23.252339,208806,5.775223,0,28,16,3,124274,...,731,882,1171,1010,322,64,327,84,75,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,2018,13.938057,28127,5.940442,0,70,0,11,255044,...,246,186,206,235,88,33,81,58,61,72
9996,0,1105,16.437104,18163,5.766962,0,11,0,3,181296,...,199,57,134,123,20,25,28,25,41,13
9997,0,4,58.500000,234,3.811827,0,0,0,1,68736,...,0,0,0,0,0,0,0,0,0,0
9998,1,3312,24.939312,82599,5.834730,0,39,0,8,90648,...,438,985,806,851,113,123,181,100,75,86


In [5]:
train_df_x = train.drop(['label'], axis=1)

In [6]:
train_df_y = train['label']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [9]:
x = scaler.transform(train_df_x)

In [11]:
features = train_df_x.columns.values

In [12]:
x = pd.DataFrame(data=x, columns=features)

In [13]:
x

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,-0.455925,-0.106437,-0.397631,-0.525787,-0.024564,-0.263464,-0.180745,-0.290310,-0.191748,-0.390284,...,-0.318129,-0.291736,-0.309491,-0.305251,-0.194456,-0.247290,-0.390138,-0.264375,-0.396086,-0.374704
1,-0.415236,-0.124612,-0.377245,0.430533,-0.024564,-0.263464,-0.180745,-0.199413,-0.005651,-0.143875,...,-0.308489,-0.263467,-0.293106,-0.292545,-0.190753,-0.217074,-0.347790,-0.223377,-0.359557,-0.328061
2,-0.328864,-0.103512,-0.307209,0.489389,0.055828,-0.263464,-0.180745,-0.199413,-0.193003,-0.407241,...,-0.225815,-0.227950,-0.258784,-0.254242,-0.177662,-0.123071,-0.224003,-0.230666,-0.331145,-0.328061
3,-0.304389,-0.119881,-0.311721,-0.503605,0.046896,-0.263464,-0.180745,-0.290310,-0.135755,-0.326289,...,-0.222894,-0.237131,-0.266091,-0.261792,-0.188255,-0.247290,-0.362992,0.375192,-0.394733,-0.368225
4,0.088814,-0.033188,0.313850,0.040499,-0.024564,0.440422,6.885656,-0.253951,-0.066425,-0.228006,...,-0.107501,-0.079600,-0.052412,-0.120917,-0.167069,-0.193573,-0.036152,-0.187846,-0.294616,-0.058574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.340393,-0.095474,-0.307058,0.264004,-0.024564,1.496249,-0.180745,-0.108515,0.068045,0.138350,...,-0.249186,-0.247762,-0.266091,-0.263633,-0.187222,-0.219592,-0.303270,-0.211533,-0.313557,-0.281419
9996,-0.396679,-0.078763,-0.341299,0.029323,-0.024564,0.013063,-0.180745,-0.253951,-0.007790,-0.248299,...,-0.262916,-0.278930,-0.282034,-0.284258,-0.193078,-0.226307,-0.360820,-0.241599,-0.340616,-0.357861
9997,-0.464556,0.202520,-0.402913,-2.615553,-0.024564,-0.263464,-0.180745,-0.290310,-0.123534,-0.335049,...,-0.321050,-0.292702,-0.311706,-0.306909,-0.194800,-0.247290,-0.391224,-0.264375,-0.396086,-0.374704
9998,-0.260618,-0.021907,-0.119864,0.120998,-0.024564,0.716948,-0.180745,-0.163054,-0.101002,-0.366407,...,-0.193096,-0.054714,-0.133233,-0.150197,-0.185068,-0.144054,-0.194685,-0.173269,-0.294616,-0.263281


In [None]:
xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'tree_method': 'gpu_hist',
               'gpu_id': 0,
               'predictor': 'gpu_predictor',
               'random_state': 42 }

lgb_params = {'n_estimators': 10000,
              'learning_rate':0.09416659111369403,
              'max_depth':43,
              'boosting':'gbdt',
              'objective': 'binary',
              'metric': 'binary_logloss',
              'is_training_metric': True,
              'num_leaves':41,
              'min_data_in_leaf':10,
              'feature_fraction':0.8,
              'bagging_fraction':0.9,
              'bagging_freq':0,
              'alpha': 0.019782149081578264 }

cat_params = {'objective': 'CrossEntropy',
              'colsample_bylevel': 0.043529438827711514,
              'depth': 12,
              'boosting_type': 'Ordered',
              'bootstrap_type': 'Bernoulli',
              'learning_rate': 0.19719860541901787,
              'iterations': 205,
              'random_strength': 34,
              'od_type': 'IncToDec',
              'subsample': 0.9558805603499683
             }

In [None]:
lgb_params = {'n_estimators': 10000,
              'learning_rate':0.03,
              'boosting': 'goss',
              'objective': 'binary'
             }

xgb_params = {'n_estimators': 3000,
              'learning_rate': 0.045,
              'max_depth': 21,
              'predictor': 'gpu_predictor',
              'tree_method': 'gpu_hist',
              'objective': 'binary:logistic',
              'eval_metric': 'error',
              'booster': 'dart',
              'rate_drop': 0.2,
              'skip_drop': 0.7,
             }

In [None]:
lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

cat = CatBoostClassifier(**cat_params)

In [None]:
rf = RandomForestClassifier(n_estimators= 1000)

In [None]:
# pred_lgbm = lgbm.fit(x_train_scaled, y_train).predict(x_test_scaled)

In [None]:
pred_xgb = xgb.fit(x_train_scaled, y_train).predict(x_test_scaled)

In [None]:
# pred_rf = rf.fit(x_train_scaled, y_train).predict(x_test_scaled)

In [None]:
accuracy_score(y_test, pred_lgbm)

LGBM
1. optuna -> 0.925  
2. 'n_estimators': 10000, 'learning_rate': 0.01 -> 0.9285  
3. 'n_estimators': 20000. 'learning_rate': 0.01 -> 0.9285  
.  
4. 'n_estimators': 10000,  
    'learning_rate':0.03,  
    'boosting': 'goss'  
    => 0.9295

In [None]:
accuracy_score(y_test, pred_xgb)

In [None]:
sub_data = pd.read_csv('preprocessed_test.csv', encoding='euc-kr')

In [None]:
sub_data = sub_data.drop(columns=['분석대상'], axis=1)

In [None]:
sub_data_scaled = scaler.transform(sub_data)

In [None]:
temp_pred = xgb.predict(sub_data_scaled)

In [None]:
sub = pd.read_csv('submission_ex.csv', encoding='euc-kr')
sub['정답지'] = temp_pred
sub.to_csv('sub_xgb_dart.csv', index = 0, encoding='euc-kr')

In [None]:
sub_rf = pd.read_csv('rf_sub.csv', encoding='euc-kr')

In [None]:
compare = pd.DataFrame(columns=['status'])

In [None]:
sub_xgb_dart = pd.read_csv('sub_xgb_dart.csv', encoding='euc-kr')

In [None]:
for i in range(5000):
    if sub_xgb_dart['정답지'][i] == sub_rf['정답지'][i]:
        compare.loc[i] = 0
    else:
        compare.loc[i] = 1

In [None]:
(compare['status']==0).sum()

In [None]:
(compare['status']==1).sum()

In [None]:
(sub_rf['정답지'] == 1).sum()

In [None]:
(sub_rf['정답지'] == 0).sum()

In [None]:
(sub_xgb_dart['정답지'] == 1).sum()

In [None]:
(sub_xgb_dart['정답지'] == 0).sum()

In [None]:
filename = 'xgb_dart.model'

In [None]:
xgb.save_model(filename)

In [139]:
xgb_params = {'n_estimators': 3000,
              'learning_rate': 0.045,
              'max_depth': 21,
              'predictor': 'gpu_predictor',
              'tree_method': 'gpu_hist',
              'objective': 'binary:logistic',
              'eval_metric': 'error'
             }

rf_params = {
    'n_estimators': 1400,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth':80,
    'bootstrap': 'False',
    'n_jobs': -1
}

lgbm_params = {
    'max_bin' : 400,
    'n_estimators': 5000,
    'learning_rate':0.09416659111369403,
    'max_depth':-1,
    'boosting':'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_training_metric': True,
    'num_leaves':60,
    'min_data_in_leaf':10,
    'feature_fraction':0.5,
    'bagging_fraction':0.3,
    'bagging_freq':0,
}

In [130]:
xgb = XGBClassifier(**xgb_params)

In [131]:
rf = RandomForestClassifier(**rf_params)

In [140]:
lgbm = LGBMClassifier(**lgbm_params)

In [133]:
pred_xgb = xgb.fit(x_train_scaled, y_train).predict(x_test_scaled)



In [134]:
accuracy_score(y_test, pred_xgb)

0.9305

In [153]:
confusion_matrix(y_test, pred_xgb)

array([[ 829,   71],
       [  68, 1032]], dtype=int64)

In [135]:
pred_rf = rf.fit(x_train_scaled, y_train).predict(x_test_scaled)

In [136]:
accuracy_score(y_test, pred_rf)

0.9

In [154]:
confusion_matrix(y_test, pred_rf)

array([[ 796,  104],
       [  96, 1004]], dtype=int64)

In [141]:
pred_lgbm = lgbm.fit(x_train_scaled, y_train).predict(x_test_scaled)



In [142]:
accuracy_score(y_test, pred_lgbm)

0.93

In [155]:
confusion_matrix(y_test, pred_lgbm)

array([[ 828,   72],
       [  68, 1032]], dtype=int64)

XGBoost
1. optuna -> 0.9205 
2. 0.9125  
3. 'n_estimators': 5000,  
    'learning_rate': 0.05,  
    'max_depth': 20,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'exact',  
    'boosting' : 'dart'  
     => 0.9275  
     .  
4. 'n_estimators': 3000,  
    'learning_rate': 0.05,  
    'max_depth': 20,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'exact',  
    'boosting' : 'dart'  
     => 0.929  
     .    
5. 'n_estimators': 2000,  
    'learning_rate': 0.05,  
    'max_depth': 20,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'exact',  
    'boosting' : 'dart'  
     => 0.9285  
    .    
5. 'n_estimators': 1000,  
    'learning_rate': 0.05,  
    'max_depth': 20,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'exact',  
    'boosting' : 'dart'  
     => 0.9275  
     .    
5. 'n_estimators': 1500,  
    'learning_rate': 0.05,  
    'max_depth': 20,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'exact',  
    'boosting' : 'dart'  
     => 0.9285  
     .    
5. 'n_estimators': 3000,  
    'learning_rate': 0.05,  
    'max_depth': 20,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'gpu_hist',  
    'boosting' : 'dart'  
     => 0.9295  
     .    
5. 'n_estimators': 3000,  
    'learning_rate': 0.045,  
    'max_depth': 21,  
    'predictor': 'gpu_predictor',  
    'tree_method': 'gpu_hist',  
    'boosting' : 'dart'  
     => 0.9305  
     .  
6. max_depth: 22 => 0.928  
7. max_depth: 23 => 0.928  

## Soft Voting (LGBM + XGB + RF)

from sklearn.ensemble import VotingClassifier

In [145]:
from sklearn.ensemble import VotingClassifier

In [146]:
voting_estimators = [ ('LGBM', lgbm),
                    ('XGB', xgb),
                    ('RandomForest', rf)]

In [147]:
voting_model = VotingClassifier(estimators= voting_estimators, voting='soft')

In [148]:
voting_model.fit(x_train_scaled, y_train)
pred_voting = voting_model.predict(x_test_scaled)





In [149]:
accuracy_score(y_test, pred_voting)

0.932

In [156]:
confusion_matrix(y_test, pred_voting)

array([[ 828,   72],
       [  64, 1036]], dtype=int64)

In [159]:
sub_data = pd.read_csv('preprocessed_test.csv', encoding='euc-kr')

In [160]:
sub_data = sub_data.drop(columns=['분석대상'], axis=1)

In [161]:
sub_data

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,5063,9.419514,47691,5.630504,6,1,1,11,68704,24061,...,212,4485,451,312,128,159,135,283,101,277
1,5347,15.560875,83204,5.773314,0,2,14,4,4026,1829,...,824,492,1159,574,90,25,47,22,26,27
2,4523,11.875083,53711,6.146246,0,37,0,1,65732,973,...,249,307,372,289,115,90,248,95,118,77
3,6174,7.378037,45552,6.473256,0,1,17,13,30028,1158,...,366,441,363,469,331,272,457,311,320,291
4,22,7.090909,156,5.324630,0,0,0,1,22922,464,...,0,1,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,473,5.835095,2760,6.547591,0,0,0,3,2367,199,...,28,27,33,45,25,27,26,23,26,30
4996,658,9.749240,6415,6.147939,0,9,0,3,30788,5078,...,170,101,83,48,27,22,27,17,19,40
4997,1111,9.737174,10818,5.775273,0,0,0,4,110979,7409,...,65,80,91,96,57,22,22,77,32,36
4998,1664,6.364183,10590,6.495400,0,1,0,6,114572,6942,...,95,103,116,95,62,59,94,54,74,72


In [162]:
sub_data_scaled = scaler.transform(sub_data)

In [163]:
sub_data_scaled

array([[-0.15266893, -0.12569043, -0.23982573, ..., -0.00654395,
        -0.25943956, -0.01581832],
       [-0.13516034, -0.08462205, -0.11778444, ..., -0.24433172,
        -0.36090951, -0.33972194],
       [-0.18595989, -0.1092696 , -0.21913785, ..., -0.17782403,
        -0.2364397 , -0.27494122],
       ...,
       [-0.39630945, -0.12356618, -0.36654069, ..., -0.19422319,
        -0.35279192, -0.32806141],
       [-0.36221704, -0.14612197, -0.36732422, ..., -0.21517766,
        -0.29596874, -0.28141929],
       [-0.36862863, -0.00586751, -0.25715956, ..., -0.22246618,
        -0.37849764, -0.35138247]])

In [164]:
temp_pred = voting_model.predict(sub_data_scaled)

In [165]:
temp_pred

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [166]:
temp_pred.shape

(5000,)

In [167]:
sub = pd.read_csv('submission_ex.csv', encoding='euc-kr')
sub['정답지'] = temp_pred
sub.to_csv('sub_voting.csv', index = 0, encoding='euc-kr')

In [168]:
compare = pd.DataFrame(columns=['status'])

In [169]:
sub_voting = pd.read_csv('sub_voting.csv', encoding='euc-kr')

In [170]:
sub_rf = pd.read_csv('rf_sub.csv', encoding='euc-kr')

In [171]:
for i in range(5000):
    if sub_voting['정답지'][i] == sub_rf['정답지'][i]:
        compare.loc[i] = 0
    else:
        compare.loc[i] = 1

In [172]:
(compare['status']==0).sum()

4568

In [173]:
(compare['status']==1).sum()

432

In [174]:
(sub_rf['정답지'] == 1).sum()

3485

In [175]:
(sub_rf['정답지'] == 0).sum()

1515

In [176]:
(sub_voting['정답지'] == 1).sum()

3469

In [177]:
(sub_voting['정답지'] == 0).sum()

1531

In [150]:
voting_model2 = VotingClassifier(estimators= voting_estimators, voting='hard')

In [151]:
voting_model2.fit(x_train_scaled, y_train)
pred_voting2 = voting_model.predict(x_test_scaled)





In [152]:
accuracy_score(y_test, pred_voting2)

0.932

In [158]:
confusion_matrix(y_test, pred_voting2)

array([[ 828,   72],
       [  64, 1036]], dtype=int64)

Result  
1. Optuna & voting -> 0.929  
2. based & voting (soft) -> 0.9285
3. based & voting (hard) -> 0.9265

## Remove_Outlier (LGBM)

In [None]:
ft_importance_values = lgbm.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (LGBM)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
train = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')
train = train.drop(columns=['분석데이터'], axis=1)
train_df_x = train.drop(['label'], axis=1)
train_df_y = train['label']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [15]:
x_train

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
9254,92,13.500000,1242,5.617271,0,0,0,1,16738,4323,...,9,11,5,12,3,0,2,3,2,0
1561,629,11.941176,7511,6.061898,0,0,0,4,36467,335,...,72,35,84,76,27,13,34,20,21,24
1670,362,5.748619,2081,6.047759,0,0,0,7,393967,32891,...,9,5,8,23,4,19,11,10,7,4
6087,27790,5.742965,159597,6.560686,0,0,0,75,4663,170,...,1796,1502,2255,1308,2056,1381,1569,1239,1632,1320
6669,2042,8.457884,17271,6.306899,0,9,0,3,16454,9185,...,138,138,141,171,114,84,122,99,98,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,728,21.414835,15590,5.339247,0,0,0,1,5083,22,...,151,19,23,172,2,0,0,0,0,0
5191,31774,11.639391,369830,5.401709,0,31,0,1,125002,6049,...,1930,2522,2864,4418,690,381,338,440,231,335
5390,596,7.041946,4197,6.054880,0,0,0,6,220163,20051,...,28,40,19,27,24,18,37,24,11,15
860,205,11.965854,2453,5.786017,0,0,0,1,43248,24338,...,13,15,16,11,1,1,0,1,1,3


In [16]:
outlier_data = pd.concat([x_train, y_train], axis=1)

In [17]:
outlier_data.shape

(8000, 617)

In [18]:
outlier_data

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95,label
9254,92,13.500000,1242,5.617271,0,0,0,1,16738,4323,...,11,5,12,3,0,2,3,2,0,0
1561,629,11.941176,7511,6.061898,0,0,0,4,36467,335,...,35,84,76,27,13,34,20,21,24,1
1670,362,5.748619,2081,6.047759,0,0,0,7,393967,32891,...,5,8,23,4,19,11,10,7,4,1
6087,27790,5.742965,159597,6.560686,0,0,0,75,4663,170,...,1502,2255,1308,2056,1381,1569,1239,1632,1320,0
6669,2042,8.457884,17271,6.306899,0,9,0,3,16454,9185,...,138,141,171,114,84,122,99,98,82,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,728,21.414835,15590,5.339247,0,0,0,1,5083,22,...,19,23,172,2,0,0,0,0,0,1
5191,31774,11.639391,369830,5.401709,0,31,0,1,125002,6049,...,2522,2864,4418,690,381,338,440,231,335,1
5390,596,7.041946,4197,6.054880,0,0,0,6,220163,20051,...,40,19,27,24,18,37,24,11,15,0
860,205,11.965854,2453,5.786017,0,0,0,1,43248,24338,...,15,16,11,1,1,0,1,1,3,1


In [None]:
f, axes = plt.subplots(ncols=2, figsize=(15,8))

sns.boxplot(x='label', y='b_0',data=outlier_data, ax=axes[0])
axes[0].set_title('b_0 vs Label')

sns.boxplot(x='label', y='a_89',data=outlier_data, ax=axes[1])
axes[1].set_title('a_89 vs Label')

In [19]:
import numpy as np

def get_outlier(df=None, column=None, weight=1.5):
  # target 값과 상관관계가 높은 열을 우선적으로 진행
    quantile_25 = np.percentile(df[column].values, 25)
    quantile_75 = np.percentile(df[column].values, 75)
    
    IQR = quantile_75 - quantile_25
    IQR_weight = IQR*weight
    
    lowest = quantile_25 - IQR_weight
    highest = quantile_75 + IQR_weight
  
    outlier_idx = df[column][ (df[column] < lowest) | (df[column] > highest) ].index
    return outlier_idx

In [20]:
outlier_idx = get_outlier(df=outlier_data, column='a_89', weight=1.5)

In [21]:
outlier_data.drop(outlier_idx, axis=0, inplace=True)

In [22]:
outlier_idx = get_outlier(df=outlier_data, column='b_0', weight=1.5)

In [23]:
outlier_data.drop(outlier_idx, axis=0, inplace=True)

In [24]:
x_train = outlier_data.drop(columns=['label'], axis=1)

In [25]:
y_train = outlier_data['label']

In [26]:
x_train.shape, y_train.shape

((6240, 616), (6240,))

In [27]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [28]:
x_train_scaled

array([[-0.47402531, -0.09438057, -0.42125577, ..., -0.24635151,
        -0.39302399, -0.36917917],
       [-0.43981176, -0.10385043, -0.39833036, ..., -0.23200274,
        -0.36766335, -0.33877131],
       [-0.45682297, -0.14147023, -0.41818759, ..., -0.24044319,
        -0.38635014, -0.36411119],
       ...,
       [-0.43350424, -0.04629792, -0.36878588, ..., -0.24888365,
        -0.39569353, -0.36917917],
       [ 1.54451043, -0.10568378,  0.92665126, ...,  0.12249646,
        -0.08736158,  0.05526389],
       [-0.44191427, -0.13361326, -0.41044949, ..., -0.22862655,
        -0.38101105, -0.35017426]])

In [None]:
x_test_scaled

In [None]:
pred_outlier = lgbm.fit(x_train_scaled, y_train).predict(x_test_scaled)

In [None]:
accuracy_score(y_test, pred_outlier)

In [None]:
sub_data = pd.read_csv('preprocessed_test.csv', encoding='euc-kr')

In [None]:
sub_data

In [None]:
sub_data = sub_data.drop(columns=['분석대상'], axis=1)

In [None]:
sub_data

In [None]:
lgbm = lgbm.fit(x_train, y_train)

In [None]:
pred_lgbm = lgbm.predict(sub_data)

In [None]:
sub = pd.read_csv('submission_ex.csv', encoding='euc-kr')
sub['정답지'] = pred_lgbm
sub.to_csv('sub_lgbm.csv', index = 0, encoding='euc-kr')

In [None]:
sub_rf = pd.read_csv('rf_sub.csv', encoding='euc-kr')

In [None]:
compare = pd.DataFrame(columns=['status'])

In [None]:
sub_lgbm = pd.read_csv('sub_lgbm.csv', encoding='euc-kr')

In [None]:
for i in range(5000):
    if sub_lgbm['정답지'][i] == sub_rf['정답지'][i]:
        compare.loc[i] = 0
    else:
        compare.loc[i] = 1

In [None]:
(compare['status']==0).sum()

In [None]:
(compare['status']==1).sum()

In [None]:
(sub_rf['정답지'] == 1).sum()

In [None]:
(sub_rf['정답지'] == 0).sum()

In [None]:
(sub_lgbm['정답지'] == 1).sum()

In [None]:
(sub_lgbm['정답지'] == 0).sum()

In [None]:
(y_train == 0).sum()

In [None]:
(y_train == 1).sum()

In [None]:
predict_proba = lgbm.predict_proba

In [None]:
predict_proba(x_test)[:]

In [None]:
predict_proba(x_test)[:,1]