In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from vecstack import stacking

In [2]:
train = pd.read_csv('preprocessed_train.csv', encoding='euc-kr')

In [3]:
train.head()

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.10705,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.28824,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244


In [4]:
train = train.drop(columns=['분석데이터'], axis=1)

In [5]:
y = train['label']

In [6]:
x = train.drop(columns=['label'], axis=1)

In [7]:
features = x.columns.values

In [8]:
scaler = StandardScaler().fit(x)

In [9]:
x = scaler.transform(x)

In [10]:
x = pd.DataFrame(x, columns=features)

In [11]:
x

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,-0.464725,-0.036652,-0.327447,-0.540002,-0.024953,-0.238306,-0.172949,-0.225030,-0.188685,-0.376343,...,-0.233668,-0.235058,-0.253865,-0.264108,-0.188405,-0.268198,-0.402596,-0.282261,-0.391776,-0.375256
1,-0.423709,-0.041482,-0.310858,0.425357,-0.024953,-0.238306,-0.172949,-0.157390,-0.008605,-0.138858,...,-0.226774,-0.212625,-0.240627,-0.253218,-0.184802,-0.235944,-0.359646,-0.239533,-0.356918,-0.329717
2,-0.336643,-0.035875,-0.253867,0.484770,0.064761,-0.238306,-0.172949,-0.157390,-0.189900,-0.392686,...,-0.167655,-0.184440,-0.212900,-0.220390,-0.172065,-0.135601,-0.234100,-0.247130,-0.329807,-0.329717
3,-0.311971,-0.040225,-0.257538,-0.517611,0.054792,-0.238306,-0.172949,-0.225030,-0.134503,-0.314666,...,-0.165566,-0.191726,-0.218803,-0.226861,-0.182372,-0.268198,-0.375064,0.384291,-0.390485,-0.368931
4,0.084395,-0.017186,0.251516,0.031637,-0.024953,0.385573,6.611027,-0.197974,-0.067414,-0.219942,...,-0.083049,-0.066716,-0.046178,-0.106125,-0.161759,-0.210858,-0.043579,-0.202503,-0.294949,-0.066602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.348264,-0.033739,-0.253743,0.257255,-0.024953,1.321392,-0.172949,-0.089749,0.062708,0.133147,...,-0.184367,-0.200163,-0.218803,-0.228439,-0.181366,-0.238632,-0.314493,-0.227190,-0.313023,-0.284177
9996,-0.405003,-0.029298,-0.281607,0.020355,-0.024953,0.006789,-0.172949,-0.197974,-0.010674,-0.239500,...,-0.194185,-0.224896,-0.231683,-0.246115,-0.187064,-0.245800,-0.372861,-0.258524,-0.338844,-0.358811
9997,-0.473426,0.045452,-0.331745,-2.649522,-0.024953,-0.238306,-0.172949,-0.225030,-0.122677,-0.323108,...,-0.235757,-0.235825,-0.255653,-0.265528,-0.188740,-0.268198,-0.403697,-0.282261,-0.391776,-0.375256
9998,-0.267848,-0.014189,-0.101415,0.112897,-0.024953,0.630669,-0.172949,-0.130334,-0.100873,-0.353331,...,-0.144258,-0.046968,-0.111471,-0.131219,-0.179271,-0.157999,-0.204366,-0.187311,-0.294949,-0.266468


In [None]:
pca = PCA(n_components=616)

In [None]:
pca_array = pca.fit_transform(x)
pca_df = pd.DataFrame(pca_array, index=x.index,
                      columns=[f"pca{num+1}" for num in range(x.shape[1])])
pca_df.head()

In [None]:
# 주성분의 설명력과 기여율 구하기
result = pd.DataFrame({'설명가능한 분산 비율(고윳값)':pca.explained_variance_,
             '기여율':pca.explained_variance_ratio_},
            index=np.array([f"pca{num+1}" for num in range(x.shape[1])]))
result['누적기여율'] = result['기여율'].cumsum()
result

In [None]:
result[0:30]

In [None]:
result[31:60]

In [None]:
result[61:80]

In [None]:
result[81:100]

In [None]:
result[200:210]

pca25 : ~0.800575  
pca60 : ~0.900173  
pca100 : ~0.950234  
pca210 : ~0.990096

In [None]:
x

In [None]:
pca = PCA(n_components=25)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents)

In [None]:
principalDf

In [None]:
lgbm_model = LGBMClassifier(n_estimators = 10000,
                            learning_rate = 0.09416659111369403,
                            max_depth = 43,
                            boosting = 'gbdt',
                            objective = 'binary',
                            metric = 'binary_logloss',
                            is_training_metric = True,
                            num_leaves = 41,
                            min_data_in_leaf = 10,
                            feature_fraction = 0.8,
                            bagging_fraction = 0.9,
                            bagging_freq = 0,
                            alpha = 0.019782149081578264)

In [None]:
xgb = XGBClassifier(silent=False,
                    n_estimators = 10000,
                    booster='gbtree',
                    tree_method='gpu_hist',
                    preidctor= 'gpu_predictor',
                    scale_pos_weight=1,
                    learning_rate=0.03689407512484644,
                    objective='binary:logistic',
                    max_depth = 8,
                    subsample = 0.780714581166012,
                    colsample_bytree = 0.3723914688159835,
                    gamma = 0,
                    reg_lambda = 50.0,
                    random_state=42)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(principalDf, y, test_size=0.2, random_state=42)

In [None]:
pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
roc_auc_score(y_test, pred)

In [None]:
pred = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
roc_auc_score(y_test, pred)

In [None]:
pca = PCA(n_components=60)
principalComponents = pca.fit_transform(x)
principalDf2 = pd.DataFrame(data=principalComponents)

In [None]:
principalDf2

In [None]:
x_train, x_test, y_train, y_test = train_test_split(principalDf2, y, test_size=0.2, random_state=42)

In [None]:
pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
roc_auc_score(y_test, pred)

In [None]:
pca = PCA(n_components=100)
principalComponents = pca.fit_transform(x)
principalDf3 = pd.DataFrame(data=principalComponents)

In [None]:
principalDf3

In [None]:
x_train, x_test, y_train, y_test = train_test_split(principalDf3, y, test_size=0.2, random_state=42)

In [None]:
x_train.head()

In [None]:
pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
roc_auc_score(y_test, pred)

In [None]:
pca = PCA(n_components=210)
principalComponents = pca.fit_transform(x)
principalDf4 = pd.DataFrame(data=principalComponents)

In [None]:
principalDf4

In [None]:
x_train, x_test, y_train, y_test = train_test_split(principalDf4, y, test_size=0.2, random_state=42)

In [None]:
x_train.head()

In [None]:
pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
roc_auc_score(y_test, pred)

In [None]:
lgbm_model = LGBMClassifier(n_estimators = 10000, learning_rate = 0.01)

In [None]:
pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [23]:
xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'tree_method': 'gpu_hist',
               'gpu_id': 0,
               'predictor': 'gpu_predictor',
               'random_state': 42 }

lgb_params = {'n_estimators': 10000,
              'learning_rate':0.09416659111369403,
              'max_depth':43,
              'boosting':'gbdt',
              'objective': 'binary',
              'metric': 'binary_logloss',
              'is_training_metric': True,
              'num_leaves':41,
              'min_data_in_leaf':10,
              'feature_fraction':0.8,
              'bagging_fraction':0.9,
              'bagging_freq':0,
              'alpha': 0.019782149081578264 }

cat_params = {'objective': 'CrossEntropy',
              'colsample_bylevel': 0.043529438827711514,
              'depth': 12,
              'boosting_type': 'Ordered',
              'bootstrap_type': 'Bernoulli',
              'learning_rate': 0.19719860541901787,
              'iterations': 205,
              'random_strength': 34,
              'od_type': 'IncToDec',
              'subsample': 0.9558805603499683
             }

In [24]:
lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

cat = CatBoostClassifier(**cat_params)

In [None]:
models = [lgbm, xgb, cat]

In [None]:
S_train, S_test = stacking(models, x_train, y_train, x_test, regression=False,
                          metric=accuracy_score, n_folds=5, stratified=True, shuffle=True, random_state=42, verbose=2)

In [None]:
model = lgbm

In [None]:
model = model.fit(S_train, y_train)

In [None]:
y_pred = model.predict(S_test)

In [None]:
print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))

In [12]:
test = pd.read_csv('preprocessed_test.csv', encoding='euc-kr')

In [13]:
test = test.drop(columns=['분석대상'], axis=1)

In [14]:
test.head()

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,5063,9.419514,47691,5.630504,6,1,1,11,68704,24061,...,212,4485,451,312,128,159,135,283,101,277
1,5347,15.560875,83204,5.773314,0,2,14,4,4026,1829,...,824,492,1159,574,90,25,47,22,26,27
2,4523,11.875083,53711,6.146246,0,37,0,1,65732,973,...,249,307,372,289,115,90,248,95,118,77
3,6174,7.378037,45552,6.473256,0,1,17,13,30028,1158,...,366,441,363,469,331,272,457,311,320,291
4,22,7.090909,156,5.32463,0,0,0,1,22922,464,...,0,1,3,0,0,0,0,0,0,0


In [15]:
len(features)

616

In [16]:
test = scaler.transform(test)

In [17]:
test = pd.DataFrame(test, columns=features)

In [18]:
test.head()

Unnamed: 0,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,a_1,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,-0.15903,-0.041769,-0.199034,-0.165988,0.034856,-0.216025,0.25105,-0.089749,-0.122708,0.329324,...,-0.19147,0.624097,-0.174976,-0.216287,-0.178014,-0.125745,-0.255024,-0.013551,-0.261382,-0.024858
1,-0.141381,-0.030855,-0.099724,0.029029,-0.024953,-0.193744,5.76303,-0.184446,-0.187066,-0.33858,...,-0.063621,-0.141492,-0.048324,-0.174936,-0.181199,-0.2458,-0.351937,-0.261372,-0.358209,-0.341101
2,-0.192589,-0.037405,-0.182199,0.538295,-0.024953,0.586106,-0.172949,-0.22503,-0.125666,-0.364296,...,-0.18374,-0.176963,-0.189108,-0.219917,-0.179104,-0.187564,-0.13058,-0.192058,-0.239435,-0.277853
3,-0.089986,-0.045396,-0.205015,0.984851,-0.024953,-0.216025,7.035025,-0.062693,-0.161193,-0.358738,...,-0.159299,-0.151271,-0.190718,-0.191508,-0.161005,-0.024506,0.099587,0.013035,0.021351,-0.007148
4,-0.472307,-0.045907,-0.331963,-0.583681,-0.024953,-0.238306,-0.172949,-0.22503,-0.168264,-0.379588,...,-0.235757,-0.235633,-0.255117,-0.265528,-0.18874,-0.268198,-0.403697,-0.282261,-0.391776,-0.375256


In [19]:
pca = PCA(n_components=210)
pca = pca.fit(x)

In [21]:
principalComponents = pca.fit_transform(test)
principalDf_test = pd.DataFrame(data=principalComponents)

In [22]:
principalDf_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,-0.479893,-1.042447,-3.695223,-0.132008,0.094294,-0.208262,0.773192,0.083199,-0.140801,0.793633,...,0.283212,-0.139991,0.064927,-0.126348,0.183153,-0.025451,-0.166701,0.077189,-0.107480,0.029718
1,-0.500069,-1.785700,-3.707350,-0.103911,-0.212310,-0.366211,-1.012221,-0.543733,-0.398851,-0.464197,...,0.002146,-0.004325,0.020898,-0.069919,-0.033707,-0.007819,0.026685,-0.024289,-0.045168,0.002403
2,-0.494533,-1.724616,-3.818495,-0.098357,-0.118539,-0.271956,-0.343362,-0.178042,-0.012371,-0.370525,...,0.062262,-0.042090,0.054276,-0.088681,-0.002868,-0.129570,0.089823,-0.059098,-0.097344,-0.062695
3,-0.506916,-1.894428,-4.301183,-0.099206,-0.111321,-0.104390,-0.680470,-0.183449,0.052488,-0.405840,...,0.015599,0.022321,-0.065925,0.031570,0.024398,-0.003170,-0.037472,0.019177,0.003601,0.004840
4,-0.515042,-1.965683,-4.359528,-0.096262,-0.171784,-0.442914,-0.794846,-0.627529,-0.862421,-0.409338,...,0.016201,-0.000173,-0.009450,0.003514,-0.011247,-0.007632,-0.010747,0.000582,-0.006895,0.004068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,-0.522781,-2.102479,-4.810410,-0.089096,-0.138691,-0.429368,-0.544981,-0.561986,-0.782731,-0.486986,...,0.013140,0.008911,-0.010168,-0.001211,-0.000698,-0.006150,-0.020525,-0.003843,-0.013742,0.010339
4996,-0.406731,-0.266547,1.277825,-0.147099,-0.443486,-0.494569,-2.094268,-0.813846,-0.802809,-0.557197,...,-0.032559,-0.020936,0.009406,0.034586,-0.052359,-0.023719,-0.032103,-0.010603,-0.013549,-0.036253
4997,-0.457099,-1.169472,-2.538778,-0.087031,-0.044838,-0.434240,1.221197,-0.260945,-0.729430,-0.410401,...,-0.089402,0.044033,0.031802,0.203404,0.030040,0.268306,-0.101533,-0.136026,-0.009210,0.088656
4998,-0.415299,-0.593689,-0.895969,-0.067875,0.123391,-0.418352,3.373624,0.189995,-0.609644,-1.548955,...,0.049782,-0.035786,0.013540,-0.038184,-0.083320,-0.056936,0.009953,-0.007405,-0.060769,-0.004641


In [36]:
lgbm.fit(x, y)

LGBMClassifier(alpha=0.019782149081578264, bagging_fraction=0.9, bagging_freq=0,
               boosting='gbdt', feature_fraction=0.8, is_training_metric=True,
               learning_rate=0.09416659111369403, max_depth=43,
               metric='binary_logloss', min_data_in_leaf=10, n_estimators=10000,
               num_leaves=41, objective='binary')

In [32]:
x = pca.transform(x)

In [37]:
sub_pred = lgbm.predict(principalDf_test)

In [38]:
sub_rf = pd.read_csv('rf_sub.csv', encoding='euc-kr')

In [39]:
sub = pd.read_csv('submission_ex.csv', encoding='euc-kr')
sub['정답지'] = sub_pred
sub.to_csv('sub_pca.csv', index = 0, encoding='euc-kr')

In [40]:
sub_pca = pd.read_csv('sub_pca.csv', encoding='euc-kr')

In [41]:
compare = pd.DataFrame(columns=['status'])

In [42]:
compare

Unnamed: 0,status


In [43]:
for i in range(5000):
    if sub_pca['정답지'][i] == sub_rf['정답지'][i]:
        compare.loc[i] = 0
    else:
        compare.loc[i] = 1

In [44]:
(compare['status']==0).sum()

3866

In [45]:
(compare['status']==1).sum()

1134

In [50]:
sub_stacking = pd.read_csv('sub_stack.csv', encoding='euc-kr')

In [46]:
(sub_rf['정답지'] == 1).sum()

3485

In [47]:
(sub_rf['정답지'] == 0).sum()

1515

In [48]:
(sub_pca['정답지'] == 1).sum()

2901

In [49]:
(sub_pca['정답지'] == 0).sum()

2099

In [51]:
(sub_stacking['정답지'] == 1).sum()

3456

In [52]:
(sub_stacking['정답지'] == 0).sum()

1544