In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score


%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train = train.drop(columns=['분석데이터'], axis=1)

In [4]:
train

Unnamed: 0,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,a_0,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,144,12.298611,1771,5.356616,0,0,0,1,2399,...,10,4,10,9,4,0,1,0,0,0
1,1,804,9.580846,7703,6.063542,0,0,0,6,183376,...,43,121,84,78,47,36,40,45,27,36
2,0,2205,12.736054,28083,6.107050,9,0,0,6,1178,...,326,268,239,286,199,148,154,37,48,36
3,0,2602,10.288240,26770,5.373013,8,0,0,1,56851,...,336,230,206,245,76,0,26,702,1,5
4,1,8980,23.252339,208806,5.775223,0,28,16,3,124274,...,731,882,1171,1010,322,64,327,84,75,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,2018,13.938057,28127,5.940442,0,70,0,11,255044,...,246,186,206,235,88,33,81,58,61,72
9996,0,1105,16.437104,18163,5.766962,0,11,0,3,181296,...,199,57,134,123,20,25,28,25,41,13
9997,0,4,58.500000,234,3.811827,0,0,0,1,68736,...,0,0,0,0,0,0,0,0,0,0
9998,1,3312,24.939312,82599,5.834730,0,39,0,8,90648,...,438,985,806,851,113,123,181,100,75,86


In [None]:
test_size = 0.3
shuffle = True 
random_state = None
from sklearn.model_selection import train_test_split

df_y = train["label"]
df_x = train.drop("label", axis =1)
X_train, X_test , y_train, y_test =\
train_test_split(df_x , df_y , test_size=test_size, random_state=random_state, shuffle=shuffle)

In [None]:
X_test["is_train"] = 0 
X_train["is_train"] = 1
df_combine = pd.concat([X_train, X_test], axis=0, ignore_index=True)
y = df_combine['is_train'].values
x = df_combine.drop('is_train', axis=1).values

In [None]:
tst, trn = X_test.values, X_train.values
m = RandomForestClassifier(n_jobs= -1 , max_depth=5, min_samples_leaf = 5)
predictions = np.zeros(y.shape)
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=100)

## https://nzw0301.github.io/2016/02/multiprocessing-paralell
## multiprocessing sklearn
def prob_one_fold(train_idx , test_idx) :
    X_train_fold , X_test_fold = x[train_idx], x[test_idx]
    y_train_fold , y_test_fold = y[train_idx], y[test_idx]
    m.fit(X_train_fold , y_train_fold )
    probs = m.predict_proba(X_test_fold)[:, 1] #calculating the probability
    #predictions[test_idx] = probs
    Output = pd.DataFrame([test_idx , probs]).T
    return Output

from joblib import Parallel, delayed

Pred = Parallel(n_jobs=3)\
(delayed(prob_one_fold)(train_idxs, test_idxs)\
 for train_idxs, test_idxs in skf.split(x, y))
 
OUT = pd.concat(Pred,axis = 0)
OUT.columns = ["idx", "p"]
OUT.sort_values(by = ["idx"] , inplace = True)
OUT.reset_index(drop= True , inplace = True)
predictions = OUT["p"].values
from sklearn.metrics import roc_auc_score as AUC
print("ROC-AUC for train and test distributions:", AUC(y, predictions))

In [5]:
train_df_x = train.drop(['label'], axis=1)

In [6]:
train_df_y = train['label']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train_df_x, train_df_y, test_size=0.2, random_state=42)

In [8]:
xgb = XGBClassifier(silent=False,
                              n_estimators = 10000,
                              booster='gbtree',
                              tree_method='gpu_hist',
                              preidctor= 'gpu_predictor',
                              scale_pos_weight=1,
                              learning_rate=0.03689407512484644,
                              objective='binary:logistic',
                              max_depth = 8,
                              subsample = 0.780714581166012,
                              colsample_bytree = 0.3723914688159835,
                              gamma = 0,
                              reg_lambda = 50.0,
                              random_state=42)

In [9]:
lgbm_model = LGBMClassifier(n_estimators = 10000,
                            learning_rate = 0.09416659111369403,
                            max_depth = 43,
                            boosting = 'gbdt',
                            objective = 'binary',
                            metric = 'binary_logloss',
                            is_training_metric = True,
                            num_leaves = 41,
                            min_data_in_leaf = 10,
                            feature_fraction = 0.8,
                            bagging_fraction = 0.9,
                            bagging_freq = 0,
                            alpha = 0.019782149081578264)

In [10]:
model = xgb.fit(x_train, y_train)



Parameters: { "preidctor", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [11]:
model_pred = model.predict(x_test)

In [12]:
roc_auc_score(y_test, model_pred)

0.9192424242424241

In [13]:
pred = lgbm_model.fit(x_train, y_train).predict(x_test)



In [14]:
roc_auc_score(y_test, pred)

0.9238383838383839

In [None]:
# dimensionality reduction using feature importance

df_imp = pd.DataFrame({'imp':model.feature_importances_}, index = model.get_booster().feature_names)
df_imp = df_imp[df_imp.imp > 0].sort_values('imp').copy()

feat_num = df_imp.shape[0]
print("total number of features =", feat_num)
df_imp

In [16]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RepeatedStratifiedKFold

In [17]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
rfe = RFECV(estimator=lgbm_model, cv=cv)

In [18]:
rfe.fit(x_train, y_train)





























































































































































































































































































































































































































































































































































KeyboardInterrupt: 

In [None]:
xgb_pred1 = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, xgb_pred1)

In [None]:
lgbm_pred1 = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, lgbm_pred1)

In [None]:
ft_importance_values = xgb.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (XGB)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
ft_importance_values = lgbm_model.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (LGBM)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
x_train = x_train.drop(columns=['b_240'], axis=1)
x_test = x_test.drop(columns=['b_240'], axis=1)

In [None]:
xgb_pred2 = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, xgb_pred2)

In [None]:
lgbm_pred2 = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, lgbm_pred2)

In [None]:
ft_importance_values = xgb.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (XGB)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
ft_importance_values = lgbm_model.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (LGBM)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
x_train = x_train.drop(columns=['b_0'], axis=1)
x_test = x_test.drop(columns=['b_0'], axis=1)

In [None]:
xgb_pred3 = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, xgb_pred3)

In [None]:
lgbm_pred3 = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, lgbm_pred3)

In [None]:
ft_importance_values = xgb.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (XGB)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
ft_importance_values = lgbm_model.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (LGBM)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
x_train = x_train.drop(columns=['b_255'], axis=1)
x_test = x_test.drop(columns=['b_255'], axis=1)

In [None]:
xgb_pred4 = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, xgb_pred4)

In [None]:
lgbm_pred4 = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, lgbm_pred4)

In [None]:
ft_importance_values = xgb.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (XGB)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
ft_importance_values = lgbm_model.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (LGBM)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
x_train = x_train.drop(columns=['a_0'], axis=1)
x_test = x_test.drop(columns=['a_0'], axis=1)

In [None]:
xgb_pred5 = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, xgb_pred5)

In [None]:
lgbm_pred5 = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
roc_auc_score(y_test, lgbm_pred5)

In [None]:
ft_importance_values = xgb.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (XGB)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
ft_importance_values = lgbm_model.feature_importances_

ft_series = pd.Series(ft_importance_values, index = x_train.columns)
ft_top20 = ft_series.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances (LGBM)')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
pred = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
x_train = x_train.drop(columns=['b_245'], axis=1)
x_test = x_test.drop(columns=['b_245'], axis=1)

In [None]:
pred = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
x_train = x_train.drop(columns=['b_244'], axis=1)
x_test = x_test.drop(columns=['b_244'], axis=1)

In [None]:
pred = xgb.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
lgbm_pred = lgbm_model.fit(x_train, y_train).predict(x_test)

In [None]:
accuracy_score(y_test, lgbm_pred)

In [None]:
roc_auc_score(y_test, lgbm_pred)