In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
train_dataset = pd.read_pickle('Datasets/train_dataset.pkl')
test_dataset = pd.read_pickle('Datasets/test_dataset.pkl')

In [5]:

remove_cols = ['Land',
 'Root_shell',
 'Su_attempted',
 'Is_hot_login',
 'Is_guest_login',
 'Flag_OTH',
 'Flag_RSTO',
 'Flag_RSTOS0',
 'Flag_S1',
 'Flag_S2',
 'Flag_S3',
 'Flag_SH',
 'Num_failed_logins_scaled',
 'Num_file_creations_scaled',
 'Num_access_files_scaled',
 'attack_type',
 'is_attack',
 'attack_category']

final_cols_no_pca = [col for col in train_dataset.columns if (col not in remove_cols) and ('PCA' not in col)]

final_cols_pca = ['Land', 'Logged_in', 'Root_shell', 'Su_attempted', 'Is_hot_login', 'Is_guest_login', 'Protocol_type_icmp',
       'Protocol_type_tcp', 'Protocol_type_udp', 'Flag_OTH', 'Flag_REJ',
       'Flag_RSTO', 'Flag_RSTOS0', 'Flag_RSTR', 'Flag_S0', 'Flag_S1',
       'Flag_S2', 'Flag_S3', 'Flag_SF', 'Flag_SH', 'Service_encoded'] + [('PCA' + str(i)) for i in range(1,14 + 1)]

In [6]:
X_train_no_pca = train_dataset[final_cols_no_pca]
# attack = 1, normal = 0
#y_train_no_pca_is_attack = train_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_train_no_pca_attack_cat = train_dataset['attack_category']

X_test_no_pca = test_dataset[final_cols_no_pca]
# attack = 1, normal = 0
#y_test_no_pca_is_attack = test_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_test_no_pca_attack_cat = test_dataset['attack_category']

In [7]:
train_dataset.head()

Unnamed: 0,Land,Logged_in,Root_shell,Su_attempted,Is_hot_login,Is_guest_login,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,Flag_OTH,...,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14
0,0,0,0,0,0,0,0.0,0.0,1.0,0.0,...,-3.474426,0.903497,-0.694587,0.17223,0.294604,0.150987,0.234315,-0.280996,-0.277381,-0.345195
1,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,0.267239,0.15282,0.044606,0.091991,-0.028531,-0.008606,0.013408,-0.070227,-0.019272,0.013048
2,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.935711,0.39152,-0.006278,0.234796,0.123361,0.141845,0.123492,-0.188316,0.021826,-0.916639
3,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.629514,0.325415,0.044665,0.26645,0.126206,-0.024708,0.152279,-0.27695,0.050839,0.332695
4,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,1.253406,-0.111212,0.0351,-0.160819,-0.337899,-0.085937,-0.275472,0.239464,0.06654,0.157528


In [8]:
X_train_pca = train_dataset[final_cols_pca]
# attack = 1, normal = 0
#y_train_pca_is_attack = train_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_train_pca_attack_cat = train_dataset['attack_category']

X_test_pca = test_dataset[final_cols_pca]
# attack = 1, normal = 0
#y_test_pca_is_attack = test_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_test_pca_attack_cat = test_dataset['attack_category']

## XGB CLASSIFIER

### WITH PCA

In [86]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_pca_attack_cat)))
xgb_model.fit(X_train_pca, y_train_pca_attack_cat)

In [87]:
y_pred = xgb_model.predict(X_test_pca)

In [88]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [89]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))


Accuracy: 64.45%
Precision: 65.71%
Recall: 64.45%
F1 Score: 58.69%


### WITHOUT PCA

In [90]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_no_pca_attack_cat)))
xgb_model.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [91]:
#param_grid = {
 #   'max_depth': [3, 4, 5],
 #   'learning_rate': [0.1, 0.01, 0.001],
 #   'n_estimators': [100, 200, 300],
#}

#grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
#grid_search.fit(X_train_pca, y_train_pca_attack_cat)
# Get the best hyperparameters
#best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)
# Get the best model
#best_model = grid_search.best_estimator_

In [92]:
y_pred = xgb_model.predict(X_test_no_pca)

In [93]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

In [94]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 70.82%
Precision: 75.45%
Recall: 70.82%
F1 Score: 65.16%


In [13]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_no_pca_attack_cat)))
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

estimators = [('xgb', xgb_model), ('rf', rf_model)]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_pca_attack_cat))))
                                     
# Split your data into training and validation sets for stacking
X_train, X_val, y_train, y_val = train_test_split(X_train_no_pca, y_train_no_pca_attack_cat, test_size=0.2, random_state=42)

# Train the stacking model on the training data
stacking_model.fit(X_train, y_train)

# Make predictions on the validation data
stacking_pred = stacking_model.predict(X_val)

# Evaluate the stacking model
accuracy = accuracy_score(y_val, stacking_pred)
print("Stacking Model Accuracy on Validation Data: {:.2f}%".format(accuracy * 100))

# Train the stacking model on the full training data
stacking_model.fit(X_train_no_pca, y_train_no_pca_attack_cat)

# Make predictions on the test data
stacking_test_pred = stacking_model.predict(X_test_no_pca)

# Evaluate the stacking model on the test data
test_accuracy = accuracy_score(y_test_no_pca_attack_cat, stacking_test_pred)

print("Stacking Model Accuracy on Test Data: {:.2f}%".format(test_accuracy * 100))

Stacking Model Accuracy on Validation Data: 99.74%
Stacking Model Accuracy on Test Data: 71.32%


In [16]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, stacking_test_pred)
precision = precision_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')

print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 71.32%
Precision: 75.92%
Recall: 71.32%
F1 Score: 66.66%


## LOGISTIC REGRESSION with L1 L2 REGULARIZATION

### WITH PCA

In [95]:
lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, multi_class='multinomial', max_iter=1000)
lr.fit(X_train_pca, y_train_pca_attack_cat)



In [96]:
y_pred = lr.predict(X_test_pca)

In [97]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [98]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 57.69%
Precision: 52.77%
Recall: 57.69%
F1 Score: 51.44%


### WITHOUT PCA

In [99]:
lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, multi_class='multinomial', max_iter=1000)
lr.fit(X_train_no_pca, y_train_no_pca_attack_cat)



In [100]:
y_pred = lr.predict(X_test_no_pca)

In [101]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 58.70%
Precision: 57.55%
Recall: 58.70%
F1 Score: 52.42%


## LIGHT GBM

### WITH PCA

In [103]:
train_data = lgb.Dataset(X_train_pca, label=y_train_pca_attack_cat)

In [104]:
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_pca_attack_cat)),
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3660
[LightGBM] [Info] Number of data points in the train set: 25191, number of used features: 31
[LightGBM] [Info] Start training from score -0.627656
[LightGBM] [Info] Start training from score -1.003594
[LightGBM] [Info] Start training from score -2.398372
[LightGBM] [Info] Start training from score -4.791908
[LightGBM] [Info] Start training from score -7.736347


In [105]:
y_pred = lgb_model.predict(X_test_pca, num_iteration=lgb_model.best_iteration).argmax(axis=1)

In [106]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [107]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 61.18%
Precision: 69.42%
Recall: 61.18%
F1 Score: 58.46%


### WITHOUT PCA

In [108]:
train_data = lgb.Dataset(X_train_no_pca, label=y_train_no_pca_attack_cat)

In [109]:
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_no_pca_attack_cat)),
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2823
[LightGBM] [Info] Number of data points in the train set: 25191, number of used features: 28
[LightGBM] [Info] Start training from score -0.627656
[LightGBM] [Info] Start training from score -1.003594
[LightGBM] [Info] Start training from score -2.398372
[LightGBM] [Info] Start training from score -4.791908
[LightGBM] [Info] Start training from score -7.736347


In [110]:
y_pred = lgb_model.predict(X_test_no_pca, num_iteration=lgb_model.best_iteration).argmax(axis=1)

In [111]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [112]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 70.10%
Precision: 73.40%
Recall: 70.10%
F1 Score: 64.70%
