In [195]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [196]:
train_dataset = pd.read_pickle('Datasets/train_dataset.pkl')
test_dataset = pd.read_pickle('Datasets/test_dataset.pkl')

In [197]:

remove_cols = ['Land',
 'Root_shell',
 'Su_attempted',
 'Is_hot_login',
 'Is_guest_login',
 'Flag_OTH',
 'Flag_RSTO',
 'Flag_RSTOS0',
 'Flag_S1',
 'Flag_S2',
 'Flag_S3',
 'Flag_SH',
 'Num_failed_logins_scaled',
 'Num_file_creations_scaled',
 'Num_access_files_scaled',
 'attack_type',
 'is_attack',
 'attack_category']

final_cols_no_pca = [col for col in train_dataset.columns if (col not in remove_cols) and ('PCA' not in col)]

final_cols_pca = ['Land', 'Logged_in', 'Root_shell', 'Su_attempted', 'Is_hot_login', 'Is_guest_login', 'Protocol_type_icmp',
       'Protocol_type_tcp', 'Protocol_type_udp', 'Flag_OTH', 'Flag_REJ',
       'Flag_RSTO', 'Flag_RSTOS0', 'Flag_RSTR', 'Flag_S0', 'Flag_S1',
       'Flag_S2', 'Flag_S3', 'Flag_SF', 'Flag_SH', 'Service_encoded'] + [('PCA' + str(i)) for i in range(1,14 + 1)]

In [198]:
X_train_no_pca = train_dataset[final_cols_no_pca]
# attack = 1, normal = 0
#y_train_no_pca_is_attack = train_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_train_no_pca_attack_cat = train_dataset['attack_category']

X_test_no_pca = test_dataset[final_cols_no_pca]
# attack = 1, normal = 0
#y_test_no_pca_is_attack = test_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_test_no_pca_attack_cat = test_dataset['attack_category']

In [199]:
train_dataset.head()

Unnamed: 0,Land,Logged_in,Root_shell,Su_attempted,Is_hot_login,Is_guest_login,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,Flag_OTH,...,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14
0,0,0,0,0,0,0,0.0,0.0,1.0,0.0,...,-3.474426,0.903497,-0.694587,0.17223,0.294604,0.150987,0.234315,-0.280996,-0.277381,-0.345195
1,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,0.267239,0.15282,0.044606,0.091991,-0.028531,-0.008606,0.013408,-0.070227,-0.019272,0.013048
2,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.935711,0.39152,-0.006278,0.234796,0.123361,0.141845,0.123492,-0.188316,0.021826,-0.916639
3,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.629514,0.325415,0.044665,0.26645,0.126206,-0.024708,0.152279,-0.27695,0.050839,0.332695
4,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,1.253406,-0.111212,0.0351,-0.160819,-0.337899,-0.085937,-0.275472,0.239464,0.06654,0.157528


In [200]:
X_train_pca = train_dataset[final_cols_pca]
# attack = 1, normal = 0
#y_train_pca_is_attack = train_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_train_pca_attack_cat = train_dataset['attack_category']

X_test_pca = test_dataset[final_cols_pca]
# attack = 1, normal = 0
#y_test_pca_is_attack = test_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_test_pca_attack_cat = test_dataset['attack_category']

## XGB CLASSIFIER

### WITH PCA

In [201]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_pca_attack_cat)))
xgb_model.fit(X_train_pca, y_train_pca_attack_cat)

In [202]:
y_pred = xgb_model.predict(X_test_pca)

In [203]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [204]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))


Accuracy: 64.45%
Precision: 65.71%
Recall: 64.45%
F1 Score: 58.69%


### WITHOUT PCA

### Hyperparameter Tuning

In [205]:
def evaluate(model, X, y):
    skf = StratifiedKFold(n_splits=5)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, pred)
        precision = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted') 
        f1 = f1_score(y_test, pred, average='weighted') 

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    print('{0} Accuracy: {1: .5f} Precision: {2: .5f} Recall: {3: .5f} F1_Score: {4: .5f}'.format(
        model.__class__.__name__, 
        np.mean(accuracy_scores),
        np.mean(precision_scores),
        np.mean(recall_scores),
        np.mean(f1_scores)))
    
    return -np.mean(f1_scores)

In [206]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials, STATUS_OK

# optimize f1_score
xgboost_search_space = {'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X_train_no_pca, y_train_no_pca_attack_cat):
    X_train, y_train = X_train_no_pca.iloc[train_index], y_train_no_pca_attack_cat.iloc[train_index]
    X_test, y_test = X_train_no_pca.iloc[test_index], y_train_no_pca_attack_cat.iloc[test_index]

def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = xgboost_search_space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                 
0.8824930527987297                                     
SCORE:                                                                            
0.8805081381500596                                                                
SCORE:                                                                            
0.8930131004366813                                                                
SCORE:                                                                            
0.8947995236204843                                                                
SCORE:                                                                            
0.8949980150853514                                                                
SCORE:                                                                            
0.8866613735609369                                                                
SCORE:                                                    

In [207]:
best_hyperparams

{'colsample_bytree': 0.5444739124605127,
 'gamma': 1.6937366745571834,
 'max_depth': 17.0,
 'min_child_weight': 6.0,
 'reg_alpha': 40.0,
 'reg_lambda': 0.7362796364613553}

In [208]:
optimal_classifier = xgb.XGBClassifier(
    colsample_bytree=best_hyperparams['colsample_bytree'],
    gamma=best_hyperparams['gamma'],
    max_depth= int(best_hyperparams['max_depth']),
    min_child_weight=best_hyperparams['min_child_weight'],
    reg_alpha=best_hyperparams['reg_alpha'],
    reg_lambda=best_hyperparams['reg_lambda']
)

optimal_classifier.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [211]:
y_pred = optimal_classifier.predict(X_test_no_pca)

In [212]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

In [213]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 70.18%
Precision: 74.47%
Recall: 70.18%
F1 Score: 63.01%


In [214]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_no_pca_attack_cat)))
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

estimators = [('xgb', xgb_model), ('rf', rf_model)]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_pca_attack_cat))))
                                     
# Split your data into training and validation sets for stacking
X_train, X_val, y_train, y_val = train_test_split(X_train_no_pca, y_train_no_pca_attack_cat, test_size=0.2, random_state=42)

# Train the stacking model on the training data
stacking_model.fit(X_train, y_train)

# Make predictions on the validation data
stacking_pred = stacking_model.predict(X_val)

# Evaluate the stacking model
accuracy = accuracy_score(y_val, stacking_pred)
print("Stacking Model Accuracy on Validation Data: {:.2f}%".format(accuracy * 100))

# Train the stacking model on the full training data
stacking_model.fit(X_train_no_pca, y_train_no_pca_attack_cat)

# Make predictions on the test data
stacking_test_pred = stacking_model.predict(X_test_no_pca)

# Evaluate the stacking model on the test data
test_accuracy = accuracy_score(y_test_no_pca_attack_cat, stacking_test_pred)

print("Stacking Model Accuracy on Test Data: {:.2f}%".format(test_accuracy * 100))

Stacking Model Accuracy on Validation Data: 99.74%
Stacking Model Accuracy on Test Data: 71.32%


In [215]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, stacking_test_pred)
precision = precision_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')



## LOGISTIC REGRESSION with L1 L2 REGULARIZATION

### WITH PCA

In [216]:
lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, multi_class='multinomial', max_iter=1000)
lr.fit(X_train_pca, y_train_pca_attack_cat)

In [217]:
y_pred = lr.predict(X_test_pca)

In [218]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [219]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 57.69%
Precision: 52.77%
Recall: 57.69%
F1 Score: 51.44%


### WITHOUT PCA

### Hyperparamter Tuning

In [220]:
warnings.simplefilter(action='ignore', category=FutureWarning)
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type (L1 or L2)
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100],  # Inverse of regularization strength
    'solver': ['liblinear', 'lbfgs'],  # Optimization algorithm
    'max_iter': [100, 200, 300]  # Maximum number of iterations for convergence
}

logistic_regression = LogisticRegression()

grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_pca, y_train_pca_attack_cat)
#Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

90 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samsepiol/opt/anaconda3/envs/cs3244/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samsepiol/opt/anaconda3/envs/cs3244/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/samsepiol/opt/anaconda3/envs/cs3244/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfg

Best Hyperparameters: {'C': 100, 'max_iter': 300, 'penalty': 'l2', 'solver': 'lbfgs'}


In [223]:
lr = LogisticRegression(C=best_params['C'], max_iter=best_params['max_iter'], penalty=best_params['penalty'], solver=best_params['solver'])
lr.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [228]:
#lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, multi_class='multinomial', max_iter=1000)
#lr.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [225]:
y_pred = lr.predict(X_test_no_pca)

In [226]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

In [227]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 61.79%
Precision: 56.39%
Recall: 61.79%
F1 Score: 55.59%


## LIGHT GBM

### WITH PCA

In [137]:
train_data = lgb.Dataset(X_train_pca, label=y_train_pca_attack_cat)

In [138]:
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_pca_attack_cat)),
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3660
[LightGBM] [Info] Number of data points in the train set: 25191, number of used features: 31
[LightGBM] [Info] Start training from score -0.627656
[LightGBM] [Info] Start training from score -1.003594
[LightGBM] [Info] Start training from score -2.398372
[LightGBM] [Info] Start training from score -4.791908
[LightGBM] [Info] Start training from score -7.736347


In [139]:
y_pred = lgb_model.predict(X_test_pca, num_iteration=lgb_model.best_iteration).argmax(axis=1)

In [140]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [141]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 61.07%
Precision: 69.20%
Recall: 61.07%
F1 Score: 58.28%


### WITHOUT PCA

### Hyperparameter Tuning

In [None]:
# optimize f1_score
SEARCH_PARAMS = {'learning_rate': 0.4,
                 'max_depth': 15,
                 'num_leaves': 20,
                 'feature_fraction': 0.8,
                 'subsample': 0.2}

FIXED_PARAMS={'objective': 'binary',
              'metric': 'auc',
              'is_unbalance':True,
              'boosting':'gbdt',
              'num_boost_round':300,
              'early_stopping_rounds':30}

skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X_train_no_pca, y_train_no_pca_attack_cat):
    X_train, y_train = X_train_no_pca.iloc[train_index], y_train_no_pca_attack_cat.iloc[train_index]
    X_test, y_test = X_train_no_pca.iloc[test_index], y_train_no_pca_attack_cat.iloc[test_index]

def objective(space):
    clf=lgb.train()
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = xgboost_search_space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

In [142]:
train_data = lgb.Dataset(X_train_no_pca, label=y_train_no_pca_attack_cat)

In [143]:
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_no_pca_attack_cat)),
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2823
[LightGBM] [Info] Number of data points in the train set: 25191, number of used features: 28
[LightGBM] [Info] Start training from score -0.627656
[LightGBM] [Info] Start training from score -1.003594
[LightGBM] [Info] Start training from score -2.398372
[LightGBM] [Info] Start training from score -4.791908
[LightGBM] [Info] Start training from score -7.736347


In [144]:
y_pred = lgb_model.predict(X_test_no_pca, num_iteration=lgb_model.best_iteration).argmax(axis=1)

In [145]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [146]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 70.10%
Precision: 73.40%
Recall: 70.10%
F1 Score: 64.70%
