In [242]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [243]:
# Read the three train/test CSV pairs used throughout the notebook.
# Each pair is read in (train, test) order, exactly one `pd.read_csv` per file.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/train_dataset.csv', 'Datasets/test_dataset.csv')
)

PCA_train_dataset, PCA_test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/PCA_train_data.csv', 'Datasets/PCA_test_data.csv')
)

Tree_train_dataset, Tree_test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/Tree_train_data.csv', 'Datasets/Tree_test_data.csv')
)

In [244]:
# Label columns present in every dataset; none of them may be used as a feature.
LABEL_COLUMNS = ['is_attack', 'attack_category', 'attack']


def _split_features_target(frame):
    """Return `(features, target)` for one dataset.

    `features` is `frame` without the three label columns and `target` is the
    `attack_category` column.
    """
    return frame.drop(columns=LABEL_COLUMNS), frame['attack_category']


X_train, y_train = _split_features_target(train_dataset)
X_test, y_test = _split_features_target(test_dataset)

X_pca_train, y_pca_train = _split_features_target(PCA_train_dataset)
X_pca_test, y_pca_test = _split_features_target(PCA_test_dataset)

X_tree_train, y_tree_train = _split_features_target(Tree_train_dataset)
X_tree_test, y_tree_test = _split_features_target(Tree_test_dataset)

In [245]:
def train_model(model, X, y):
    """Cross-validate `model` and return the negated mean weighted F1 score.

    The data is split with a 5-fold `StratifiedKFold`; for every fold the model
    is re-fitted on the training part and scored on the held-out part.
    Accuracy, weighted precision and weighted recall are computed per fold as
    well, but only the F1 scores contribute to the return value.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
    X : object indexable with `.iloc` (rows = samples)
    y : object indexable with `.iloc` (one label per row of `X`)

    Returns
    -------
    float
        `-mean(F1 over folds)`; negated so that it can be minimised as a loss.
    """
    splitter = StratifiedKFold(n_splits=5)
    fold_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for fit_idx, val_idx in splitter.split(X, y):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(X_fit, y_fit)
        fold_pred = model.predict(X_val)

        fold_metrics['accuracy'].append(accuracy_score(y_val, fold_pred))
        fold_metrics['precision'].append(precision_score(y_val, fold_pred, average='weighted'))
        fold_metrics['recall'].append(recall_score(y_val, fold_pred, average='weighted'))
        fold_metrics['f1'].append(f1_score(y_val, fold_pred, average='weighted'))

    return -np.mean(fold_metrics['f1'])

In [246]:
def evaluate(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data, report test metrics and return the F1.

    Accuracy plus weighted precision, recall and F1 are computed on the
    predictions for `X_test` and printed together with the model's class name.

    Returns
    -------
    float
        The weighted F1 score on the test data.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # (label, value) pairs in print order; the last entry is the returned F1.
    report = [
        ("Accuracy: ", accuracy_score(y_test, y_pred)),
        ("Precision: ", precision_score(y_test, y_pred, average='weighted')),
        ("Recall: ", recall_score(y_test, y_pred, average='weighted')),
        ("F1 Score: ", f1_score(y_test, y_pred, average='weighted')),
    ]

    print("Model: ", model.__class__.__name__)
    for label, value in report:
        print(label, value)
    print("####################################")

    return report[-1][1]

In [247]:
def best_params(objective_func, search_space, max_evals=20, seed=30):
    """Run a hyperopt TPE search and return the best point found.

    Parameters
    ----------
    objective_func : callable
        Passed to `fmin` as `fn`; receives one sample of `search_space` and
        returns the loss to minimise.
    search_space : dict
        hyperopt search-space definition, passed to `fmin` as `space`.
    max_evals : int, default 20
        Number of evaluations `fmin` is allowed to run.
    seed : int, default 30
        Seed of the NumPy generator handed to `fmin` as `rstate`.

    Returns
    -------
    dict
        The value returned by `fmin` (also printed).
    """
    trials = Trials()

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.default_rng(seed=seed))

    print('Best Parameters:', best)
    return best

In [248]:
# Set random_states as 42 
rs_value = 42

# Hyperparameter Tuning for XGB, LOGREG, LGBM

In [249]:
def best_params(objective_func, search_space, max_evals=50, seed=30):
    """Run a hyperopt TPE search and return the best point found.

    Parameters
    ----------
    objective_func : callable
        Passed to `fmin` as `fn`; receives one sample of `search_space` and
        returns the loss to minimise.
    search_space : dict
        hyperopt search-space definition, passed to `fmin` as `space`.
    max_evals : int, default 50
        Number of evaluations `fmin` is allowed to run.
    seed : int, default 30
        Seed of the NumPy generator handed to `fmin` as `rstate`.

    Returns
    -------
    dict
        The value returned by `fmin` (also printed).
    """
    trials = Trials()

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.default_rng(seed=seed))

    # Message previously read 'Beat Parameters:' (typo).
    print('Best Parameters:', best)
    return best

### XGBoost

In [250]:
# Search space for XGBoost; the objective returns the negated cross-validated
# weighted F1, so minimising it maximises F1.
xgboost_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def xgboost_objective_func(search_space):
    """Score one sampled XGBoost configuration on the PCA training data.

    The three `quniform` parameters are cast to `int` before the classifier is
    built. Returns whatever `train_model` returns for that classifier.
    """
    sampled_params = {
        'learning_rate': search_space['learning_rate'],
        'n_estimators': int(search_space['n_estimators']),
        'max_depth': int(search_space['max_depth']),
        'min_child_weight': int(search_space['min_child_weight']),
        'subsample': search_space['subsample'],
        'colsample_bytree': search_space['colsample_bytree'],
    }
    candidate_clf = xgb.XGBClassifier(random_state=0, **sampled_params)

    return train_model(candidate_clf, X_pca_train, y_pca_train)


xgboost_best = best_params(xgboost_objective_func, xgboost_search_space)

 10%|█         | 5/50 [25:09<4:26:31, 355.37s/trial, best loss: -0.9948858870461976]

In [None]:
# NOTE(review): `xgb_model` is not used in this cell.
xgb_model = xgb.XGBClassifier(random_state = rs_value)

# Rebuild the classifier from the point returned by the hyperopt search; the
# quniform-sampled values are cast back to int.
tuned_xgb_params = {
    'learning_rate': xgboost_best['learning_rate'],
    'n_estimators': int(xgboost_best['n_estimators']),
    'max_depth': int(xgboost_best['max_depth']),
    'min_child_weight': int(xgboost_best['min_child_weight']),
    'subsample': xgboost_best['subsample'],
    'colsample_bytree': xgboost_best['colsample_bytree'],
    'random_state': 0,
}
xgboost_clf = xgb.XGBClassifier(**tuned_xgb_params)

# Fit on the PCA training data and report metrics on the PCA test data.
evaluate(xgboost_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)


### Logistic Regression

### Light GBM

In [None]:

# Columns excluded from the no-PCA feature matrix.
# 'attack' is listed because the label columns dropped from `train_dataset`
# earlier in the notebook are 'is_attack', 'attack_category' and 'attack';
# without it that label column would end up among the features.
# 'attack_type' is kept for backward compatibility (entries that are absent
# from the frame are simply never matched).
remove_cols = ['Land',
 'Root_shell',
 'Su_attempted',
 'Is_hot_login',
 'Is_guest_login',
 'Flag_OTH',
 'Flag_RSTO',
 'Flag_RSTOS0',
 'Flag_S1',
 'Flag_S2',
 'Flag_S3',
 'Flag_SH',
 'Num_failed_logins_scaled',
 'Num_file_creations_scaled',
 'Num_access_files_scaled',
 'attack_type',
 'attack',
 'is_attack',
 'attack_category']

# Every remaining column whose name does not contain 'PCA'.
final_cols_no_pca = [col for col in train_dataset.columns if (col not in remove_cols) and ('PCA' not in col)]

# Fixed list of flag/protocol columns followed by the components 'PCA1' .. 'PCA14'.
final_cols_pca = ['Land', 'Logged_in', 'Root_shell', 'Su_attempted', 'Is_hot_login', 'Is_guest_login', 'Protocol_type_icmp',
       'Protocol_type_tcp', 'Protocol_type_udp', 'Flag_OTH', 'Flag_REJ',
       'Flag_RSTO', 'Flag_RSTOS0', 'Flag_RSTR', 'Flag_S0', 'Flag_S1',
       'Flag_S2', 'Flag_S3', 'Flag_SF', 'Flag_SH', 'Service_encoded'] + [('PCA' + str(i)) for i in range(1,14 + 1)]

In [None]:
X_train_no_pca = train_dataset[final_cols_no_pca]
# attack = 1, normal = 0
#y_train_no_pca_is_attack = train_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_train_no_pca_attack_cat = train_dataset['attack_category']

X_test_no_pca = test_dataset[final_cols_no_pca]
# attack = 1, normal = 0
#y_test_no_pca_is_attack = test_dataset['is_attack']
# attack_category 0 (normal),1 (dos),2 (probe),3 (r2l),4 (u2r)
y_test_no_pca_attack_cat = test_dataset['attack_category']

# The "WITH PCA" cells below read X_train_pca / y_train_pca_attack_cat /
# X_test_pca / y_test_pca_attack_cat, which were never assigned anywhere in the
# notebook (NameError on a fresh kernel). Bind them to the PCA matrices built
# from the PCA_* CSV files.
# NOTE(review): assumes the PCA_* files are the intended PCA feature set; the
# alternative would be `train_dataset[final_cols_pca]` -- confirm.
X_train_pca, y_train_pca_attack_cat = X_pca_train, y_pca_train
X_test_pca, y_test_pca_attack_cat = X_pca_test, y_pca_test

## XGB CLASSIFIER

### WITH PCA

In [None]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=len(set(y_train_pca_attack_cat)))
xgb_model.fit(X_train_pca, y_train_pca_attack_cat)

In [None]:
y_pred = xgb_model.predict(X_test_pca)

In [None]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [None]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))


### WITHOUT PCA

### Hyperparameter Tuning

In [None]:
def evaluate(model, X, y, X_test=None, y_test=None):
    """Evaluate `model` either by 5-fold cross-validation or on a hold-out set.

    This definition replaces an earlier 5-argument `evaluate` in the notebook;
    the optional `X_test` / `y_test` parameters keep those 5-argument calls
    working after this cell has run.

    Cross-validation mode (`X_test` and `y_test` omitted)
        `X`, `y` are split with a 5-fold `StratifiedKFold`; the model is fitted
        on each training part and scored on the held-out part. The mean
        accuracy, weighted precision, weighted recall and weighted F1 are
        printed, and `-mean(F1)` is returned (negated for use as a loss).

    Hold-out mode (`X_test` and `y_test` given)
        The model is fitted on `X`, `y` and scored on `X_test`, `y_test`. The
        four metrics are printed and the weighted F1 is returned (not negated).

    Raises
    ------
    ValueError
        If only one of `X_test` / `y_test` is supplied.
    """
    if (X_test is None) != (y_test is None):
        raise ValueError("X_test and y_test must be provided together")

    if X_test is not None:
        model.fit(X, y)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        print("Model: ", model.__class__.__name__)
        print("Accuracy: ", accuracy)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1 Score: ", f1)
        print("####################################")

        return f1

    skf = StratifiedKFold(n_splits=5)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, val_index in skf.split(X, y):
        X_fold_train, y_fold_train = X.iloc[train_index], y.iloc[train_index]
        X_fold_val, y_fold_val = X.iloc[val_index], y.iloc[val_index]

        model.fit(X_fold_train, y_fold_train)

        pred = model.predict(X_fold_val)
        accuracy_scores.append(accuracy_score(y_fold_val, pred))
        precision_scores.append(precision_score(y_fold_val, pred, average='weighted'))
        recall_scores.append(recall_score(y_fold_val, pred, average='weighted'))
        f1_scores.append(f1_score(y_fold_val, pred, average='weighted'))

    print('{0} Accuracy: {1: .5f} Precision: {2: .5f} Recall: {3: .5f} F1_Score: {4: .5f}'.format(
        model.__class__.__name__,
        np.mean(accuracy_scores),
        np.mean(precision_scores),
        np.mean(recall_scores),
        np.mean(f1_scores)))

    return -np.mean(f1_scores)

In [None]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials, STATUS_OK

# Search space for the no-PCA XGBoost model. `n_estimators` and `seed` are
# fixed values; everything else is sampled by hyperopt.
xgboost_search_space = {'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

# Use the last of five stratified folds as a fixed tuning/validation split.
# Dedicated names are used so the notebook-level X_train / X_test are not
# overwritten by a fold subset.
skf = StratifiedKFold(n_splits=5)
tune_train_idx, tune_val_idx = list(skf.split(X_train_no_pca, y_train_no_pca_attack_cat))[-1]
X_tune_train, y_tune_train = X_train_no_pca.iloc[tune_train_idx], y_train_no_pca_attack_cat.iloc[tune_train_idx]
X_tune_val, y_tune_val = X_train_no_pca.iloc[tune_val_idx], y_train_no_pca_attack_cat.iloc[tune_val_idx]

def objective(space):
    """Fit one sampled XGBoost configuration and return its hyperopt result.

    The loss is the negated accuracy on the fixed validation split.
    """
    clf=xgb.XGBClassifier(
                    n_estimators=space['n_estimators'],
                    max_depth=int(space['max_depth']),
                    gamma=space['gamma'],
                    reg_alpha=int(space['reg_alpha']),
                    # reg_lambda was sampled but never passed to the model before.
                    reg_lambda=space['reg_lambda'],
                    min_child_weight=int(space['min_child_weight']),
                    # Fraction in [0.5, 1): must stay a float (int() truncated it to 0).
                    colsample_bytree=space['colsample_bytree'],
                    random_state=space['seed'])

    evaluation = [(X_tune_train, y_tune_train), (X_tune_val, y_tune_val)]

    # NOTE(review): xgboost >= 2.0 expects eval_metric / early_stopping_rounds
    # on the constructor rather than in fit() -- confirm the installed version.
    clf.fit(X_tune_train, y_tune_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)

    # predict() already returns class labels; thresholding them at 0.5 would
    # collapse the multiclass labels to booleans.
    pred = clf.predict(X_tune_val)
    accuracy = accuracy_score(y_tune_val, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = xgboost_search_space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials,
                        # Seeded so the search is reproducible across runs.
                        rstate = np.random.default_rng(seed=30))

In [None]:
best_hyperparams

In [None]:
# Refit with the best sampled hyperparameters. `best_hyperparams` only contains
# the hyperopt-sampled entries, so the fixed `n_estimators` and `seed` are read
# from the search space to keep the final model consistent with the tuned one.
# quniform values come back as floats and are cast to int, as in the objective.
optimal_classifier = xgb.XGBClassifier(
    n_estimators=xgboost_search_space['n_estimators'],
    colsample_bytree=best_hyperparams['colsample_bytree'],
    gamma=best_hyperparams['gamma'],
    max_depth=int(best_hyperparams['max_depth']),
    min_child_weight=int(best_hyperparams['min_child_weight']),
    reg_alpha=int(best_hyperparams['reg_alpha']),
    reg_lambda=best_hyperparams['reg_lambda'],
    random_state=xgboost_search_space['seed']
)

optimal_classifier.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [None]:
y_pred = optimal_classifier.predict(X_test_no_pca)

In [None]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

In [None]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

In [None]:
# Number of target classes in the no-PCA training labels. The final estimator
# previously took this from `y_train_pca_attack_cat`, which belongs to the PCA
# experiment and is not the label set this model is trained on.
n_classes_no_pca = len(set(y_train_no_pca_attack_cat))

xgb_model = xgb.XGBClassifier(objective="multi:softprob", num_class=n_classes_no_pca)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

estimators = [('xgb', xgb_model), ('rf', rf_model)]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb.XGBClassifier(objective="multi:softprob", num_class=n_classes_no_pca))

# Split your data into training and validation sets for stacking
X_train, X_val, y_train, y_val = train_test_split(X_train_no_pca, y_train_no_pca_attack_cat, test_size=0.2, random_state=42)

# Train the stacking model on the training data
stacking_model.fit(X_train, y_train)

# Make predictions on the validation data
stacking_pred = stacking_model.predict(X_val)

# Evaluate the stacking model
accuracy = accuracy_score(y_val, stacking_pred)
print("Stacking Model Accuracy on Validation Data: {:.2f}%".format(accuracy * 100))

# Train the stacking model on the full training data
stacking_model.fit(X_train_no_pca, y_train_no_pca_attack_cat)

# Make predictions on the test data
stacking_test_pred = stacking_model.predict(X_test_no_pca)

# Evaluate the stacking model on the test data
test_accuracy = accuracy_score(y_test_no_pca_attack_cat, stacking_test_pred)

print("Stacking Model Accuracy on Test Data: {:.2f}%".format(test_accuracy * 100))

In [None]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, stacking_test_pred)
precision = precision_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, stacking_test_pred, average='weighted')



## LOGISTIC REGRESSION with L1 L2 REGULARIZATION

### WITH PCA

In [None]:
lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, multi_class='multinomial', max_iter=1000)
lr.fit(X_train_pca, y_train_pca_attack_cat)

In [None]:
y_pred = lr.predict(X_test_pca)

In [None]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [None]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

### WITHOUT PCA

### Hyperparameter Tuning

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type (L1 or L2)
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100],  # Inverse of regularization strength
    'solver': ['liblinear', 'lbfgs'],  # Optimization algorithm
    'max_iter': [100, 200, 300]  # Maximum number of iterations for convergence
}

logistic_regression = LogisticRegression()

grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_pca, y_train_pca_attack_cat)
#Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
lr = LogisticRegression(C=best_params['C'], max_iter=best_params['max_iter'], penalty=best_params['penalty'], solver=best_params['solver'])
lr.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [None]:
#lr = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, multi_class='multinomial', max_iter=1000)
#lr.fit(X_train_no_pca, y_train_no_pca_attack_cat)

In [None]:
y_pred = lr.predict(X_test_no_pca)

In [None]:
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

In [None]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

## LIGHT GBM

### WITH PCA

In [None]:
train_data = lgb.Dataset(X_train_pca, label=y_train_pca_attack_cat)

In [None]:
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_pca_attack_cat)),
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

In [None]:
y_pred = lgb_model.predict(X_test_pca, num_iteration=lgb_model.best_iteration).argmax(axis=1)

In [None]:
accuracy = accuracy_score(y_test_pca_attack_cat, y_pred)
precision = precision_score(y_test_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_pca_attack_cat, y_pred, average='weighted')

In [None]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

### WITHOUT PCA

### Hyperparameter Tuning

In [None]:
# Tune LightGBM on the no-PCA features; the loss is the negated accuracy on a
# fixed validation split.
# The previous version of this cell could not run: it called `lgb.train()` with
# no arguments, called `.fit` on the result, searched `xgboost_search_space`
# and configured a binary objective for the multiclass `attack_category` target.

# Reference values used to bound the search space below.
SEARCH_PARAMS = {'learning_rate': 0.4,
                 'max_depth': 15,
                 'num_leaves': 20,
                 'feature_fraction': 0.8,
                 'subsample': 0.2}

# Settings shared by every trial (LGBMClassifier constructor arguments).
FIXED_PARAMS={'objective': 'multiclass',
              'boosting_type':'gbdt',
              'n_estimators':300}

# NOTE(review): the ranges are a reviewer's choice derived from SEARCH_PARAMS;
# adjust as needed.
lgbm_search_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, SEARCH_PARAMS['learning_rate']),
    'max_depth': hp.quniform('max_depth', 3, SEARCH_PARAMS['max_depth'], 1),
    'num_leaves': hp.quniform('num_leaves', 8, 64, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
    'subsample': hp.uniform('subsample', SEARCH_PARAMS['subsample'], 1.0),
}

# Use the last of five stratified folds as a fixed tuning/validation split.
skf = StratifiedKFold(n_splits=5)
lgb_train_idx, lgb_val_idx = list(skf.split(X_train_no_pca, y_train_no_pca_attack_cat))[-1]
X_lgb_train, y_lgb_train = X_train_no_pca.iloc[lgb_train_idx], y_train_no_pca_attack_cat.iloc[lgb_train_idx]
X_lgb_val, y_lgb_val = X_train_no_pca.iloc[lgb_val_idx], y_train_no_pca_attack_cat.iloc[lgb_val_idx]

def objective(space):
    """Fit one sampled LightGBM configuration and return its hyperopt result."""
    clf = lgb.LGBMClassifier(
        learning_rate=space['learning_rate'],
        max_depth=int(space['max_depth']),
        num_leaves=int(space['num_leaves']),
        colsample_bytree=space['feature_fraction'],
        subsample=space['subsample'],
        # Row subsampling only takes effect when subsample_freq > 0.
        subsample_freq=1,
        random_state=rs_value,
        **FIXED_PARAMS)

    clf.fit(X_lgb_train, y_lgb_train)

    pred = clf.predict(X_lgb_val)
    accuracy = accuracy_score(y_lgb_val, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = lgbm_search_space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials,
                        # Seeded so the search is reproducible across runs.
                        rstate = np.random.default_rng(seed=30))

In [None]:
train_data = lgb.Dataset(X_train_no_pca, label=y_train_no_pca_attack_cat)

In [None]:
params = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_no_pca_attack_cat)),
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
}

lgb_model = lgb.train(params, train_data, num_boost_round=100)

In [None]:
y_pred = lgb_model.predict(X_test_no_pca, num_iteration=lgb_model.best_iteration).argmax(axis=1)

In [None]:
# `y_pred` comes from `X_test_no_pca`, so it is scored against the no-PCA test
# labels (the PCA label variable was used here before).
accuracy = accuracy_score(y_test_no_pca_attack_cat, y_pred)
precision = precision_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
recall = recall_score(y_test_no_pca_attack_cat, y_pred, average='weighted')
f1 = f1_score(y_test_no_pca_attack_cat, y_pred, average='weighted')

In [None]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))