### Loading Required Libraries & Functions

In [7]:
# Importing useful libraries
import numpy as np
import pandas as pd
import pickle
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score

In [8]:
# Defining util Functions
def conv_array(df):
    """Split a labelled frame into features, string labels and integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Class' column next to the feature columns.

    Returns
    -------
    x : numpy.ndarray
        Values of every column except 'Class'.
    y : numpy.ndarray
        The 'Class' labels exactly as stored in the frame.
    y0 : numpy.ndarray of int8
        Integer codes: normal=0, dos=1, r2l=2, u2r=3, probe=4.

    Raises
    ------
    ValueError
        If 'Class' holds a label outside the five known classes. Such labels
        used to be encoded silently as 1 (the code for 'dos').
    """
    class_codes = {'normal': 0, 'dos': 1, 'r2l': 2, 'u2r': 3, 'probe': 4}

    x = df.drop('Class', axis=1).values
    y = df['Class'].values

    # key=str keeps the sort well-defined even if the labels mix types (e.g. NaN).
    unknown = sorted({label for label in y if label not in class_codes}, key=str)
    if unknown:
        raise ValueError("Unknown class label(s): {}".format(unknown))

    y0 = np.array([class_codes[label] for label in y], dtype=np.int8)
    return x, y, y0

# Function for saving trained models
def save_model(model, filename="model.sav"):
    """Pickle `model` to `filename` and print where it was written.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted estimator).
    filename : str
        Destination path; defaults to "model.sav".
    """
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    print("Model has been saved at: ", filename)

### Loading and Cleaning Dataset

In [None]:
# Reading the data from CSV files using Pandas

# Forward slashes resolve on Windows, Linux and macOS alike; backslash
# separators only work on Windows.
training_set_path = "Dataset/KDDTrain.csv"
# NOTE(review): unlike the training path this one has no ".csv" extension -
# confirm that the file on disk is really named "KDDTest".
test_set_path = "Dataset/KDDTest"

# header=None: the first line of each file is read as data, not as column names.
training_df = pd.read_csv(training_set_path, header=None)
testing_df = pd.read_csv(test_set_path, header=None)

print("Training set has {} rows.".format(len(training_df)))
print("Testing set has {} rows.".format(len(testing_df)))

In [None]:
# Attach column names to both frames (the CSV files were read without a header).
# The last two names, 'outcome' and 'difficulty', are dropped again in a later cell.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome', 'difficulty',
]
print(f"we have {len(columns)} feature")
training_df.columns = testing_df.columns = columns

In [5]:
# Attack names grouped by the general attack type they belong to
dos_attacks = [
    "snmpgetattack", "back", "land", "neptune", "smurf", "teardrop",
    "pod", "apache2", "udpstorm", "processtable", "mailbomb",
]
r2l_attacks = [
    "snmpguess", "worm", "httptunnel", "named", "xlock", "xsnoop",
    "sendmail", "ftp_write", "guess_passwd", "imap", "multihop", "phf",
    "spy", "warezclient", "warezmaster",
]
u2r_attacks = [
    "sqlattack", "buffer_overflow", "loadmodule", "perl", "rootkit",
    "xterm", "ps",
]
probe_attacks = ["ipsweep", "nmap", "portsweep", "satan", "saint", "mscan"]


def label_attack(row):
    """Map a row's raw "outcome" value to one of the five class names.

    Returns "dos", "r2l", "u2r" or "probe" when the outcome appears in the
    matching attack list (checked in that order), and "normal" for every
    other value.
    """
    outcome = row["outcome"]
    # Built per call so the current module-level lists are always consulted.
    groups = (
        ("dos", dos_attacks),
        ("r2l", r2l_attacks),
        ("u2r", u2r_attacks),
        ("probe", probe_attacks),
    )
    for class_name, attack_names in groups:
        if outcome in attack_names:
            return class_name
    return "normal"


# We combine the datasets temporarily to do the labeling
training_samples_length = len(training_df)
test_samples_length = len(testing_df)
df = pd.concat([training_df, testing_df])
df["Class"] = df.apply(label_attack, axis=1)

# The raw outcome field is replaced by the Class field; the difficulty field
# is dropped as well.
df = df.drop(["outcome", "difficulty"], axis=1)

# Split back into training and test sets.
# - Splitting at the training length (instead of slicing with
#   -test_samples_length) stays correct when the test set is empty, where
#   iloc[:-0] would return no training rows at all.
# - .copy() gives each split its own data: later cells assign columns on these
#   frames, which must not act on slices of `df`.
training_df = df.iloc[:training_samples_length, :].copy()
testing_df = df.iloc[training_samples_length:, :].copy()

In [None]:
# Training Dataset
# Preview of the first rows after labelling (shown as the cell output).
training_df.head()

In [7]:
# Helper function for scaling continuous values
def minmax_scale_values(training_df, testing_df, col_name):
    """Min-max scale column `col_name` of both frames, in place.

    The scaler is fitted on the training column only; the same fitted scaler
    is then applied to the testing column. Both columns are overwritten with
    their scaled values and nothing is returned.
    """
    def as_single_feature(frame):
        # The scaler expects a 2-D array: one row per sample, one column.
        return frame[col_name].values.reshape(-1, 1)

    scaler = MinMaxScaler()
    training_df[col_name] = scaler.fit_transform(as_single_feature(training_df))
    testing_df[col_name] = scaler.transform(as_single_feature(testing_df))
    
    
#Helper function for one hot encoding
def encode_text(training_df, testing_df, name):
    """One-hot encode column `name` of both frames, in place.

    One column "<name>_<category>" is added per category found in the
    training frame. The testing frame receives the same set of columns: a
    category that never occurs in the testing data becomes an all-zero
    column, and a category that occurs only in the testing data gets no
    column. The original `name` column is dropped from both frames.

    Parameters
    ----------
    training_df, testing_df : pandas.DataFrame
        Frames to modify; both must contain column `name`.
    name : str
        Column to encode.
    """
    training_set_dummies = pd.get_dummies(training_df[name])
    testing_set_dummies = pd.get_dummies(testing_df[name])
    for category in training_set_dummies.columns:
        dummy_name = "{}_{}".format(name, category)
        training_df[dummy_name] = training_set_dummies[category]
        if category in testing_set_dummies.columns:
            testing_df[dummy_name] = testing_set_dummies[category]
        else:
            # Use the dtype of the real dummy columns for the all-zero filler.
            # A float64 filler would make the frame's dtypes inconsistent
            # (and can turn `.values` into an object array).
            testing_df[dummy_name] = pd.Series(
                0,
                index=testing_df.index,
                dtype=training_set_dummies[category].dtype,
            )
    training_df.drop(name, axis=1, inplace=True)
    testing_df.drop(name, axis=1, inplace=True)
    
    
# Symbolic (text) columns are one-hot encoded; every other column except the
# label is min-max scaled. Both helpers modify the frames in place, so this
# cell is meant to run once per freshly labelled pair of frames.
sympolic_columns = ["protocol_type", "service", "flag"]
label_column = "Class"
for column in df.columns:
    if column in sympolic_columns:
        encode_text(training_df, testing_df, column)
        continue
    if column != label_column:
        minmax_scale_values(training_df, testing_df, column)

In [None]:
# Training Dataset after one-hot encoding
# Preview of the first rows once the symbolic columns are one-hot encoded and
# the remaining feature columns are min-max scaled.
training_df.head()

In [None]:
# Number of training rows per class label.
training_df.Class.value_counts()

In [None]:
# Creating final dataset
# Rows from position 101000 onwards become the validation set; the rows before
# that stay in the training set. Both get a fresh 0..n-1 index.
split_row = 101000
validation_df = training_df.iloc[split_row:].reset_index(drop=True)
training_df = training_df.iloc[:split_row].reset_index(drop=True)

print(validation_df)

In [None]:
# Convert each split into (features, string labels, integer codes).
x_train, y_train, y0_train = conv_array(training_df)
x_valid, y_valid, y0_valid = conv_array(validation_df)
x_test, y_test, y0_test = conv_array(testing_df)

# Spot-check one sample per split: string label next to its integer code.
print(y_train[0], y0_train[0])
print(y_valid[1], y0_valid[1])
print(y_test[0], y0_test[0])

In [None]:
# Save the prepared test set. The separator must not be a bare backslash here:
# in a normal string literal "\t" is a TAB character, which would corrupt the
# file name. A forward slash works on every platform.
testing_df.to_csv('Dataset/test_prepared.csv', index=False)

## Training Part starts from here

### Random Forest Model

In [None]:
# Loading the model
from sklearn.ensemble import RandomForestClassifier
print('*********************************')
print("now working on the 1st model")

# A fixed seed makes the forest reproducible between runs; 42 is the seed the
# XGBoost model and the meta-model in this notebook already use.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the model on the integer class codes
random_forest_model.fit(x_train, y0_train)
print("Model has been trained.")

In [None]:
# True integer class code of test sample 10.
y0_test[10]

In [None]:
# Class probabilities for test sample 10. reshape(1, -1) builds the single-row
# batch without hard-coding the number of feature columns.
random_forest_model.predict_proba(x_test[10].reshape(1, -1))

In [None]:
# Using model for predictions

# Display names keyed by the integer codes that conv_array assigns
# (normal=0, dos=1, r2l=2, u2r=3, probe=4). The keys must follow that encoding,
# otherwise r2l and u2r predictions are printed under each other's name.
# The mapping has its own name so the built-in `dict` is not rebound.
class_names = {0:"Normal    ", 1:"dos ", 2:"r2l", 3:"u2r", 4:"probe"}

y_pred = random_forest_model.predict(x_test)
print("Prediction | Expected")
print("----------------------")
# Show the first ten predictions (fewer if the test set is smaller).
for predicted, expected in zip(y_pred[:10], y_test[:10]):
    print(class_names[predicted],"|",expected)

In [None]:
# Analysing the model's predictions
# Mean accuracy as reported by the estimator itself.
result = random_forest_model.score(x_test, y0_test)
print(result)

accuracy = accuracy_score(y0_test, y_pred)
# NOTE(review): with average='micro' on a single-label multiclass problem,
# recall, precision and F1 are expected to coincide with accuracy - consider
# 'macro' or 'weighted' if per-class behaviour matters.
recall, precision, f1 = (
    metric(y0_test, y_pred, average='micro')
    for metric in (recall_score, precision_score, f1_score)
)
print("Performance over the testing data set of Random Forest ")
print("Accuracy : {} , Recall : {} , Precision : {} ,       F1 : {}\n".format(accuracy, recall, precision, f1))

In [None]:
# Persist the fitted random forest with the pickle-based save_model helper.
save_model(random_forest_model, "random_forest_model.sav")

### Support Vector Machine Model

In [None]:
from sklearn.svm import SVC

print('*********************************')
print("now working on the 2nd model")
# Loading the model
# probability=True enables predict_proba, which the meta-model needs later.
# random_state fixes the seed (42, as for the other seeded models in this
# notebook) so the fitted probabilities are reproducible between runs.
svm_model = SVC(kernel='linear', probability=True, random_state=42)

In [None]:
# Training the model
svm_model.fit(x_train, y0_train)
print("Model has been trained.")

# True class code of test sample 10. Not displayed here: only the last
# expression of a cell is echoed.
y0_test[10]

# Class probabilities for the same sample. reshape(1, -1) builds the
# single-row batch without hard-coding the number of feature columns.
svm_model.predict_proba(x_test[10].reshape(1, -1))

In [None]:
# Using model for predictions

# Display names keyed by the integer codes that conv_array assigns
# (normal=0, dos=1, r2l=2, u2r=3, probe=4). The keys must follow that encoding,
# otherwise r2l and u2r predictions are printed under each other's name.
# The mapping has its own name so the built-in `dict` is not rebound.
class_names = {0: "Normal    ", 1: "dos ", 2: "r2l", 3: "u2r", 4: "probe"}

y_pred = svm_model.predict(x_test)
print("Prediction | Expected")
print("----------------------")
# Show the first ten predictions (fewer if the test set is smaller).
for predicted, expected in zip(y_pred[:10], y_test[:10]):
    print(class_names[predicted], "|", expected)

In [None]:
# Analysing the model's predictions
# Mean accuracy as reported by the estimator itself.
result = svm_model.score(x_test, y0_test)
print(result)

accuracy = accuracy_score(y0_test, y_pred)
# NOTE(review): with average='micro' on a single-label multiclass problem,
# recall, precision and F1 are expected to coincide with accuracy - consider
# 'macro' or 'weighted' if per-class behaviour matters.
recall, precision, f1 = (
    metric(y0_test, y_pred, average='micro')
    for metric in (recall_score, precision_score, f1_score)
)
print("Performance over the testing data set of the SVM model ")
print("Accuracy : {}, Recall : {}, Precision : {},     F1 : {}\n".format(accuracy, recall, precision, f1))

In [None]:
# Persist the fitted linear SVM with the pickle-based save_model helper.
save_model(svm_model, "Linear_SVM_model.sav")

### XGBoost Model

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# joblib is used further down to persist the tuned XGBoost model and the meta-model.
import joblib

print('*********************************')
print("now working on the 3rd model")
# Loading the XGBoost model
# Base estimator that the grid search tunes; random_state fixes its seed.
# NOTE(review): use_label_encoder looks deprecated in recent xgboost releases -
# confirm against the installed version.
xgboost_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')


In [None]:
# Hyper-parameter values explored by the grid search
param_grid = dict(
    n_estimators=range(50, 300, 50),   # number of boosting rounds to try
    learning_rate=[0.01, 0.1, 0.2],    # candidate learning rates
    max_depth=[3, 5, 7],               # candidate maximum tree depths
    random_state=[42],                 # single value: keeps the seed fixed
)

# 5-fold cross-validated search scored by micro-averaged F1, fitted on the
# training split (fit returns the search object itself).
grid_search = GridSearchCV(
    estimator=xgboost_model,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=5,
).fit(x_train, y0_train)

# Report the winning parameter combination
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

In [None]:
# Final XGBoost model with the best parameters.
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already trained; fitting it a
# second time on the same data would only repeat that work.
optimized_xgboost_model = grid_search.best_estimator_
print("Optimized model has been trained.")

In [None]:
# Make predictions
# Predicted integer class codes for the test set; y_pred is reused by the
# preview and evaluation cells that follow.
y_pred = optimized_xgboost_model.predict(x_test)

In [None]:
# Display names keyed by the integer codes that conv_array assigns
# (normal=0, dos=1, r2l=2, u2r=3, probe=4). The keys must follow that encoding,
# otherwise r2l and u2r predictions are printed under each other's name.
# The mapping has its own name so the built-in `dict` is not rebound.
class_names = {0: "Normal    ", 1: "dos ", 2: "r2l", 3: "u2r", 4: "probe"}

print("Prediction | Expected")
print("----------------------")
# Show the first ten predictions (fewer if the test set is smaller).
for predicted, expected in zip(y_pred[:10], y_test[:10]):
    print(class_names[predicted], "|", expected)


In [None]:
# Evaluate the optimized model
accuracy = accuracy_score(y0_test, y_pred)
# NOTE(review): with average='micro' on a single-label multiclass problem,
# recall, precision and F1 are expected to coincide with accuracy - consider
# 'macro' or 'weighted' if per-class behaviour matters.
recall, precision, f1 = (
    metric(y0_test, y_pred, average='micro')
    for metric in (recall_score, precision_score, f1_score)
)

print("Performance over the testing data set  of XGBoost_model \n")
print("Accuracy : {}, Recall : {}, Precision : {}, F1 : {}\n".format(accuracy, recall, precision, f1))


In [None]:
# Save the optimized model
# Written with joblib, whereas the random forest and SVM models are written
# with the pickle-based save_model helper.
joblib.dump(optimized_xgboost_model, "Optimized_XGBoost_model.sav")
print("Optimized XGBoost model saved as Optimized_XGBoost_model.sav")

### RF_meta_model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV

print('*********************************')
print("now working on the metamodel")
# Generate predictions from base models
# Class probabilities on the validation split, which was not used to fit the
# base models.
svm_predictions = svm_model.predict_proba(x_valid)
rfc_predictions = random_forest_model.predict_proba(x_valid)
# NOTE: despite the name, these are the tuned XGBoost model's probabilities.
adaboost_predictions = optimized_xgboost_model.predict_proba(x_valid)

# Combine base model predictions as meta features
# Column order is SVM, random forest, XGBoost; the test-time meta features
# must be stacked in the same order.
meta_features = np.hstack((svm_predictions, rfc_predictions, adaboost_predictions))

In [None]:
# Hyperparameter optimization for the meta-model (Random Forest)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validated search over a seeded random forest, scored by
# accuracy and run on all available cores; fitted on the stacked validation
# probabilities.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(meta_features, y0_valid)

# Best parameters and meta-model
best_params = grid_search.best_params_
print("Best hyperparameters for the meta-model:", best_params)
meta_model = grid_search.best_estimator_


In [None]:
# Base-model class probabilities on the test set, in the same order used when
# the meta-model was trained: SVM, random forest, XGBoost.
test_svm_predictions, test_rfc_predictions, test_adaboost_predictions = (
    base_model.predict_proba(x_test)
    for base_model in (svm_model, random_forest_model, optimized_xgboost_model)
)

# Stack them side by side to form the meta-model's test input
test_meta_features = np.hstack(
    (test_svm_predictions, test_rfc_predictions, test_adaboost_predictions)
)

In [None]:
y_meta_pred = meta_model.predict(test_meta_features)

# Preview the meta-model's predictions
# Display names keyed by the integer codes that conv_array assigns
# (normal=0, dos=1, r2l=2, u2r=3, probe=4). The keys must follow that encoding,
# otherwise r2l and u2r predictions are printed under each other's name.
# The mapping has its own name so the built-in `dict` is not rebound.
class_names = {0: "Normal    ", 1: "dos ", 2: "r2l", 3: "u2r", 4: "probe"}
print("Prediction | Expected")
print("----------------------")
# Show the first ten predictions (fewer if the test set is smaller).
for predicted, expected in zip(y_meta_pred[:10], y_test[:10]):
    print(class_names[predicted], "|", expected)


In [None]:
# Score the meta-model's predictions on the test set
accuracy = accuracy_score(y0_test, y_meta_pred)
# NOTE(review): with average='micro' on a single-label multiclass problem,
# recall, precision and F1 are expected to coincide with accuracy - consider
# 'macro' or 'weighted' if per-class behaviour matters.
recall, precision, f1 = (
    metric(y0_test, y_meta_pred, average='micro')
    for metric in (recall_score, precision_score, f1_score)
)

print("Performance of the meta-model over the testing data set \n")
print("Accuracy : {}, Recall : {}, Precision : {}, F1 : {}\n".format(accuracy, recall, precision, f1))

In [None]:
# Save the meta-model
# joblib is imported in the XGBoost section's first cell, so that cell must
# have run before this one.
joblib.dump(meta_model, "Meta_RF_model_optimized.sav")
print("Optimized meta-model saved as Meta_RF_model_optimized.sav")