<a href="https://colab.research.google.com/github/Maher1410/tteesstt/blob/main/VC_IDS_Voting_Classifier_Intrusion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# download the 'hassan06/nslkdd' dataset; the value returned by
# dataset_download() is kept in hassan06_nslkdd_path
# (per the kagglehub docs this is the local directory holding the files)
hassan06_nslkdd_path = kagglehub.dataset_download('hassan06/nslkdd')

print('Data source import complete.')


In [None]:
# importing required libraries
import numpy as np
import pandas as pd
import shutil
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout,Conv2D, MaxPooling2D,BatchNormalization, Flatten,LSTM, Bidirectional,GRU
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras import backend as K
import time
from tqdm import tqdm

In [None]:
# dataset doesn't have column names, so we have to provide it:
# 41 feature names followed by the attack 'label' and the 'difficulty_level'
# (43 names in total, in the order the columns are read from the file)
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty_level"]

In [None]:
# importing dataset
import os

# Prefer the directory returned by kagglehub (first cell) so the notebook also
# runs outside Kaggle, e.g. on Colab; fall back to Kaggle's mounted input path
# when that cell was not run.
dataset_dir = globals().get('hassan06_nslkdd_path', '/kaggle/input/nslkdd')
# the file has no header row, so the column names are taken from col_names
data = pd.read_csv(os.path.join(dataset_dir, 'KDDTrain+.txt'), header=None, names=col_names)

In [None]:
# display the loaded dataset (the notebook renders the cell's last expression)
data

In [None]:
# discard the 'difficulty_level' column in place, then show (rows, columns)
data.drop(columns=['difficulty_level'], inplace=True)
data.shape

In [None]:
# descriptive statistics (count, mean, std, min, quartiles, max) of the dataset
data.describe()

In [None]:
# number of records for each raw value of the 'label' column
data['label'].value_counts()

In [None]:
# changing attack labels to their respective attack class
def change_label(df):
    """Replace each raw attack name in ``df['label']`` with its attack class.

    The 'label' column of ``df`` is overwritten on the frame that is passed in;
    nothing is returned. Values that are not listed below (for example
    'normal') are left unchanged.
    """
    attack_classes = {
        'Dos': ['apache2','back','land','neptune','mailbomb','pod','processtable',
                'smurf','teardrop','udpstorm','worm'],
        'R2L': ['ftp_write','guess_passwd','httptunnel','imap','multihop','named',
                'phf','sendmail','snmpgetattack','snmpguess','spy','warezclient',
                'warezmaster','xlock','xsnoop'],
        'Probe': ['ipsweep','mscan','nmap','portsweep','saint','satan'],
        'U2R': ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],
    }
    # flatten to {raw attack name: attack class} so one replace() call does all four groups
    label_to_class = {
        attack: attack_class
        for attack_class, attacks in attack_classes.items()
        for attack in attacks
    }
    # Assign the result back instead of calling replace(inplace=True) on
    # df.label: the chained in-place form acts on a temporary Series, which
    # pandas warns about and which does not update df under Copy-on-Write.
    df['label'] = df['label'].replace(label_to_class)

In [None]:
# calling change_label() function; it rewrites data['label'] and returns nothing
change_label(data)

In [None]:
# distribution of attack classes (record count per value of 'label' after relabelling)
data.label.value_counts()

## Data Normalization


In [None]:
# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [None]:
# names of all numeric-dtype columns in data; the string-valued columns
# (protocol_type, service, flag, label) are therefore not included
numeric_col = data.select_dtypes(include='number').columns

In [None]:
# one StandardScaler instance shared by normalization(); it is re-fitted on
# every column, so each column is standardized independently
std_scaler = StandardScaler()
def normalization(df,col):
    """Standardize the columns of ``df`` named in ``col`` and return ``df``.

    The columns are overwritten on the frame that is passed in, so pass a copy
    if the original values must be kept.
    """
    for name in col:
        # the scaler expects a 2-D input: one column, one row per record
        column_2d = np.array(df[name]).reshape(-1, 1)
        df[name] = std_scaler.fit_transform(column_2d)
    return df

In [None]:
# first rows of the data before normalization, for comparison with the scaled values
data.head()

In [None]:
# calling the normalization() function on a copy (it overwrites the columns of
# the frame it receives) and rebinding data to the standardized result
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test rows influence the scaling statistics — confirm this is intended
data = normalization(data.copy(),numeric_col)

In [None]:
# first rows of the data after normalization (numeric columns are now standardized)
data.head()

## One-hot-encoding

In [None]:
# selecting categorical data attributes (the three feature columns that get one-hot-encoded)
cat_col = ['protocol_type','service','flag']

In [None]:
# creating a dataframe with only categorical attributes (same row index as data)
categorical = data[cat_col]
categorical.head()

In [None]:
# one-hot-encoding categorical attributes using pandas.get_dummies() function;
# each column in cat_col is replaced by one indicator column per distinct value
categorical = pd.get_dummies(categorical,columns=cat_col)
categorical.head()

## Binary Classification

In [None]:
# collapsing the labels into two categories: 'normal' is kept and every other
# value becomes 'abnormal'; the result is a one-column dataframe named 'label'
bin_label = data['label'].map(
    lambda lbl: 'normal' if lbl == 'normal' else 'abnormal'
).to_frame()

In [None]:
# creating a dataframe with binary labels (normal,abnormal):
# copy the normalized data, then overwrite 'label' with the binary labels
# (bin_label is a one-column dataframe sharing data's row index)
bin_data = data.copy()
bin_data['label'] = bin_label

In [None]:
# label encoding (0,1) binary labels (abnormal,normal):
# LabelEncoder numbers the classes in sorted order, so 'abnormal' -> 0 and 'normal' -> 1
le1 = preprocessing.LabelEncoder()
# apply() runs fit_transform on the single 'label' column of bin_label
enc_label = bin_label.apply(le1.fit_transform)
bin_data['intrusion'] = enc_label

In [None]:
# fitted classes in encoded order: position i holds the label that was encoded as i
le1.classes_

In [None]:
# dataset with binary labels ('label') and label encoded column ('intrusion')
bin_data.head()

In [None]:
# one-hot-encoding attack label; the empty prefix and separator make the new
# indicator columns carry the bare label values as names ('abnormal', 'normal')
bin_data = pd.get_dummies(bin_data,columns=['label'],prefix="",prefix_sep="")
# get_dummies replaced the 'label' column, so the binary labels are added back
bin_data['label'] = bin_label
bin_data

In [None]:
# importing library for plotting
import matplotlib.pyplot as plt

In [None]:
import matplotlib.pyplot as plt

# Count the rows per binary label; value_counts() orders the labels by
# descending frequency, and the bar colours follow that order
label_counts = bin_data['label'].value_counts()

fig, ax = plt.subplots(figsize=(8,6))
ax.bar(label_counts.index, label_counts.values, color=['blue', 'red'])
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_title('Bar chart distribution of normal and abnormal labels')
plt.show()

## Feature Selection

In [None]:
# creating a dataframe with only numeric attributes of binary class dataset and encoded label attribute
# .copy() makes numeric_bin an independent frame, so adding the 'intrusion'
# column below is not an assignment on a slice of bin_data
# (which triggers SettingWithCopyWarning)
numeric_bin = bin_data[numeric_col].copy()
numeric_bin['intrusion'] = bin_data['intrusion']

In [None]:
# Pearson correlation matrix of the numeric attributes plus 'intrusion'
corr = numeric_bin.corr()
# absolute correlation of every attribute with the encoded label
corr_y = corr['intrusion'].abs()
# keep attributes whose |correlation| exceeds 0.5
# ('intrusion' itself is part of the result, with correlation 1)
highest_corr = corr_y.loc[corr_y > 0.5]
highest_corr.sort_values()

In [None]:
# selecting attributes found by using pearson correlation coefficient
# NOTE(review): the nine names are hard-coded, presumably copied from the
# output of the correlation cell — re-check them if the preprocessing changes
numeric_bin = bin_data[['count','srv_serror_rate','serror_rate','dst_host_serror_rate','dst_host_srv_serror_rate',
                         'logged_in','dst_host_same_srv_rate','dst_host_srv_count','same_srv_rate']]

In [None]:
# joining the selected attribute with the one-hot-encoded categorical dataframe
# (DataFrame.join aligns the two frames on their row index)
numeric_bin = numeric_bin.join(categorical)
# then joining encoded, one-hot-encoded, and original attack label attribute;
# these four target-related columns end up as the last columns of bin_data
bin_data = numeric_bin.join(bin_data[['intrusion','abnormal','normal','label']])

In [None]:
# saving final dataset to disk
import os

output_path = "/kaggle/working//bin_data.csv"
# /kaggle/working exists on Kaggle only; create it so that to_csv() does not
# fail with "non-existent directory" in other environments such as Colab
os.makedirs(os.path.dirname(output_path), exist_ok=True)
bin_data.to_csv(output_path)
# final dataset for binary classification
bin_data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Feature matrix: every column except the target-related ones (encoded,
# one-hot-encoded and original label). Dropping them by name avoids a
# hard-coded column count that silently breaks when the number of one-hot
# columns changes; dtype=float yields a plain numeric array even though the
# frame mixes float and indicator columns.
target_cols = ['intrusion', 'abnormal', 'normal', 'label']
X = bin_data.drop(columns=target_cols).to_numpy(dtype=float)
Y = bin_data['intrusion'] # target attribute

# splitting the dataset 75% for training and 25% testing
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
from mlxtend.plotting import plot_confusion_matrix
# [display name, unfitted estimator] pairs that are trained and evaluated in turn
classifiers = [
               ['LogisticRegression :', LogisticRegression(max_iter = 1000)],
               ['ExtraTreesClassifier :', ExtraTreesClassifier()],
               ['DecisionTree :',DecisionTreeClassifier()],
               ['RandomForest :',RandomForestClassifier()],
               ['Naive Bayes :', GaussianNB()],
               ['KNeighbours :', KNeighborsClassifier()],
               ['SVM :', SVC()]
]

# true labels of the test rows, followed by one prediction column per classifier
predictions_df = pd.DataFrame({'intrusion': y_test})

for name, classifier in classifiers:
    # y_train is passed as-is (a 1-D Series); Series.ravel() is deprecated in
    # recent pandas and scikit-learn does not need it
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    # column name is the display name without the trailing " :"
    predictions_df[name.strip(" :")] = predictions
    print(name, accuracy_score(y_test, predictions))
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    print(classification_report(y_test, predictions))


In [None]:
from sklearn.ensemble import VotingClassifier
# base estimators of the ensemble: decision tree, random forest, extra trees
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
# soft voting: the ensemble predicts from the averaged class probabilities of the base estimators
eclf1 = VotingClassifier(estimators=[('DT', clf1), ('RF', clf2), ('ET', clf3)], voting='soft')
eclf1.fit(X_train, y_train)
# soft-voting predictions for the test split, reused by the evaluation cell
predictions = eclf1.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# accuracy of the soft-voting ensemble on the test split
vot_acc = accuracy_score(y_test, predictions)
print("Accuracy of Voting Classifier is : ", "{:.2f}%".format(100*vot_acc ))
# plotting the confusion matrix
from mlxtend.plotting import plot_confusion_matrix
cm = confusion_matrix(y_test, predictions)
print(classification_report(y_test, predictions))
# plot_confusion_matrix() creates its own figure, so no plt.figure() call is
# made beforehand (it would only leave an empty extra figure in the output)
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True,cmap=plt.cm.Blues)
plt.title("Voting Classifier Model - Confusion Matrix")
# 'intrusion' encodes 'abnormal' as 0 and 'normal' as 1, and the confusion
# matrix rows/columns follow that order: index 0 is the attack class
plt.xticks(range(2), ["Attack","No Attack"], fontsize=12)
plt.yticks(range(2), ["Attack","No Attack"], fontsize=12)
plt.show()

In [None]:
# hard voting: the ensemble predicts by majority vote over the base estimators' class labels
eclf2 = VotingClassifier(estimators=[('DT', clf1), ('RF', clf2), ('ET', clf3)], voting='hard')
eclf2.fit(X_train, y_train)
predictions1 = eclf2.predict(X_test)
# report the hard-voting predictions (predictions1), not the soft-voting
# `predictions` left over from the earlier cell
print(classification_report(y_test, predictions1))

In [None]:
# accuracy of the hard-voting ensemble on the test split
vot_acc = accuracy_score(y_test, predictions1)
print("Accuracy of Voting Classifier is : ", "{:.2f}%".format(100*vot_acc ))
# plotting the confusion matrix
from mlxtend.plotting import plot_confusion_matrix
cm = confusion_matrix(y_test, predictions1)
print(classification_report(y_test, predictions1))
# plot_confusion_matrix() creates its own figure, so no plt.figure() call is
# made beforehand (it would only leave an empty extra figure in the output)
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True,cmap=plt.cm.Blues)
plt.title("Voting Classifier Model - Confusion Matrix")
# 'intrusion' encodes 'abnormal' as 0 and 'normal' as 1, and the confusion
# matrix rows/columns follow that order: index 0 is the attack class
plt.xticks(range(2), ["Attack","No Attack"], fontsize=12)
plt.yticks(range(2), ["Attack","No Attack"], fontsize=12)
plt.show()

In [None]:
!pip install lime
!pip install shap

In [None]:
import pandas as pd
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import shap
from lime import lime_tabular
import numpy as np

# X_train and X_test are numpy arrays built from the leading columns of
# bin_data, so those column names are the feature names; using them makes the
# LIME output readable instead of showing integer positions
feature_names = [str(c) for c in bin_data.columns[:X_train.shape[1]]]
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)

# Ensure all data is numeric (values that cannot be parsed become NaN)
X_train_df = X_train_df.apply(pd.to_numeric, errors='coerce')
X_test_df = X_test_df.apply(pd.to_numeric, errors='coerce')

# Convert back to numpy arrays; dtype=float guarantees a plain numeric array
# even when some columns are boolean indicators
X_train = X_train_df.to_numpy(dtype=float)
X_test = X_test_df.to_numpy(dtype=float)

# Define classifiers and VotingClassifier (soft voting so predict_proba is available)
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
eclf1 = VotingClassifier(estimators=[('DT', clf1), ('RF', clf2), ('ET', clf3)], voting='soft')

# Fit the VotingClassifier
eclf1.fit(X_train, y_train)

# Make predictions
predictions = eclf1.predict(X_test)
print(classification_report(y_test, predictions))

# LIME explanations; class_names follow the label encoding (0 = abnormal, 1 = normal)
explainer_lime = lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=feature_names,
    class_names=['abnormal', 'normal'],
    mode='classification'
)

# explain the ensemble's prediction for the first test row
exp = explainer_lime.explain_instance(
    data_row=X_test[0],
    predict_fn=eclf1.predict_proba
)

exp.show_in_notebook(show_table=True, show_all=False)
