In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import RFE
import itertools
import time
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import cross_val_score
from tabulate import tabulate
import optuna
from joblib import dump
from joblib import load




#### Data Loading and Initial Exploration


In [2]:
# Define the column names for the dataset
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "class", "difficulty_level"
]

In [3]:
# load Train data
train_data = pd.read_csv('KDDTrain+.txt', header=None, names=columns)
# load Test data
test_data = pd.read_csv('KDDTest+.txt', header=None, names=columns)


In [4]:
# we will use the following features to train our model
train_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [5]:
# we will use the following features to test our model
test_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty_level
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [6]:
# General information and statistics about the train data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 

In [7]:
# describe the train data
train_data.describe(include='object')

Unnamed: 0,protocol_type,service,flag,class
count,125973,125973,125973,125973
unique,3,70,11,23
top,tcp,http,SF,normal
freq,102689,40338,74945,67343


In [8]:
train_data.protocol_type.unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

#### Data Cleaning and Preprocessing

In [9]:
# Checking for missing values and duplicates
missing_values = train_data.isnull().sum()
total = train_data.shape[0]
missing_columns = [col for col in train_data.columns if train_data[col].isnull().sum() > 0]
for col in missing_columns:
    null_count = train_data[col].isnull().sum()
    per = (null_count/total) * 100
    print(f"{col}: {null_count} ({round(per, 3)}%)")

In [10]:
# Checking for duplicate rows
print(f"Number of duplicate rows: {train_data.duplicated().sum()}")

Number of duplicate rows: 0


In [11]:
print('Class distribution Training set:')
print(train_data['class'].value_counts())

Class distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: class, dtype: int64


In [12]:
# Encoding categorical features
def label_encode(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])

label_encode(train_data)
label_encode(test_data)


In [13]:
# why drop this column?
# The column num_outbound_cmds has only one unique value, so it is not useful for our model
train_data.drop(['num_outbound_cmds'], axis=1, inplace=True)
test_data.drop(['num_outbound_cmds'], axis=1, inplace=True)
# drop hot column
train_data.drop(['hot'], axis=1, inplace=True)
test_data.drop(['hot'], axis=1, inplace=True)
# drop difficulty_level column
train_data.drop(['difficulty_level'], axis=1, inplace=True)
test_data.drop(['difficulty_level'], axis=1, inplace=True)

#### Feature Selection and Normalization

In [14]:
# Splitting data into features and target
X_train = train_data.drop(['class'], axis=1)
Y_train = train_data['class']

In [15]:
# What is happening in this code
# Random Forest Classifier is used to rank the importance of features
# RFE is used to select the most important features
# we will use 5 features in our model
# we will use the selected features to train our model
# we will use the selected features to test our model

# Feature selection using Random Forest Classifier
rfc = DecisionTreeClassifier()  # Using Decision Tree for feature selection
rfe = RFE(rfc, n_features_to_select=5)
rfe = rfe.fit(X_train, Y_train)

# Selecting important features
feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), X_train.columns)]
selected_features = [v for i, v in feature_map if i==True]
selected_features

['protocol_type',
 'src_bytes',
 'count',
 'same_srv_rate',
 'dst_host_diff_srv_rate']

 Protocol_type: Protocol used in the connection

 Src_bytes: Number of data bytes transferred from source to destination in single connection

Count: Number of connections to the same destination host as the current connection in the past two seconds

Same_srv_rate: The percentage of connections that were to the same service 

Dst_host_diff_ srv_rate: The percentage of connections that were to different services 

In [16]:
# Train data with selected features
X_train = X_train[selected_features]

In [17]:
# Scaling features why? and what is the purpose of scaling?
# Scaling is used to standardize the range of independent variables or features of the data
# StandardScaler is used to scale the features
# we will use the scaled features to train our model
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
test = scale.fit_transform(test_data)


In [18]:

# Splitting the dataset for training and testing
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, train_size=0.70, random_state=42)

#### Decision Tree Model Training and Tuning

In [19]:

# Training a basic Decision Tree Classifier
# Time to train the model
clfd = DecisionTreeClassifier(criterion ="entropy", max_depth = 4)
start_time = time.time()
clfd.fit(x_train, y_train.values.ravel())
end_time = time.time()
print("Training time: ", end_time-start_time)

Training time:  0.1385810375213623


In [20]:
# Time taken to test the model
start_time = time.time()
y_test_pred = clfd.predict(x_train)
end_time = time.time()
print("Testing time: ", end_time-start_time)

Testing time:  0.017382144927978516


In [21]:

# Hyperparameter Tuning for Decision Tree using Optuna
def objective(trial):
    dt_max_depth = trial.suggest_int('dt_max_depth', 2, 32, log=False)
    dt_max_features = trial.suggest_int('dt_max_features', 2, 5, log=False)
    classifier_obj = DecisionTreeClassifier(max_features=dt_max_features, max_depth=dt_max_depth)
    classifier_obj.fit(x_train, y_train)
    accuracy = classifier_obj.score(x_test, y_test)
    return accuracy


In [22]:
# start the optimization process
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective, n_trials=30)
print(study_dt.best_trial)

[I 2024-02-06 23:37:41,349] A new study created in memory with name: no-name-2bff4663-f9a7-4f37-870f-9aa505c696b4
[I 2024-02-06 23:37:41,451] Trial 0 finished with value: 0.9801809906858594 and parameters: {'dt_max_depth': 26, 'dt_max_features': 3}. Best is trial 0 with value: 0.9801809906858594.
[I 2024-02-06 23:37:41,558] Trial 1 finished with value: 0.979625317527519 and parameters: {'dt_max_depth': 10, 'dt_max_features': 4}. Best is trial 0 with value: 0.9801809906858594.
[I 2024-02-06 23:37:41,649] Trial 2 finished with value: 0.9804985182049111 and parameters: {'dt_max_depth': 11, 'dt_max_features': 4}. Best is trial 2 with value: 0.9804985182049111.
[I 2024-02-06 23:37:41,724] Trial 3 finished with value: 0.9755503810330228 and parameters: {'dt_max_depth': 10, 'dt_max_features': 3}. Best is trial 2 with value: 0.9804985182049111.
[I 2024-02-06 23:37:41,759] Trial 4 finished with value: 0.9333192209991532 and parameters: {'dt_max_depth': 4, 'dt_max_features': 2}. Best is trial 2 

FrozenTrial(number=12, state=TrialState.COMPLETE, values=[0.9811071126164268], datetime_start=datetime.datetime(2024, 2, 6, 23, 37, 42, 283278), datetime_complete=datetime.datetime(2024, 2, 6, 23, 37, 42, 372570), params={'dt_max_depth': 13, 'dt_max_features': 4}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'dt_max_depth': IntDistribution(high=32, log=False, low=2, step=1), 'dt_max_features': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=12, value=None)


In [23]:

# Training the Decision Tree model with the best parameters
dt = DecisionTreeClassifier(max_features=study_dt.best_trial.params['dt_max_features'], max_depth=study_dt.best_trial.params['dt_max_depth'])
dt.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=13, max_features=4)

#### Model Evaluation

In [24]:
# Model Performance Metrics
dt_train, dt_test = dt.score(x_train, y_train), dt.score(x_test, y_test)
print(f"Train Score: {dt_train}")
print(f"Test Score: {dt_test}")


Train Score: 0.984214286524308
Test Score: 0.9808954276037256


In [25]:

# Cross-Validation why we use cross-validation?
# Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
# The goal of cross-validation is to test the model’s ability to predict new data that was not used in estimating it
# we will use cross-validation to evaluate the performance of our model
# we will use 10 folds for cross-validation
# we will use the accuracy metric to evaluate the performance of our model

scores = cross_val_score(dt, x_train, y_train, cv=10, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores)}")



Cross-Validation Accuracy: 0.9810729946536236


In [26]:
# Confusion Matrix and Classification Report why we use these metrics?
# Confusion matrix is used to evaluate the performance of a classification model
# Classification report is used to measure the quality of predictions from a classification algorithm
from sklearn.metrics import confusion_matrix, classification_report, f1_score

y_pred = dt.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



# If you prefer 'micro' average
f1_micro = f1_score(y_test, y_pred, average='micro')
print(f"F1 Score (Micro): {f1_micro}")

# If you prefer 'weighted' average
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score (Weighted): {f1_weighted}")


[[  273     0     0     0     0     0     0     0     0     0     0     3
      0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0    10
      0     0     0     0     0     0     0     0     0     1     0]
 [    0     0     0     0     0     0     0     0     0     0     1     2
      0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0    16     0     0     0     0     0     0     0     1
      0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     3
      0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0  1113     0     0     0     0     0     3
      0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     4
      0     0     0     1     0     0     0     0     0     0     0]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Summary Table

In [27]:

# Creating a summary table of model performance
data = [["Decision Tree", dt_train, dt_test, np.mean(scores), f1_weighted]]
col_names = ["Model", "Train Score", "Test Score", "CV Accuracy", "F1 Score"]
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

╒═══════════════╤═══════════════╤══════════════╤═══════════════╤════════════╕
│ Model         │   Train Score │   Test Score │   CV Accuracy │   F1 Score │
╞═══════════════╪═══════════════╪══════════════╪═══════════════╪════════════╡
│ Decision Tree │      0.984214 │     0.980895 │      0.981073 │   0.978618 │
╘═══════════════╧═══════════════╧══════════════╧═══════════════╧════════════╛


#### Saving the model

In [28]:
# Save the model to a file
model_filename = 'multiclass_decision_tree_model.joblib'
dump(dt, model_filename)
print(f"Model saved to {model_filename}")

Model saved to multiclass_decision_tree_model.joblib


#### Load the model

In [29]:
# Load the model from the file
loaded_model = load(model_filename)
print("Model loaded successfully")

Model loaded successfully


In [1]:

from scapy.all import IP, TCP, UDP, sniff
import numpy as np
import joblib
from collections import defaultdict, Counter
from queue import Queue
from collections import defaultdict, Counter
from scapy.all import IP, TCP, UDP, sniff
import time


# Initialize a queue for thread-safe communication
packet_info_queue = Queue()

def encode_protocol(packet):
    protocol_mapping = {'icmp': 0, 'tcp': 1, 'udp': 2}
    if IP in packet:
        if packet[IP].proto == 1:
            return protocol_mapping['icmp']
        elif packet[IP].proto == 6:
            return protocol_mapping['tcp']
        elif packet[IP].proto == 17:
            return protocol_mapping['udp']
    return -1

class ConnectionTracker:
    def __init__(self):
        self.connections = defaultdict(lambda: {
            'src_bytes': 0,
            'dst_bytes': 0,
            'dst_host_srv_count': 0,
            'dst_host_same_srv_rate': 0,
            'count': 0,
            'dst_host_diff_srv_rate': 0,
            'timestamps': [],
            'services': set()
        })
        self.dst_host_counts = Counter()
        self.service_counts = defaultdict(Counter)

    def update_connection(self, packet):
        if IP in packet and (TCP in packet or UDP in packet):
            src = (packet[IP].src, packet[TCP].sport if TCP in packet else packet[UDP].sport)
            dst = (packet[IP].dst, packet[TCP].dport if TCP in packet else packet[UDP].dport)
            service = packet[IP].dport  # Service is identified by the destination port
            key = src + dst

            if packet[IP].src == src[0]:
                self.connections[key]['src_bytes'] += len(packet)
            else:
                self.connections[key]['dst_bytes'] += len(packet)

            # Update the timestamp list and remove timestamps older than 2 seconds
            current_time = time.time()
            self.connections[key]['timestamps'] = [t for t in self.connections[key]['timestamps'] if current_time - t < 2]
            self.connections[key]['timestamps'].append(current_time)

            # Update the count for connections to the same destination host
            self.connections[key]['count'] = len(self.connections[key]['timestamps'])

            # Update services seen for this destination host
            self.connections[key]['services'].add(service)
            self.service_counts[dst[0]][service] += 1

            # Calculate dst_host_diff_srv_rate
            total_services = sum(self.service_counts[dst[0]].values())
            diff_services = len(self.service_counts[dst[0]])
            self.connections[key]['dst_host_diff_srv_rate'] = diff_services / total_services if total_services > 0 else 0

            # Calculate dst_host_same_srv_rate (assuming it's the rate of the same service)
            same_service_count = self.service_counts[dst[0]][service]
            self.connections[key]['dst_host_same_srv_rate'] = same_service_count / total_services if total_services > 0 else 0

class NetworkTrafficAnalysis:
    def __init__(self, model_path, attack_types, label_mapping):
        self.model = joblib.load(model_path)
        self.attack_types = attack_types
        self.label_mapping = label_mapping
        self.tracker = ConnectionTracker()


    def process_packet(self, packet):
        if IP not in packet:
            return

        protocol_type = encode_protocol(packet)
        self.tracker.update_connection(packet)

        for key, stats in self.tracker.connections.items():
            # Assuming 'count' and 'dst_host_diff_srv_rate' are calculated within update_connection
            count = stats.get('count', 0)
            dst_host_diff_srv_rate = stats.get('dst_host_diff_srv_rate', 0)
            features = np.array([[protocol_type, stats['src_bytes'], count, 
                              stats['dst_host_same_srv_rate'], dst_host_diff_srv_rate]])
            numerical_prediction = self.model.predict(features)[0]
            specific_category = self.label_mapping[numerical_prediction]  # Translate to string label  # Translate to string label
            broader_category = self.attack_types.get(specific_category, "Unknown")
            output = (
            f"'protocol_type'= {protocol_type}, "
            f"'src_bytes' = {stats['src_bytes']}, "
            f"'count'= {count}, "
            f"'same_srv_rate'= {stats['dst_host_same_srv_rate']}, "
            f"'dst_host_diff_srv_rate'= {dst_host_diff_srv_rate}, "
            f"Specific Prediction: {specific_category}, "
            f"Broader Category: {broader_category}"
        )
            print(output)
            packet_info_queue.put(output)

    def start_capture(self):
        sniff(prn=self.process_packet, store=False)
 

if __name__ == "__main__":
    model_path = 'multiclass_decision_tree_model.joblib'
    attack_types = {
        'normal': 'Normal',
        'back': 'DoS',
        'buffer_overflow': 'U2R',
        'ftp_write': 'R2L',
        'guess_passwd': 'R2L',
        'imap': 'R2L',
        'ipsweep': 'Probe',
        'land': 'DoS',
        'loadmodule': 'U2R',
        'multihop': 'R2L',
        'neptune': 'DoS',
        'nmap': 'Probe',
        'perl': 'U2R',
        'phf': 'R2L',
        'pod': 'DoS',
        'portsweep': 'Probe',
        'rootkit': 'U2R',
        'satan': 'Probe',
        'smurf': 'DoS',
        'spy': 'R2L',
        'teardrop': 'DoS',
        'warezclient': 'R2L',
        'warezmaster': 'R2L'}
    label_mapping = {
        0: 'normal',
        1: 'back',
        2: 'buffer_overflow',
        3: 'ftp_write',
        4: 'guess_passwd',
        5: 'imap',
        6: 'ipsweep',
        7: 'land',
        8: 'loadmodule',
        9: 'multihop',
        10: 'neptune',
        11: 'nmap',
        12: 'perl',
        13: 'phf',
        14: 'pod',
        15: 'portsweep',
        16: 'rootkit',
        17: 'satan',
        18: 'smurf',
        19: 'spy',
        20: 'teardrop',
        21: 'warezclient',
        22: 'warezmaster'
    }
   
    
    analysis_system = NetworkTrafficAnalysis(model_path, attack_types,label_mapping)
    analysis_system.start_capture()




'protocol_type'= 2, 'src_bytes' = 205, 'count'= 1, 'same_srv_rate'= 1.0, 'dst_host_diff_srv_rate'= 1.0, Specific Prediction: portsweep, Broader Category: Probe
'protocol_type'= 2, 'src_bytes' = 205, 'count'= 1, 'same_srv_rate'= 1.0, 'dst_host_diff_srv_rate'= 1.0, Specific Prediction: portsweep, Broader Category: Probe
'protocol_type'= 2, 'src_bytes' = 112, 'count'= 1, 'same_srv_rate'= 1.0, 'dst_host_diff_srv_rate'= 1.0, Specific Prediction: portsweep, Broader Category: Probe
'protocol_type'= 2, 'src_bytes' = 205, 'count'= 1, 'same_srv_rate'= 1.0, 'dst_host_diff_srv_rate'= 1.0, Specific Prediction: portsweep, Broader Category: Probe
'protocol_type'= 2, 'src_bytes' = 732, 'count'= 2, 'same_srv_rate'= 1.0, 'dst_host_diff_srv_rate'= 0.5, Specific Prediction: portsweep, Broader Category: Probe
'protocol_type'= 2, 'src_bytes' = 205, 'count'= 1, 'same_srv_rate'= 1.0, 'dst_host_diff_srv_rate'= 1.0, Specific Prediction: portsweep, Broader Category: Probe
'protocol_type'= 2, 'src_bytes' = 1371, 