# Network Traffic Classification Models

# Dataset Description: 
An extensive **IoT attack** dataset. The data consists of 33 attacks that were executed in an IoT topology composed of 105 devices. These attacks are classified into seven categories: **DDoS, DoS, Recon, Web-based, Brute Force, Spoofing, and Mirai**. All attacks were executed by malicious IoT devices targeting other IoT devices.


In [1]:
# First things first, some general imports:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the data
# Note that the data comes as a set of 169 csv files following the naming convention
# "part-00xxx-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv", where xxx ranges from 000 to 168.
# Only the file names are collected here; the files themselves are read one at a
# time by the cells that follow.

dataset_path = '../data/CICIot2023/'
# Sort so the train/test split below is reproducible regardless of directory order.
df_sets = sorted(k for k in os.listdir(dataset_path) if k.endswith('.csv'))

# 80/20 split by file. Both slices use the SAME boundary so the two sets are
# disjoint; slicing the test set from the 20% mark would overlap the training
# files and leak training data into the evaluation.
split_at = int(len(df_sets) * .8)
training_sets = df_sets[:split_at]
test_sets = df_sets[split_at:]

## First, we will employ a lightweight "screening" classifier which categorizes traffic as either benign or malicious

In [3]:
# Names of the CSV columns used as model input features, in the order they are
# handed to the scaler and the classifiers.
# NOTE(review): 'Magnitue' looks misspelled but presumably mirrors the CSV
# header - confirm before "correcting" it.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
    'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance',
    'Weight',
]

# Name of the CSV column holding the target label.
y_column = 'label'

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Single shared scaler instance; it is fitted on the training CSVs in a later cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler over all training files, one file at a time.
# partial_fit() accumulates the running mean/variance across calls, whereas
# fit() resets the scaler on every call and would leave it fitted on the last
# file only.
for train_set in tqdm(training_sets):
    scaler.partial_fit(pd.read_csv(dataset_path + train_set)[X_columns])

 30%|███████████████████▋                                             | 41/135 [00:28<01:14,  1.26it/s]

In [None]:
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.naive_bayes import GaussianNB
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#from sklearn.svm import SVC
#from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

In [None]:
# Display names, index-aligned with `classifiers` below.
names = [
    "Multinomial Naive Bayes",
    "Bernoulli Naive Bayes",
    "Perceptron",
    "SGDClassifier",
    "PassiveAggressiveClassifier",
]

# All of these estimators support partial_fit(), which is what makes
# out-of-core training possible.
# `tol` was previously 1e3, which looks like a typo for scikit-learn's usual
# 1e-3 stopping tolerance.
classifiers = [
    MultinomialNB(),
    BernoulliNB(),
    Perceptron(tol=1e-3, random_state=0),
    SGDClassifier(),
    PassiveAggressiveClassifier(max_iter=2000, random_state=0, tol=1e-3),
]

# partial_fit() has to be told about EVERY class that will ever appear. Taking
# np.unique() of the first chunk only is not enough: a label that first shows
# up in a later chunk would make partial_fit() fail. So scan just the label
# column of all training files up front.
label_values = set()
for train_set in tqdm(training_sets, desc="Collecting labels"):
    labels = pd.read_csv(dataset_path + train_set, usecols=[y_column])[y_column]
    label_values.update(labels.unique())
all_classes = np.array(sorted(label_values))

# We need to implement incremental learning due to the dataset being *much*
# larger than RAM. Each file is read once, in chunks, and every model is
# updated on each chunk; every model still sees the chunks in the same order
# as if it had been trained in its own pass over the files.
# The features are deliberately left unscaled here: StandardScaler output
# contains negative values, which MultinomialNB rejects.
for train_set in tqdm(training_sets, desc="Training"):
    reader = pd.read_csv(dataset_path + train_set, iterator=True, chunksize=300000)
    for chunk in reader:
        X = chunk[X_columns]
        y = chunk[y_column]
        for model in classifiers:
            # Passing the same full class list on every call is allowed and
            # removes the need for a "first chunk" flag.
            model.partial_fit(X, y, classes=all_classes)

In [None]:
# Ground-truth labels for the whole test set, collected exactly once so that
# they stay the same length as each model's prediction list.
y_test = []
# preds[i] holds the predictions of classifiers[i], in the same row order as y_test.
preds = {i: [] for i in range(len(names))}

# Read every test file once and let all models predict on it, instead of
# re-reading the full test set for each model.
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(dataset_path + test_set)
    # Features are used exactly as during training (unscaled); standardising
    # them only here would feed the models inputs on a different scale from
    # the one they were trained on.
    X_test = d_test[X_columns]
    y_test.extend(d_test[y_column].values)
    for i, model in enumerate(classifiers):
        preds[i].extend(model.predict(X_test))
        

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Print accuracy and macro-averaged recall / precision / F1 for every model.
# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# silently exchanges precision and recall.
for k, y_pred in preds.items():
    print(f"##### {names[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_test, y_pred))
    print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
    # Three blank lines between model reports.
    print()
    print()
    print()

In [None]:
# Basic Ensemble technique: Max Voting
from sklearn.ensemble import VotingClassifier

# NOTE: VotingClassifier offers no partial_fit(), so it cannot be trained
# out-of-core like the base models. The object is kept so the name `ensemble1`
# stays defined, but it is NOT fitted; calling predict() on it requires a
# regular fit() on data that fits in memory.
ensemble1 = VotingClassifier(
    estimators=[
        ('MNB', classifiers[0]),
        ('BNB', classifiers[1]),
        ('Per', classifiers[2]),
        ('SGDC', classifiers[3]),
        ('PAC', classifiers[4]),
    ],
    voting='hard',
)

# Train Ensemble Model
# Hard voting has nothing of its own to learn: it only combines the predictions
# of the base models, which were already trained incrementally above. The
# previous loop here called partial_fit() on the leftover `model` variable
# rather than on the ensemble, so it has been replaced by an explicit majority
# vote over the trained classifiers.


def max_vote_predict(models, X):
    """Return the majority-vote label for each row of X.

    Parameters
    ----------
    models : sequence of fitted estimators exposing ``predict``.
    X : feature table accepted by every estimator's ``predict``.

    Returns
    -------
    numpy.ndarray with one label per row of X. Ties go to the label that
    sorts first, matching VotingClassifier's documented tie-breaking.

    Example: ``max_vote_predict(classifiers, d_test[X_columns])``.
    """
    # One column of predictions per model: shape (n_rows, n_models).
    stacked = np.stack([est.predict(X) for est in models], axis=1)
    n_rows = stacked.shape[0]
    if n_rows == 0:
        return stacked[:, 0]

    # Encode labels as integer codes (sorted order) so votes can be tallied.
    labels, codes = np.unique(stacked.ravel(), return_inverse=True)
    codes = codes.reshape(stacked.shape)

    counts = np.zeros((n_rows, labels.size), dtype=np.intp)
    rows = np.arange(n_rows)
    for model_codes in codes.T:
        # Row indices are unique within one model's column, so += is safe.
        counts[rows, model_codes] += 1

    # argmax returns the first maximum, i.e. the lowest-sorted label on a tie.
    return labels[counts.argmax(axis=1)]

# Test Ensemble Model
