# Centralised Learning and Federated Learning on the CICIoT2023 dataset

This notebook extends on the functionality of the CICIoT2023 example notebook, to account for improvement to the centralised training of all data instances.

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm
import warnings
#warnings.filterwarnings('ignore')


In [3]:
DATASET_DIRECTORY = 'datasets/'

Include the defines for the dataframe columns and the attack labels and their mappings

In [4]:
from includes import X_columns, y_column, dict_34_classes, dict_8_classes, dict_7_classes, dict_2_classes

In [5]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()

# # Create the training and test sets
training_sets = df_sets[:int(len(df_sets)*.8)]
test_sets = df_sets[int(len(df_sets)*.8):]

# TODO - REMOVE THIS - Works on 20% of the data for low memory machines
# Create the training and test sets - LOW MEMORY CLUDGE FOR JON
# training_sets = df_sets[:int(len(df_sets)*.2)]
# test_sets = df_sets[int(len(df_sets)*.8):]

---
# TEMP CODE

In [6]:
# Set training_sets to the last entry of training_sets
training_sets = training_sets[-1:]
print(f"HACK TO REPLICATE ORIGINAL AUTHORS CODE WITH ONE FILE TRAIN - {training_sets}")

HACK TO REPLICATE ORIGINAL AUTHORS CODE WITH ONE FILE TRAIN - ['part-00136-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']


Remove this if you have more than a morsel of memory

---

# Create a new DataFrame that consists of all CSV datA

This is **memory intensive** as it will create a DataFrame with 36 million rows.

In [7]:
# Depreciated method
# df = []

# count = 0
# for train_set in tqdm(training_sets):
#     if count == 0:
#         df = pd.read_csv(DATASET_DIRECTORY + train_set)
#     else:
#         df_new = pd.read_csv(DATASET_DIRECTORY + train_set)
#         df = df.append(df_new, ignore_index=True)
#     count = count + 1

In [8]:
# New faster method not using depreciated pandas append
dfs = []
for train_set in tqdm(training_sets):
    df_new = pd.read_csv(DATASET_DIRECTORY + train_set)
    dfs.append(df_new)
df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


In [9]:
df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.025926,56.06,6.00,63.82,3.528480,3.528480,0.0,0.0,1.0,0.0,...,0.496995,54.20,8.298108e+07,9.5,10.405144,0.704371,2.498950,0.10,141.55,DoS-SYN_Flood
1,0.000000,54.00,6.00,64.00,26.996367,26.996367,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308982e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-SYN_Flood
2,0.000000,54.00,6.00,64.00,2.986424,2.986424,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334383e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-RSTFINFlood
3,0.000000,53.46,5.94,63.36,66.531372,66.531372,0.0,0.0,1.0,0.0,...,0.131850,54.06,8.309404e+07,9.5,10.395650,0.186918,0.195153,0.09,141.55,DDoS-SYN_Flood
4,0.000000,81.00,6.00,64.00,10.473893,10.473893,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.292583e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DoS-TCP_Flood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444699,0.366179,35971.00,17.00,64.00,2904.981161,2904.981161,0.0,0.0,0.0,0.0,...,0.000000,50.00,8.309761e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,DDoS-UDP_Flood
444700,0.000000,0.00,1.00,64.00,2.156362,2.156362,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.312488e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,DDoS-ICMP_Flood
444701,0.000000,54.00,6.00,64.00,18.178045,18.178045,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.306730e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-TCP_Flood
444702,1.937804,1935410.08,17.00,64.00,1762.710435,1762.710435,0.0,0.0,0.0,0.0,...,0.000000,554.00,8.378930e+07,9.5,33.286634,0.000000,0.000000,0.00,141.55,Mirai-udpplain


## Map the y labels to integers

In [10]:
# Map y column to the dict_34_classes values
df['label'] = df['label'].map(dict_34_classes)

# Save this output to a Pickle file

In [11]:
df.to_pickle('training_data.pkl')

We can now retrieve the dataset from the pkl in further work (pickle file approx 2GB compared to 12GB of CSV data).

---

# Read the pickle file


In [12]:
# Read the pickle file
df = pd.read_pickle('training_data.pkl')

# Scale the input features

In [13]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()
df[X_columns] = scaler.fit_transform(df[X_columns])

# Classification Problem (2-class, 8-class, or 34-class)
Select which size classification problem you want to solve.

In [14]:
binary_classifier = True
group_classifier = False
individual_classifier = False

if group_classifier:
    print("Group 8 Class Classifier...")
    # Map y column to the dict_7_classes values
    df['label'] = df['label'].map(dict_8_classes)
    class_size = "8"
        
elif binary_classifier:
    print("Binary 2 Class Classifier...")
    # Map y column to the dict_2_classes values
    df['label'] = df['label'].map(dict_2_classes)
    class_size = "2"

else:
    print ("Individual 34 Class classifier...")
    class_size = "34"
    
    

Binary 2 Class Classifier...


# Model Creation (LR, RF, MLP)

In [15]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pickle
from datetime import datetime

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

ML_models = [
    ("LogisticRegression", LogisticRegression(n_jobs=-1), f"logreg-{class_size}class-model.pkl"),
    ("RandomForestClassifier", RandomForestClassifier(), f"rf-{class_size}class-model.pkl"),
    ("MLPClassifier", MLPClassifier(), f"mlp-{class_size}class-model.pkl")
]

def train_and_evaluate(name, model, model_file, df):
    print(datetime.now(), f" : Fit {name} model...")
    model.fit(df[X_columns], df[y_column])
    print(datetime.now(), f" : Fit {name} model complete...")
    
    with open(model_file, "wb") as f:
        pickle.dump(model, f)
    
    y_test = []
    preds = []
    for test_set in tqdm(test_sets):
        d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
        d_test[X_columns] = scaler.transform(d_test[X_columns])

        # Always map the y column to the dict_34_classes values
        new_y = [dict_34_classes[k] for k in d_test[y_column]]
        d_test[y_column] = new_y

        if binary_classifier:
            # binary classifier (2-class)
            new_y = [dict_2_classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        elif group_classifier:
            # group classifier (8-class)
            new_y = [dict_8_classes[k] for k in d_test[y_column]]
            d_test[y_column] = new_y

        else:
            # individual_classifier
            pass

        y_test += list(d_test[y_column].values)

        y_pred = list(model.predict(d_test[X_columns]))
        preds += y_pred

    print(f"##### {name} ({class_size} classes) #####")
    print('accuracy_score: ', accuracy_score(preds, y_test))
    print('recall_score: ', recall_score(preds, y_test, average='macro'))
    print('precision_score: ', precision_score(preds, y_test, average='macro'))
    print('f1_score: ', f1_score(preds, y_test, average='macro'))
    print('\n')

for name, model, model_file in ML_models:
    train_and_evaluate(name, model, model_file, df)

2024-05-01 09:13:44.741381  : Fit LogisticRegression model...
2024-05-01 09:13:50.522883  : Fit LogisticRegression model complete...


100%|██████████| 32/32 [00:54<00:00,  1.70s/it]


##### LogisticRegression (2 classes) #####
accuracy_score:  0.9888853379374651
recall_score:  0.8882461614760764
precision_score:  0.8623430643500973
f1_score:  0.8748222701094408


2024-05-01 09:15:14.786857  : Fit RandomForestClassifier model...
2024-05-01 09:15:51.973362  : Fit RandomForestClassifier model complete...


100%|██████████| 32/32 [01:25<00:00,  2.68s/it]


##### RandomForestClassifier (2 classes) #####
accuracy_score:  0.9970649073646985
recall_score:  0.9643603701206325
precision_score:  0.9723476381512665
f1_score:  0.9683181534689906


2024-05-01 09:17:47.106997  : Fit MLPClassifier model...
2024-05-01 09:19:42.845142  : Fit MLPClassifier model complete...


100%|██████████| 32/32 [01:09<00:00,  2.18s/it]


##### MLPClassifier (2 classes) #####
accuracy_score:  0.993780921670039
recall_score:  0.9240563284796784
precision_score:  0.944157769101246
f1_score:  0.9338613592092939


CPU times: total: 4min 11s
Wall time: 7min 52s


# Load in a Pickled model result

In [17]:
with open("training_data.pkl", "rb") as f:
    model = pickle.load(f)

# Calculate Test Performance metrics

In [18]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    if binary_classifier:
        # binary classifier (2-class)
        new_y = [dict_2_classes[k] for k in d_test[y_column]]
        d_test[y_column] = new_y


    elif group_classifier:
        # group classifier (8-class)
        new_y = [dict_7_classes[k] for k in d_test[y_column]]
        d_test[y_column] = new_y

    else:
        # individual_classifier
        pass

    y_test += list(d_test[y_column].values)

    y_pred = list(model.predict(d_test[X_columns]))
    preds[0] = preds[0] + y_pred

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {ML_names[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_pred, y_test))
    print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()

  0%|          | 0/32 [00:01<?, ?it/s]


KeyError: 'Mirai-greip_flood'