# Training a ML model using CICIoT2023

This notebook shows how a LogisticRegression model can be trained using the CICIoT2023 csv files.

In [33]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
import random
SEED = 42  # Set to None for randomness
if SEED:
    random.seed(SEED)
    print(f"INFO: Using seed {SEED}")
else:
    print(f"Using random seed")

# Manual train/test splitting
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression

INFO: Using seed 42


In [34]:
DATASET_DIRECTORY = '../CICIoT2023/'

### Importing Dataset

In [35]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()
df_sets = df_sets
test_indexes_count = int(len(df_sets)*0.2)//4
train_indexes_count = int(len(df_sets)*0.8)//4
index_range = range(len(df_sets))
test_indexes = random.sample(index_range, test_indexes_count)
train_indexes = [i for i in index_range if i not in test_indexes]
train_indexes = train_indexes[:train_indexes_count]
training_sets, test_sets = [], []
for i in train_indexes:
    training_sets.append(df_sets[i])
for i in test_indexes:
    test_sets.append(df_sets[i])
# training_sets = df_sets[:int(len(df_sets)*.8)]
# test_sets = df_sets[int(len(df_sets)*.8):]

In [36]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

### Scaling

In [37]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import joblib
scaler = StandardScaler()

In [38]:
scaler_filename = f"precomputed/scaler-{SEED}.save"

if os.path.exists(scaler_filename):
    scaler = joblib.load(scaler_filename) 
else:
    scaler = StandardScaler()
    for train_set in tqdm(train_indexes):
        scaler.fit(pd.read_csv(DATASET_DIRECTORY + '/' + df_sets[train_set])[X_columns])

    joblib.dump(scaler, scaler_filename)

### Classification: 34 (33+1) classes

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier


ML_models = [
        SGDClassifier(),  # or 'rbf', 'poly' for different kernels
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        HistGradientBoostingClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        GaussianNB(),
]

ML_neams = [
        'SGD',
        'Decision Tree',
        'Random Forest',
        'Gradient boosting',
        'K Neighbors',
        'Gaussian NB',
]

for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    for i, model in enumerate(ML_models):
        print(f"Currently training {ML_neams[i]}")
        model.fit(d[X_columns], d[y_column])
    del d

In [40]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    
    y_test += list(d_test[y_column].values)
    
    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
        

100%|██████████| 8/8 [12:31<00:00, 93.95s/it] 


In [50]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print("### 34 CLASSES ###")
print("Model;accuracy;recall;precision;f1")
for k,v in preds.items():
    y_pred = v
    print(f"{ML_neams[k]};{accuracy_score(y_pred, y_test)};{recall_score(y_pred, y_test, average='macro')};{precision_score(y_pred, y_test, average='macro')};{f1_score(y_pred, y_test, average='macro')}")
    pass

print('\n\n')

### 34 CLASSES ###
Model;accuracy;recall;precision;f1


KeyboardInterrupt: 

# Classification: 8 (7+1) classes

In [42]:
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

ML_models = [
        SGDClassifier(),  # or 'rbf', 'poly' for different kernels
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        HistGradientBoostingClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        GaussianNB(),
]

ML_neams = [
        'SGD',
        'Decision Tree',
        'Random Forest',
        'Gradient boosting',
        'K Neighbors',
        'Gaussian NB',
]

for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_7classes[k] for k in d[y_column]]
    d[y_column] = new_y
    
    for model in (ML_models):
        model.fit(d[X_columns], d[y_column])
    del d

100%|██████████| 33/33 [16:48<00:00, 30.57s/it]


In [44]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_7classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y
    
    y_test += list(d_test[y_column].values)
    
    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
        

100%|██████████| 8/8 [11:45<00:00, 88.21s/it] 


In [45]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print("### 8 classes ###")
print("Model;accuracy;recall;precision;f1")
for k,v in preds.items():
    y_pred = v
    print(f"{ML_neams[k]};{accuracy_score(y_pred, y_test)};{recall_score(y_pred, y_test, average='macro')};{precision_score(y_pred, y_test, average='macro')};{f1_score(y_pred, y_test, average='macro')}")

print('\n\n')

### 8 classes ###
Model;accuracy;recall;precision;f1
SGD;0.8226639149456495;0.6811245916588335;0.4659889369641561;0.4984594743864877
Decision Tree;0.9943979108180679;0.8259435706734759;0.8366712334333916;0.8310149072660082
Random Forest;0.9949772281075909;0.9349286016423369;0.7236331985863027;0.7466459149207663
Gradient boosting;0.9913418712945034;0.6932640538432184;0.6961854664989673;0.691220826030268
K Neighbors;0.9513763443562494;0.7636160449076961;0.630398168095793;0.6557933211619869
Gaussian NB;0.485098286789356;0.5736826482423533;0.48636651971232225;0.3549628881576296


# Classification: 2 (1+1) Classes

In [46]:
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
dict_2classes['DDoS-SYN_Flood'] = 'Attack'
dict_2classes['DDoS-UDP_Flood'] = 'Attack'
dict_2classes['DDoS-TCP_Flood'] = 'Attack'
dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-SlowLoris'] = 'Attack'
dict_2classes['DDoS-HTTP_Flood'] = 'Attack'

dict_2classes['DoS-UDP_Flood'] = 'Attack'
dict_2classes['DoS-SYN_Flood'] = 'Attack'
dict_2classes['DoS-TCP_Flood'] = 'Attack'
dict_2classes['DoS-HTTP_Flood'] = 'Attack'


dict_2classes['Mirai-greeth_flood'] = 'Attack'
dict_2classes['Mirai-greip_flood'] = 'Attack'
dict_2classes['Mirai-udpplain'] = 'Attack'

dict_2classes['Recon-PingSweep'] = 'Attack'
dict_2classes['Recon-OSScan'] = 'Attack'
dict_2classes['Recon-PortScan'] = 'Attack'
dict_2classes['VulnerabilityScan'] = 'Attack'
dict_2classes['Recon-HostDiscovery'] = 'Attack'

dict_2classes['DNS_Spoofing'] = 'Attack'
dict_2classes['MITM-ArpSpoofing'] = 'Attack'

dict_2classes['BenignTraffic'] = 'Benign'

dict_2classes['BrowserHijacking'] = 'Attack'
dict_2classes['Backdoor_Malware'] = 'Attack'
dict_2classes['XSS'] = 'Attack'
dict_2classes['Uploading_Attack'] = 'Attack'
dict_2classes['SqlInjection'] = 'Attack'
dict_2classes['CommandInjection'] = 'Attack'

dict_2classes['DictionaryBruteForce'] = 'Attack'

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
ML_models = [
        SGDClassifier(),  # or 'rbf', 'poly' for different kernels
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        HistGradientBoostingClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        GaussianNB(),
]

ML_neams = [
        'SGD',
        'Decision Tree',
        'Random Forest',
        'Gradient boosting',
        'K Neighbors',
        'Gaussian NB',
]

for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = new_y
    
    for model in (ML_models):
        model.fit(d[X_columns], d[y_column])
    del d

100%|██████████| 33/33 [10:25<00:00, 18.97s/it]


In [48]:
y_test = []
preds = {i:[] for i in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = new_y
    
    y_test += list(d_test[y_column].values)
    
    for i in range(len(ML_models)):
        model = ML_models[i]
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i] = preds[i] + y_pred
        

100%|██████████| 8/8 [11:29<00:00, 86.16s/it] 


In [49]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print("### 2 classes ###")
print("Model;accuracy;recall;precision;f1")
for k,v in preds.items():
    y_pred = v
    print(f"{ML_neams[k]};{accuracy_score(y_pred, y_test)};{recall_score(y_pred, y_test, average='macro')};{precision_score(y_pred, y_test, average='macro')};{f1_score(y_pred, y_test, average='macro')}")

print('\n\n')

### 2 classes ###
Model;accuracy;recall;precision;f1
SGD;0.9870448708755057;0.8832426419556297;0.8145921252913875;0.8453536197171853
Decision Tree;0.9959486786409572;0.9559238201484554;0.9560056709048188;0.9559647416582748
Random Forest;0.9970440519453059;0.9655015434453357;0.9706146407925671;0.9680433922436916
Gradient boosting;0.9960583026306724;0.947853559570922;0.9689339684479317;0.9581382406083538
K Neighbors;0.9912192484127269;0.8885966870150181;0.9335665348487572;0.9097783938679028
Gaussian NB;0.5288239599044842;0.5237413611185774;0.7580470941091142;0.3863973248113648

