# Setting Parameters

In [None]:
# Class structures to evaluate: '33+1' keeps every individual label, '8+1'
# groups labels into categories, '1+1' is attack vs. benign.
apply_classes = ['33+1', '8+1', '1+1']

apply_sampling = None   # DO NOT CHANGE

apply_evaluators = ['Perceptron', 'XGBoost', 'KNearestNeighbor', 'RandomForest']


# Checking that inputs are valid
VALID_CLASSES = ('33+1', '8+1', '1+1')
# These names must match the cases handled by the `match` in the Sampling
# section; otherwise a "valid" name would silently select no sampler.
VALID_SAMPLERS = (None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTENC', 'Clustering+SMOTENC')
VALID_EVALUATORS = ('XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost',
                    'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor')

# Explicit exceptions instead of `assert False`: assertions are removed when
# Python runs with -O, which would disable the validation entirely.
for _class in apply_classes:
    if _class not in VALID_CLASSES:
        raise ValueError(f'{_class} is an invalid class structure.')

if apply_sampling not in VALID_SAMPLERS:
    raise ValueError(f'{apply_sampling} is an invalid sampler.')

for evaluator in apply_evaluators:
    if evaluator not in VALID_EVALUATORS:
        raise ValueError(f'{evaluator} is an invalid evaluator.')

# Dataset Handling
## Imports

In [None]:
import os
import pandas as pd
import random
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

## Loading the Dataset

In [None]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset

# os.listdir returns entries in arbitrary order; sort so that the sampling
# below starts from the same list on every machine.
csv_filepaths = sorted(
    filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')
)
if not csv_filepaths:
    # Fail here with a clear message instead of a later KeyError on 'label'.
    raise FileNotFoundError(f'No CSV files found in {DATASET_DIRECTORY!r}.')

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 5
sample_seed = 42  # fixed seed so Restart & Run All picks the same files

if len(csv_filepaths) > sample_size:
    # A dedicated Random instance keeps the selection reproducible without
    # touching the global random state.
    csv_filepaths = sorted(random.Random(sample_seed).sample(csv_filepaths, sample_size))
    print(csv_filepaths)

# list of csv files used
data_sets = csv_filepaths

# Read every file first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on each iteration.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} ({data_set}) out of {len(data_sets)} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    frames.append(pd.read_csv(data_path))

# ignore_index gives the combined frame a unique index; without it every
# file contributes its own 0..n-1 labels.
full_data = pd.concat(frames, ignore_index=True)

# prints an instance of each class
print("Before encoding:")
unique_labels = full_data['label'].unique()
for label in unique_labels:
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=1)

# check that the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# 'label' is the column holding the class labels in `full_data`
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Display summary statistics of the numeric columns
full_data.describe()

# Preprocessing
## Encoding Labels

In [None]:
# '33+1' starts as a copy of the raw label and is integer-encoded further down.
full_data['33+1'] = full_data['label'].copy()

# Maps each individual label to its category for the '8+1' class structure.
label_categories = {
    'Backdoor_Malware': 'Web',
    'BenignTraffic': 'Benign',
    'BrowserHijacking': 'Web',
    'CommandInjection': 'Web',  # web-based attack, not DDoS
    'DDoS-ACK_Fragmentation': 'DDoS',
    'DDoS-HTTP_Flood': 'DDoS',
    'DDoS-ICMP_Flood': 'DDoS',
    'DDoS-ICMP_Fragmentation': 'DDoS',
    'DDoS-PSHACK_Flood': 'DDoS',
    'DDoS-RSTFINFlood': 'DDoS',
    'DDoS-SYN_Flood': 'DDoS',
    'DDoS-SlowLoris': 'DDoS',
    'DDoS-SynonymousIP_Flood': 'DDoS',
    'DDoS-TCP_Flood': 'DDoS',
    'DDoS-UDP_Flood': 'DDoS',
    'DDoS-UDP_Fragmentation': 'DDoS',
    'DNS_Spoofing': 'Spoofing',
    'DictionaryBruteForce': 'BruteForce',
    'DoS-HTTP_Flood': 'DoS',
    'DoS-SYN_Flood': 'DoS',
    'DoS-TCP_Flood': 'DoS',
    'DoS-UDP_Flood': 'DoS',
    'MITM-ArpSpoofing': 'Spoofing',
    'Mirai-greeth_flood': 'Mirai',
    'Mirai-greip_flood': 'Mirai',
    'Mirai-udpplain': 'Mirai',
    'Recon-HostDiscovery': 'Recon',
    'Recon-OSScan': 'Recon',
    'Recon-PingSweep': 'Recon',
    'Recon-PortScan': 'Recon',
    'SqlInjection': 'Web',
    'Uploading_Attack': 'Web',
    'VulnerabilityScan': 'Recon',
    'XSS': 'Web'
}
full_data['8+1'] = full_data['33+1'].map(label_categories)

# `.map` yields NaN for any label missing from the dictionary; stop here
# rather than handing NaN to the LabelEncoder below.
unmapped_labels = full_data.loc[full_data['8+1'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f'Labels missing from label_categories: {sorted(map(str, unmapped_labels))}'
    )

# Binary structure: everything that is not benign traffic counts as an attack.
is_benign = full_data['label'] == 'BenignTraffic'
full_data['1+1'] = is_benign.map({True: 'Benign', False: 'Attack'})

full_label_encoder = LabelEncoder()
class_label_encoder = LabelEncoder()
binary_label_encoder = LabelEncoder()

full_data['33+1'] = full_label_encoder.fit_transform(full_data['33+1'])
full_data['8+1'] = class_label_encoder.fit_transform(full_data['8+1'])
full_data['1+1'] = binary_label_encoder.fit_transform(full_data['1+1'])

# Store label mappings (code -> label); a label's code is its position in classes_
label_mapping = dict(enumerate(full_label_encoder.classes_))
print("Label mappings:", label_mapping)

# Reverse lookup (label -> code), derived from the same ordering instead of
# calling transform once per label
class_codes = {label: code for code, label in label_mapping.items()}

# Print specific instances after label encoding
print("After encoding:")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data[full_data['33+1'] == code].iloc[0])

full_data.head()

## Separating Features and Labels

In [None]:
# Split the frame into features (X) and the label columns (y): the raw
# 'label' plus its three encoded class structures.
target_columns = ['label', '33+1', '8+1', '1+1']
X = full_data.drop(columns=target_columns)
y = full_data[target_columns]

# Sampling

In [None]:
if apply_sampling is not None:
    cat_cols = [
        'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
        'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
        'cwr_flag_number', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
        'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP',
        'ICMP', 'IPv', 'LLC'
    ]
    
    undersampler = None
    oversampler = None
    
    match apply_sampling:
        case 'RandomOverSampler':
            from imblearn.over_sampling import RandomOverSampler
            oversampler = RandomOverSampler(random_state=42)
        case 'RandomUnderSampler':
            from imblearn.under_sampling import RandomUnderSampler
            undersampler = RandomUnderSampler(random_state=42)
        case 'SMOTENC':
            from imblearn.over_sampling import SMOTENC
            oversampler = SMOTENC(categorical_features=cat_cols, random_state=42)
        case 'Clustering+SMOTENC':
            from imblearn.under_sampling import ClusterCentroids
            from imblearn.over_sampling import SMOTENC
            undersampler = ClusterCentroids(random_state=42)
            oversampler = SMOTENC(categorical_features=cat_cols, random_state=42)

#   Resampling does not yet include 33+1, 8+1, 1+1 classes
#
#    if undersampler is not None:
#        X, y = undersampler.fit_resample(X, y)  
#    if oversampler is not None:
#        X, y = oversampler.fit_resample(X, y)
        
else:
    print('No sampling selected.')

In [None]:
# Re-attach the label columns to the features so later cells work on one frame
full_data_resampled = pd.concat((X, y), axis='columns')

preview = full_data_resampled.head()
print(preview)
print("Resampled Data (SCALED):")

# Show the first row of every encoded '33+1' class
encoded_labels = full_data_resampled['33+1']
for label in class_codes:
    code = class_codes[label]
    print(f"First instance of {label} (code {code}):")
    first_match = full_data_resampled.loc[encoded_labels == code].iloc[0]
    print(first_match)

preview

## Real vs Resampled Dataset Analysis

In [None]:
# Summary statistics of the recombined feature/label frame, for comparison
# with the statistics of the originally loaded data.
full_data_resampled.describe()

In [None]:
from ydata_profiling import ProfileReport

REPORT_DIRECTORY = './profile_reports'
# Make sure the output directory exists before the report is written, so a
# fresh checkout does not fail at the very last step.
os.makedirs(REPORT_DIRECTORY, exist_ok=True)

# minimal=True is used for both profiles before comparing them.
original_report = ProfileReport(full_data, title='Original Data', minimal=True)
resampled_report = ProfileReport(full_data_resampled, title='Resampled Data', minimal=True)
comparison_report = original_report.compare(resampled_report)
comparison_report.to_file(os.path.join(REPORT_DIRECTORY, 'smote_original_vs_resampled.html'))

# Evaluator Model

## Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Preprocessing
### Scaling Numerical Features

In [None]:
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count', 'fin_count',
    'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight'
]

scaler = StandardScaler()
full_data_resampled[num_cols] = scaler.fit_transform(full_data_resampled[num_cols])

### Splitting

In [None]:
X = full_data_resampled.drop(['label', '33+1', '8+1', '1+1'], axis=1)
y_all = full_data_resampled[['label', '33+1', '8+1', '1+1']]

X_train, X_test, y_train_all, y_test_all = train_test_split(X, y_all, test_size=0.2, random_state=42)

y_train = {}
y_test = {}
for _class in apply_classes:
    y_train[_class] = y_train_all[_class]
    y_test[_class] = y_test_all[_class]

print(f'X_train: {X_train.shape}, y_train: {y_train['33+1'].shape}, X_test: {X_test.shape}, y_test: {y_test['33+1'].shape}')

## Training

In [None]:
for evaluator_type in apply_evaluators:
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator = XGBClassifier()
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator = LogisticRegression(random_state=42, n_jobs=-1)
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator = Perceptron(random_state=42, n_jobs=-1)
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator = AdaBoostClassifier(random_state=42, algorithm='SAMME')
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator = RandomForestClassifier(random_state=42, n_jobs=-1)
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator = MLPClassifier(random_state=42)
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator = KNeighborsClassifier(n_jobs=-1)
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    for _class in apply_classes:
        print(f'{datetime.now()} : Training {evaluator_type} on {_class} classes')
        evaluator.fit(X_train, y_train[_class])
    
        print(f'{datetime.now()} : Predicting {evaluator_type} on {_class} classes')
        y_pred = evaluator.predict(X_test)
    
        print(f'{evaluator_type} {_class} Metrics')
        print(f'   Accuracy: {accuracy_score(y_test[_class], y_pred)}')
        print(f'   Precision: {precision_score(y_test[_class], y_pred, average='weighted', zero_division=0.0)}')
        print(f'   Recall: {recall_score(y_test[_class], y_pred, average='weighted')}')
        print(f'   F1: {f1_score(y_test[_class], y_pred, average='weighted')}')
        print()

## Model Analysis

In [None]:
from sklearn.metrics import confusion_matrix

# `y_pred` holds the predictions of the last model trained above, made for
# the class structure still stored in `_class`. `y_test` is a dict keyed by
# class structure, so select the matching targets and the encoder that
# produced those codes.
label_encoders = {
    '33+1': full_label_encoder,
    '8+1': class_label_encoder,
    '1+1': binary_label_encoder,
}
class_names = label_encoders[_class].classes_

# Passing every encoded label keeps the matrix square and aligned with
# `class_names`, even if a class does not occur in the test split.
cm = pd.DataFrame(
    confusion_matrix(y_test[_class], y_pred, labels=list(range(len(class_names)))),
    columns=class_names,
)
cm.insert(0, column='Actual', value=class_names)
cm