# Setting Parameters

In [35]:
# Classes: '33+1', '8+1', '1+1'
apply_classes = ['33+1', '8+1', '1+1']

# Samplers: None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids'
# Note: SMOTE is O(n^2) to O(n^3). Using a sample size of 1 CSV file takes ~10 min per class to oversample on a OC'd 9700k.
#       Limit the sample size to 1 CSV file unless you want to measure completion time on a geologic timescale.
apply_sampling = 'ClusterCentroids'    # Select ONE from above

# Evaluators: 'XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor'
apply_evaluators = ['Perceptron', 'AdaBoost']


# Notebook parameter validation
for _class in apply_classes:
    if _class not in ['33+1', '8+1', '1+1']:
        assert False, f'{_class} is an invalid class structure.'

if apply_sampling not in [None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids']:
    assert False, f'{apply_sampling} is an invalid sampling method.'
    
for evaluator in apply_evaluators:
    if evaluator not in ['XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 
                         'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor']:
        assert False, f'{evaluator} is an invalid evaluator.'

# Dataset Handling
## Common Imports

In [36]:
import os
import pandas as pd
import random
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from ydata_profiling import ProfileReport
from tqdm import tqdm

## Loading the Dataset

In [37]:
DATASET_DIRECTORY = '../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset
csv_filepaths = [filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')]

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 5

if len(csv_filepaths) > sample_size:
    csv_filepaths = random.sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

full_data = pd.DataFrame()
for data_set in data_sets:
    print(f"data set {data_set} out of {len(data_sets)} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    full_data = pd.concat([full_data, df])

# prints an instance of each class
print("Before encoding:")
unique_labels = full_data['label'].unique()
for label in unique_labels:
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=42)

# prove if the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# Assuming 'label' is the column name for the labels in the DataFrame `synth_data`
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Display the first few entries to verify the changes
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,...,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0,1149323.0
mean,5.639886,76035.52,9.066,66.33707,9227.045,9227.045,3.665947e-06,0.08637259,0.2067147,0.09023573,...,124.8018,33.47234,124.8674,83182510.0,9.498405,13.12644,47.30562,31086.34,0.09647052,141.5101
std,263.4897,455238.6,8.941014,13.97396,101200.8,101200.8,0.002072376,0.2809136,0.4049493,0.2865193,...,241.0931,161.355,241.9796,17018150.0,0.81757,8.636226,228.2203,327347.8,0.2330712,21.02859
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.099143,2.099143,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071560.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.81202,15.81202,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124520.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1053554,305.03,14.64,64.0,118.5152,118.5152,0.0,0.0,0.0,0.0,...,54.05011,0.37285,54.06,83343900.0,9.5,10.39673,0.5059213,1.344812,0.08,141.55
max,59466.46,9810438.0,47.0,255.0,7340032.0,7340032.0,2.059601,1.0,1.0,1.0,...,7959.452,8026.92,5858.0,167639400.0,14.5,123.6001,11351.78,73704170.0,1.0,244.6


# Preprocessing
## Duplicating data for classes

In [38]:
all_data = {}

for _class in apply_classes:
    all_data[_class] = full_data.copy()
    
    match _class:            
        case '8+1':
            label_categories = {
                'Backdoor_Malware': 'Web',
                'BenignTraffic': 'Benign',
                'BrowserHijacking': 'Web',
                'CommandInjection': 'DDoS',
                'DDoS-ACK_Fragmentation': 'DDoS',
                'DDoS-HTTP_Flood': 'DDoS',
                'DDoS-ICMP_Flood': 'DDoS',
                'DDoS-ICMP_Fragmentation': 'DDoS',
                'DDoS-PSHACK_Flood': 'DDoS',
                'DDoS-RSTFINFlood': 'DDoS',
                'DDoS-SYN_Flood': 'DDoS',
                'DDoS-SlowLoris': 'DDoS',
                'DDoS-SynonymousIP_Flood': 'DDoS',
                'DDoS-TCP_Flood': 'DDoS',
                'DDoS-UDP_Flood': 'DDoS',
                'DDoS-UDP_Fragmentation': 'DDoS',
                'DNS_Spoofing': 'Spoofing',
                'DictionaryBruteForce': 'BruteForce',
                'DoS-HTTP_Flood': 'DoS',
                'DoS-SYN_Flood': 'DoS',
                'DoS-TCP_Flood': 'DoS',
                'DoS-UDP_Flood': 'DoS',
                'MITM-ArpSpoofing': 'Spoofing',
                'Mirai-greeth_flood': 'Mirai',
                'Mirai-greip_flood': 'Mirai',
                'Mirai-udpplain': 'Mirai',
                'Recon-HostDiscovery': 'Recon',
                'Recon-OSScan': 'Recon',
                'Recon-PingSweep': 'Recon',
                'Recon-PortScan': 'Recon',
                'SqlInjection': 'Web',
                'Uploading_Attack': 'Web',
                'VulnerabilityScan': 'Recon',
                'XSS': 'Web'
            }
            all_data['8+1']['label'] = all_data['8+1']['label'].map(label_categories)
            
        case '1+1':
            all_data['1+1'].loc[all_data['1+1']['label'] != 'BenignTraffic', 'label'] = 'Attack'
            all_data['1+1'].loc[all_data['1+1']['label'] == 'BenignTraffic', 'label'] = 'Benign'

all_data[apply_classes[0]].head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
135011,4.223898,123.92,6.11,64.0,2.475843,2.475843,0.0,0.0,1.0,0.0,...,2.549096,55.16,82973000.0,9.5,10.454952,3.613746,72.944002,0.09,141.55,DoS-SYN_Flood
203161,1.081191,88.56,6.0,64.0,1.185178,1.185178,0.0,0.0,1.0,0.0,...,0.0,54.0,83365600.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SynonymousIP_Flood
121081,0.0,54.0,6.0,64.0,24.204573,24.204573,0.0,0.0,0.0,0.0,...,0.0,54.0,83076240.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-TCP_Flood
164680,0.0,54.0,6.0,64.0,16.182906,16.182906,0.0,1.0,0.0,1.0,...,0.0,54.0,83344990.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-RSTFINFlood
72705,152.665709,12591.4,7.6,106.3,0.84505,0.84505,0.0,0.0,0.0,0.0,...,84.022061,126.0,166846900.0,13.5,15.437371,119.114693,7102.458425,1.0,244.6,Recon-HostDiscovery


In [39]:
all_data[apply_classes[1]].head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
135011,4.223898,123.92,6.11,64.0,2.475843,2.475843,0.0,0.0,1.0,0.0,...,2.549096,55.16,82973000.0,9.5,10.454952,3.613746,72.944002,0.09,141.55,DoS
203161,1.081191,88.56,6.0,64.0,1.185178,1.185178,0.0,0.0,1.0,0.0,...,0.0,54.0,83365600.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS
121081,0.0,54.0,6.0,64.0,24.204573,24.204573,0.0,0.0,0.0,0.0,...,0.0,54.0,83076240.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS
164680,0.0,54.0,6.0,64.0,16.182906,16.182906,0.0,1.0,0.0,1.0,...,0.0,54.0,83344990.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS
72705,152.665709,12591.4,7.6,106.3,0.84505,0.84505,0.0,0.0,0.0,0.0,...,84.022061,126.0,166846900.0,13.5,15.437371,119.114693,7102.458425,1.0,244.6,Recon


In [40]:
all_data[apply_classes[2]].head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
135011,4.223898,123.92,6.11,64.0,2.475843,2.475843,0.0,0.0,1.0,0.0,...,2.549096,55.16,82973000.0,9.5,10.454952,3.613746,72.944002,0.09,141.55,Attack
203161,1.081191,88.56,6.0,64.0,1.185178,1.185178,0.0,0.0,1.0,0.0,...,0.0,54.0,83365600.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
121081,0.0,54.0,6.0,64.0,24.204573,24.204573,0.0,0.0,0.0,0.0,...,0.0,54.0,83076240.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
164680,0.0,54.0,6.0,64.0,16.182906,16.182906,0.0,1.0,0.0,1.0,...,0.0,54.0,83344990.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
72705,152.665709,12591.4,7.6,106.3,0.84505,0.84505,0.0,0.0,0.0,0.0,...,84.022061,126.0,166846900.0,13.5,15.437371,119.114693,7102.458425,1.0,244.6,Attack



## Encoding Labels

In [41]:
for _class in apply_classes:    
    match _class:
        case '33+1':
            full_label_encoder = LabelEncoder()
            all_data['33+1']['label'] = full_label_encoder.fit_transform(all_data['33+1']['label'])
            
        case '8+1':
            class_label_encoder = LabelEncoder()
            all_data['8+1']['label'] = class_label_encoder.fit_transform(all_data['8+1']['label'])
            
        case '1+1':
            binary_label_encoder = LabelEncoder()
            all_data['1+1']['label'] = binary_label_encoder.fit_transform(all_data['1+1']['label'])
            

# Store label mappings
label_mapping = {index: label for index, label in enumerate(full_label_encoder.classes_)}
print("Label mappings:", label_mapping)

# Retrieve the numeric codes for classes
class_codes = {label: full_label_encoder.transform([label])[0] for label in full_label_encoder.classes_}

# Print specific instances after label encoding
print("After encoding:")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(all_data[apply_classes[0]][all_data[apply_classes[0]]['label'] == code].iloc[0])

all_data[apply_classes[0]].head()

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration      1.341747e+03
Header_Length      4.718860e+04
Protocol Type      1.480000e+01
Duration           1.000000e+02
Rate

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
135011,4.223898,123.92,6.11,64.0,2.475843,2.475843,0.0,0.0,1.0,0.0,...,2.549096,55.16,82973000.0,9.5,10.454952,3.613746,72.944002,0.09,141.55,19
203161,1.081191,88.56,6.0,64.0,1.185178,1.185178,0.0,0.0,1.0,0.0,...,0.0,54.0,83365600.0,9.5,10.392305,0.0,0.0,0.0,141.55,12
121081,0.0,54.0,6.0,64.0,24.204573,24.204573,0.0,0.0,0.0,0.0,...,0.0,54.0,83076240.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
164680,0.0,54.0,6.0,64.0,16.182906,16.182906,0.0,1.0,0.0,1.0,...,0.0,54.0,83344990.0,9.5,10.392305,0.0,0.0,0.0,141.55,9
72705,152.665709,12591.4,7.6,106.3,0.84505,0.84505,0.0,0.0,0.0,0.0,...,84.022061,126.0,166846900.0,13.5,15.437371,119.114693,7102.458425,1.0,244.6,26


## X, y Splitting

In [42]:
X = {}
y = {}

for _class in apply_classes:
    X[_class] = all_data[_class].drop('label', axis=1)
    y[_class] = all_data[_class]['label']

print(f'X: {X[apply_classes[0]].shape}, y: {y[apply_classes[0]].shape}')

X: (1149323, 46), y: (1149323,)


# Sampling

In [None]:
if apply_sampling is not None:
    
    sampler = None
    
    match apply_sampling:
        case 'RandomOverSampler':
            from imblearn.over_sampling import RandomOverSampler
            sampler = RandomOverSampler(random_state=42)
            
        case 'RandomUnderSampler':
            from imblearn.under_sampling import RandomUnderSampler
            sampler = RandomUnderSampler(random_state=42)
            
        case 'SMOTE':
            from imblearn.over_sampling import SMOTENC
            cat_cols = [
                'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
                'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
                'cwr_flag_number', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
                'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP',
                'ICMP', 'IPv', 'LLC'
            ]
            sampler = SMOTENC(categorical_features=cat_cols, random_state=42, n_jobs=-1)
        case 'ClusterCentroids':
            from imblearn.under_sampling import ClusterCentroids
            sampler = ClusterCentroids(random_state=42)
    
    for _class in apply_classes:
        print(f'{datetime.now()}: Applying undersampling to {_class} label grouping...')
        X[_class], y[_class] = sampler.fit_resample(X[_class], y[_class])
        print(f'{_class} : X: {X[_class].shape}, y: {y[_class].shape}')
    
    print(f'X: {X[apply_classes[0]].shape}, y: {y[apply_classes[0]].shape}')
else:
    print('No sampling selected.')

2024-05-09 19:38:05.249731: Applying undersampling to 33+1 label grouping...




33+1 : X: (1156, 46), y: (1156,)
2024-05-09 19:38:10.049416: Applying undersampling to 8+1 label grouping...




8+1 : X: (1156, 46), y: (1156,)
2024-05-09 19:38:42.132717: Applying undersampling to 1+1 label grouping...


In [None]:
# Recombine the resampled features and labels back
all_data_resampled = {}
for _class in apply_classes:
    all_data_resampled[_class] = pd.concat([X[_class], y[_class]], axis=1)

print("Resampled Data (UNSCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(all_data_resampled[apply_classes[0]][all_data_resampled[apply_classes[0]]['label'] == code].iloc[0])

## Real vs Resampled Dataset Analysis

In [None]:
all_data_resampled[apply_classes[0]].describe()

### Generate Reports

In [None]:
for _class in apply_classes:
    original_report = ProfileReport(all_data[_class], title=f'{_class} Original Data', minimal=True)
    resampled_report = ProfileReport(all_data_resampled[_class], title=f'{_class} Resampled Data', minimal=True)
    comparison_report = original_report.compare(resampled_report)
    comparison_report.to_file(f'./profile_reports/{apply_sampling}_{_class}_resampling_report.html')

# Evaluator Model

## Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Preprocessing
### Scaling Numerical Features

In [None]:
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count', 'fin_count',
    'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight'
]

scaler = StandardScaler()
for _class in apply_classes:
    all_data_resampled[_class][num_cols] = scaler.fit_transform(all_data_resampled[_class][num_cols])

### X, y Train/Test Splitting

In [None]:
X_train = {}
X_test = {}
y_train = {}
y_test = {}
    
for _class in apply_classes:
    X = all_data_resampled[_class].drop('label', axis=1)
    y = all_data_resampled[_class]['label']
    
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_train.update({_class: X_train_temp})
    X_test.update({_class: X_test_temp})
    y_train.update({_class: y_train_temp})
    y_test.update({_class: y_test_temp})
    
print(f'X_train: {X_train[apply_classes[0]].shape}, y_train: {y_train[apply_classes[0]].shape}, X_test: {X_test[apply_classes[0]].shape}, y_test: {y_test[apply_classes[0]].shape}')

## Training

In [None]:
for evaluator_type in apply_evaluators:
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator = XGBClassifier()
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator = LogisticRegression(random_state=42, n_jobs=-1)
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator = Perceptron(random_state=42, n_jobs=-1)
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator = AdaBoostClassifier(random_state=42, algorithm='SAMME')
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator = RandomForestClassifier(random_state=42, n_jobs=-1)
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator = MLPClassifier(random_state=42)
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator = KNeighborsClassifier(n_jobs=-1)
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    
    
    for _class in apply_classes:
        # XGBoost for binary classification must be a binary objective
        if evaluator_type == 'XGBoost' and _class == '1+1':
            evaluator = XGBClassifier(objective='binary:logistic')
            
        print(f'{datetime.now()} : Training {evaluator_type} on {apply_sampling} balanced data with {_class} label classes')
        evaluator.fit(X_train[_class], y_train[_class])
    
        print(f'{datetime.now()} : Predicting {evaluator_type} on {_class} classes')
        y_pred = evaluator.predict(X_test[_class])
    
        print(f'{evaluator_type} {_class} Metrics')
        print(f'   Accuracy: {accuracy_score(y_test[_class], y_pred)}')
        print(f'   Precision: {precision_score(y_test[_class], y_pred, average='weighted', zero_division=0.0)}')
        print(f'   Recall: {recall_score(y_test[_class], y_pred, average='weighted')}')
        print(f'   F1: {f1_score(y_test[_class], y_pred, average='weighted')}')
        print()

## Model Analysis

In [None]:
# Disabled, needs multi-label-group and multi-estimator re-implementation
# 
# from sklearn.metrics import confusion_matrix
# 
# cm = pd.DataFrame(confusion_matrix(y_test, y_pred), columns = full_label_encoder.classes_)
# cm.insert(0, column='Actual', value=full_label_encoder.classes_)
# cm