Naive Bayes classification, with some code based on the tutorial from: 
https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/

In [1]:
# Imports
from math import sqrt, exp, pi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
plt.style.use('bmh')

In [2]:
# Check if data exists, download if required
rt_iot2022 = fetch_ucirepo(id=942) 

In [3]:
# Remove axes that don't contribute to data, and get labels
features = rt_iot2022.data.features
targets = rt_iot2022.data.targets

unqi = features.nunique(axis=0)
for i in range(len(unqi)):
    if unqi[i] <= 1:
        print("Removing", features.axes[1][i])
        del features[features.axes[1][i]]
# bwd_URG_flag count is always the same, usually removed

# Get labels for data (attack or normal behavior)
normal_patterns = ["MQTT_Publish", "Thing_speak", "Wipro_bulb_Dataset", "Amazon-Alexa"]
y_bool = [int(x in normal_patterns) for x in targets.values]

  if unqi[i] <= 1:


Removing bwd_URG_flag_count


In [4]:
# TODO: convert string to numerical
unqi_service = list(features["service"].unique())
unqi_proto = list(features["proto"].unique())
for i in range(len(features["service"])):
    features.loc[i, "service"] = unqi_service.index(features["service"][i])
    features.loc[i, "proto"] = unqi_proto.index(features["proto"][i])

In [5]:
# Split data into train/test sets
X = features.values
y = np.array(y_bool)
np.random.seed(2)

test_split = .2
test_samples = int(len(X) * test_split)
perm  = np.random.permutation(len(X))
X = X[perm]
y = y[perm]

x_train = X[0:test_samples]
y_train = y[0:test_samples]
x_test  = X[test_samples:]
y_test  = y[test_samples:]

In [6]:
# Seperate data by class
def seperate_by_class(data, labels):
    seperated = dict()
    for i in range(len(data)):
        vector = data[i]
        if labels[i] not in seperated:
            seperated[labels[i]] = list()    
        seperated[labels[i]].append(vector)
    return seperated

seperated_train = seperate_by_class(x_train, y_train)
seperated_test = seperate_by_class(x_test, y_test)

In [7]:
# Statistical measures
def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stddev(numbers):
    variance = sum([(x - mean(numbers))**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

def summarize(dataset):
    summaries = []
    for column in zip(*dataset):
        # changed to numpy versions for efficency reasons (50 mins vs a few seconds)
        column = np.array(column)
        summaries.append([np.mean(column), np.std(column), len(column)])
    return summaries

# pass in the seperated dataset
def summarize_classwise(dataset):
    summaries = dict()
    for classy, rows in dataset.items():
        summaries[classy] = summarize(rows)
    return summaries

In [8]:
summarized_test = summarize_classwise(seperated_test)
summarized_train = summarize_classwise(seperated_train)

In [9]:
summarized_test
# todo: should [2] be the same for all?

{0: [[34193.458573695214, 19203.166148484223, 95169],
  [994.8489739305867, 5381.612003344824, 95169],
  [0.1077556767434774, 0.3116939568037514, 95169],
  [1.2929525370656412, 0.8249750528796502, 95169],
  [2.352617815318013, 125.0806059937397, 95169],
  [1.9824732843678088, 19.522533784390102, 95169],
  [1.7147285355525435, 17.96540781953476, 95169],
  [1.4014227321922055, 16.674791318158224, 95169],
  [0.6599102648971829, 16.381965799349516, 95169],
  [363902.8617889904, 371206.6218498899, 95169],
  [363855.6541742278, 371246.6662217275, 95169],
  [727758.5159661613, 742450.1676473862, 95169],
  [0.8629848611837888, 0.3395385191843569, 95169],
  [44.23587512740493, 369.609160083848, 95169],
  [19.345837405037354, 4.932958963914801, 95169],
  [19.96511469070811, 6.3809281090450405, 95169],
  [39.357984217549834, 506.50392511198703, 95169],
  [17.188832497977284, 7.679884706412769, 95169],
  [17.667769967111138, 8.652786833787406, 95169],
  [0.11884121930460549, 0.4799230277827581, 95

In [10]:
# gaussian prob dist calculation
# TODO: why is std 0 sometimes
def calc_prob(x, mean, std):
    if (std != 0):
        exponential = exp(-((x-mean)**2 / (2 * std**2)))
        return (1 / (sqrt(2 * pi) * std)) * exponential
    else:
        return 1

def calc_class_prob(summaries, row):
    probs = dict()
    total_rows = sum([summaries[label][0][2] for label in summaries])
    for i, val in summaries.items():
        probs[i] = summaries[i][0][2] / float(total_rows)
        for j in range(len(val)):
            mean, std, _ = val[j]
            probs[i] *= calc_prob(row[j], mean, std)
    return probs


In [11]:
probs = calc_class_prob(summarized_train, x_train[15])

predicted = []
for i in x_test:
    probs = calc_class_prob(summarized_train, i)
    if probs[0] > probs[1]:
        predicted.append(0)
    else:
        predicted.append(1)


In [12]:
true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0

for i in range(len(predicted)):
    if predicted[i] == 1:
        if predicted[i] == y_test[i]:
            true_positive += 1
        else:
            false_positive += 1
    elif predicted[i] == 0:
        if predicted[i] == y_test[i]:
            true_negative += 1
        else:
            false_negative += 1

pos_acc = true_positive / (true_positive + false_positive)
neg_acc = true_negative / (true_negative + false_negative)

print("Naive bayes accuracy is:", (true_positive + true_negative)/len(predicted))
print("Balanced Acc is: ", (pos_acc+neg_acc)/2)

Naive bayes accuracy is: 0.9147359230003859
Balanced Acc is:  0.6416698115145643
