In [30]:
import random
import os
import pickle
import time
import gc


import pandas as pd
from pandas import read_csv

from sklearn.neural_network import MLPClassifier
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import (
    StratifiedKFold,
    train_test_split,
    GridSearchCV,
    cross_validate,
)

from statistics import mean
import umap
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# plt.ioff()
# %matplotlib inline

import numpy as np

from tqdm import tqdm

# import wandb
# wandb.init(project="smart_attacker_same_lightbulb", entity="unr-mpl")

In [31]:
def remove_class(class_name, dataset):
    dataset = dataset[dataset["class"] != class_name]
    return dataset

In [32]:
path_to_iot_noise_cleaned = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/noise/iot_noise/iot_noise_hashes_cleaned.csv"
path_to_iot_noise_uncleaned = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/noise/iot_noise/iot_noise_hashes_uncleaned.csv"

path_to_network_noise_cleaned = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/noise/network_noise/network_noise_hashes_cleaned.csv"
path_to_network_noise_uncleaned = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/noise/network_noise/network_noise_hashes_uncleaned.csv"

path_to_per_packet_cleaned_devices = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/per_packet_hashes/cleaned/per-packet-hashes-cleaned.csv"
path_to_per_packet_uncleaned_devices = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/per_packet_hashes/uncleaned/per-packet-hashes-uncleaned.csv"

path_to_per_packet_cleaned_categories = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/per_packet_hashes/cleaned/cleaned-categories.csv"
path_to_per_packet_uncleaned_categories = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/per_packet_hashes/uncleaned/uncleaned-categories.csv"

path_to_same_plug_cleaned_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_plug/same_plug_cleaned_interaction/"
path_to_same_plug_cleaned_no_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_plug/same_plug_cleaned_no_interaction/"
path_to_same_plug_uncleaned_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_plug/same_plug_uncleaned_interaction/"
path_to_same_plug_uncleaned_no_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_plug/same_plug_uncleaned_no_interaction/"

path_to_same_bulb_cleaned_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_lightbulb/same_lightbulb_cleaned_interaction/"
path_to_same_bulb_cleaned_no_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_lightbulb/same_lightbulb_cleaned_no_interaction/"
path_to_same_bulb_uncleaned_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_lightbulb/same_lightbulb_uncleaned_interaction/"
path_to_same_bulb_uncleaned_no_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_lightbulb/same_lightbulb_uncleaned_no_interaction/"

path_to_same_cam_cleaned_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_cam/same_cam_cleaned_interaction/"
path_to_same_cam_cleaned_no_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_cam/same_cam_cleaned_no_interaction/"
path_to_same_cam_uncleaned_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_cam/same_cam_uncleaned_interaction/"
path_to_same_cam_uncleaned_no_interaction = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/same_device/same_cam/same_cam_uncleaned_no_interaction/"

path_to_simhash = "/home/nthom/Documents/SmartRecon/Fingerprinting in Noisy Network Environments/data/simhashes/"

In [33]:
def noise_generator(original_df, names, one_hundred_times=False):
    num_samples_to_generate = len(original_df.index)
    if one_hundred_times == True:
        num_samples_to_generate *= 100
    num_digits_in_hash = 32
    label = "other"
    random_list = []

    for i in tqdm(range(num_samples_to_generate), desc="Generating random noise"):
        temp_random_hash = []
        for j in range(num_digits_in_hash):
            temp_random_hash.append(random.randint(0, 255))
        temp_random_hash.append(label)
        random_list.append(temp_random_hash)

    output_list = np.concatenate(
        (np.array(random_list), np.array(original_df.values.tolist())), axis=0
    ).tolist()
    output_df = pd.DataFrame(output_list, columns=names)
    return output_df

In [34]:
def combine_csv(csv_list, names):
    final_df = pd.DataFrame(columns=names)
    for index, csv in enumerate(csv_list):
        temp_df = pd.read_csv(csv, names=names)
        final_df = pd.concat([final_df, temp_df])

    return final_df


def get_dataset():
    names = [
        "dim1",
        "dim2",
        "dim3",
        "dim4",
        "dim5",
        "dim6",
        "dim7",
        "dim8",
        "dim9",
        "dim10",
        "dim11",
        "dim12",
        "dim13",
        "dim14",
        "dim15",
        "dim16",
        "dim17",
        "dim18",
        "dim19",
        "dim20",
        "dim21",
        "dim22",
        "dim23",
        "dim24",
        "dim25",
        "dim26",
        "dim27",
        "dim28",
        "dim29",
        "dim30",
        "dim31",
        "dim32",
        "class",
    ]

    experiment_type = int(
        input(
            "Select one of the following: \n1. Nilsimsa Per-Packet Devices \n2. Nilsimsa Per-Packet Categories \n3. Nilsimsa Identical Devices \n4. SimHash Identical Devices \n5. 100x Noise"
        )
    )
    c_uc = int(input("Select one of the following: \n1. Cleaned \n2. Uncleaned"))

    if experiment_type == 1 and c_uc == 1:
        noise = int(
            input(
                "Select one of the following: \n1. Random \n2. IoT Cleaned \n3. IoT Uncleaned \n4. Network Cleaned \n5. Network Uncleaned \n6. None"
            )
        )

        if noise != 6:

            if noise == 1:
                dataset = read_csv(path_to_per_packet_cleaned_devices, names=names)
                dataset = noise_generator(dataset, names)
                name = "cleaned_devices-random"
            elif noise == 2:
                csv_list = [
                    path_to_per_packet_cleaned_devices,
                    path_to_iot_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_devices-cleaned_iot"
            elif noise == 3:
                csv_list = [
                    path_to_per_packet_cleaned_devices,
                    path_to_iot_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_devices-uncleaned_iot"
            elif noise == 4:
                csv_list = [
                    path_to_per_packet_cleaned_devices,
                    path_to_network_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_devices-cleaned_network"
            elif noise == 5:
                csv_list = [
                    path_to_per_packet_cleaned_devices,
                    path_to_network_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_devices-uncleaned_network"
        else:
            dataset = read_csv(path_to_per_packet_cleaned_devices, names=names)
            name = "cleaned_devices"
    elif experiment_type == 1 and c_uc == 2:
        noise = int(
            input(
                "Select one of the following: \n1. Random \n2. IoT Cleaned \n3. IoT Uncleaned \n4. Network Cleaned \n5. Network Uncleaned \n6. None"
            )
        )

        if noise != 6:

            if noise == 1:
                dataset = read_csv(path_to_per_packet_uncleaned_devices, names=names)
                dataset = noise_generator(dataset, names)
                name = "uncleaned_devices-random"
            elif noise == 2:
                csv_list = [
                    path_to_per_packet_uncleaned_devices,
                    path_to_iot_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_devices-cleaned_iot"
            elif noise == 3:
                csv_list = [
                    path_to_per_packet_uncleaned_devices,
                    path_to_iot_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_devices-uncleaned_iot"
            elif noise == 4:
                csv_list = [
                    path_to_per_packet_uncleaned_devices,
                    path_to_network_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_devices-cleaned_network"
            elif noise == 5:
                csv_list = [
                    path_to_per_packet_uncleaned_devices,
                    path_to_network_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_devices-uncleaned-network"
        else:
            dataset = read_csv(path_to_per_packet_uncleaned_devices, names=names)
            name = "uncleaned_devices"
    elif experiment_type == 2 and c_uc == 1:
        noise = int(
            input(
                "Select one of the following: \n1. Random \n2. IoT Cleaned \n3. IoT Uncleaned \n4. Network Cleaned \n5. Network Uncleaned \n6. None"
            )
        )

        if noise != 6:

            if noise == 1:
                dataset = read_csv(path_to_per_packet_cleaned_categories, names=names)
                dataset = noise_generator(dataset, names)
                name = "cleaned_categories-random"
            elif noise == 2:
                csv_list = [
                    path_to_per_packet_cleaned_categories,
                    path_to_iot_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_categories-cleaned_iot"
            elif noise == 3:
                csv_list = [
                    path_to_per_packet_cleaned_categories,
                    path_to_iot_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_categories-uncleaned_iot"
            elif noise == 4:
                csv_list = [
                    path_to_per_packet_cleaned_categories,
                    path_to_network_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_categories-cleaned_network"
            elif noise == 5:
                csv_list = [
                    path_to_per_packet_cleaned_categories,
                    path_to_network_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "cleaned_categories-uncleaned_network"
        else:
            dataset = read_csv(path_to_per_packet_cleaned_categories, names=names)
            name = "cleaned_categories"
    elif experiment_type == 2 and c_uc == 2:
        noise = int(
            input(
                "Select one of the following: \n1. Random \n2. IoT Cleaned \n3. IoT Uncleaned \n4. Network Cleaned \n5. Network Uncleaned \n6. None"
            )
        )

        if noise != 6:

            if noise == 1:
                dataset = read_csv(path_to_per_packet_uncleaned_categories, names=names)
                dataset = noise_generator(dataset, names)
                name = "uncleaned_categories-random"
            elif noise == 2:
                csv_list = [
                    path_to_per_packet_uncleaned_categories,
                    path_to_iot_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_categories-cleaned_iot"
            elif noise == 3:
                csv_list = [
                    path_to_per_packet_uncleaned_categories,
                    path_to_iot_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_categories-uncleaned_iot"
            elif noise == 4:
                csv_list = [
                    path_to_per_packet_uncleaned_categories,
                    path_to_network_noise_cleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_categories-cleaned_network"
            elif noise == 5:
                csv_list = [
                    path_to_per_packet_uncleaned_categories,
                    path_to_network_noise_uncleaned,
                ]
                dataset = combine_csv(csv_list, names)
                name = "uncleaned_categories-uncleaned_network"
        else:
            dataset = read_csv(path_to_per_packet_uncleaned_categories, names=names)
            name = "uncleaned_categories"
    elif experiment_type == 3:
        device = int(input("Select one of the following: \n1. Plug \n2. Bulb \n3. Cam"))
        i_ni = int(
            input("Select one of the following: \n1. Interaction \n2. No Interaction")
        )

        if device == 1 and c_uc == 1 and i_ni == 1:
            csv_list = os.listdir(path_to_same_plug_cleaned_interaction)
            csv_list = [
                f"{path_to_same_plug_cleaned_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "plug-cleaned-interaction"
        elif device == 1 and c_uc == 1 and i_ni == 2:
            csv_list = os.listdir(path_to_same_plug_cleaned_no_interaction)
            csv_list = [
                f"{path_to_same_plug_cleaned_no_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "plug-cleaned-no_interaction"
        elif device == 1 and c_uc == 2 and i_ni == 1:
            csv_list = os.listdir(path_to_same_plug_uncleaned_interaction)
            csv_list = [
                f"{path_to_same_plug_uncleaned_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "plug-uncleaned-interaction"
        elif device == 1 and c_uc == 2 and i_ni == 2:
            csv_list = os.listdir(path_to_same_plug_uncleaned_no_interaction)
            csv_list = [
                f"{path_to_same_plug_uncleaned_no_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "plug-uncleaned-no_interaction"
        elif device == 2 and c_uc == 1 and i_ni == 1:
            csv_list = os.listdir(path_to_same_bulb_cleaned_interaction)
            csv_list = [
                f"{path_to_same_bulb_cleaned_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "bulb-cleaned-interaction"
        elif device == 2 and c_uc == 1 and i_ni == 2:
            csv_list = os.listdir(path_to_same_bulb_cleaned_no_interaction)
            csv_list = [
                f"{path_to_same_bulb_cleaned_no_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "bulb-cleaned-no_interaction"
        elif device == 2 and c_uc == 2 and i_ni == 1:
            csv_list = os.listdir(path_to_same_bulb_uncleaned_interaction)
            csv_list = [
                f"{path_to_same_bulb_uncleaned_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "bulb-uncleaned-interaction"
        elif device == 2 and c_uc == 2 and i_ni == 2:
            csv_list = os.listdir(path_to_same_bulb_uncleaned_no_interaction)
            csv_list = [
                f"{path_to_same_bulb_uncleaned_no_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "bulb-uncleaned-no_interaction"
        elif device == 3 and c_uc == 1 and i_ni == 1:
            csv_list = os.listdir(path_to_same_cam_cleaned_interaction)
            csv_list = [f"{path_to_same_cam_cleaned_interaction + i}" for i in csv_list]
            dataset = combine_csv(csv_list, names)
            name = "cam-cleaned-interaction"
        elif device == 3 and c_uc == 1 and i_ni == 2:
            csv_list = os.listdir(path_to_same_cam_cleaned_no_interaction)
            csv_list = [
                f"{path_to_same_cam_cleaned_no_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "cam-cleaned-no_interaction"
        elif device == 3 and c_uc == 2 and i_ni == 1:
            csv_list = os.listdir(path_to_same_cam_uncleaned_interaction)
            csv_list = [
                f"{path_to_same_cam_uncleaned_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "cam-uncleaned-interaction"
        elif device == 3 and c_uc == 2 and i_ni == 2:
            csv_list = os.listdir(path_to_same_cam_uncleaned_no_interaction)
            csv_list = [
                f"{path_to_same_cam_uncleaned_no_interaction + i}" for i in csv_list
            ]
            dataset = combine_csv(csv_list, names)
            name = "cam-uncleaned-no_interaction"

    elif experiment_type == 4:

        accum = int(
            input(
                "Select one of the following accumulator sizes: \n128 \n256 \n512 \n1024"
            )
        )

        if accum == 128:
            names = [
                "dim1",
                "dim2",
                "dim3",
                "dim4",
                "dim5",
                "dim6",
                "dim7",
                "dim8",
                "dim9",
                "dim10",
                "dim11",
                "dim12",
                "dim13",
                "dim14",
                "dim15",
                "dim16",
                "class",
            ]
        elif accum == 256:
            names = [
                "dim1",
                "dim2",
                "dim3",
                "dim4",
                "dim5",
                "dim6",
                "dim7",
                "dim8",
                "dim9",
                "dim10",
                "dim11",
                "dim12",
                "dim13",
                "dim14",
                "dim15",
                "dim16",
                "dim17",
                "dim18",
                "dim19",
                "dim20",
                "dim21",
                "dim22",
                "dim23",
                "dim24",
                "dim25",
                "dim26",
                "dim27",
                "dim28",
                "dim29",
                "dim30",
                "dim31",
                "dim32",
                "class",
            ]
        elif accum == 512:
            names = [
                "dim1",
                "dim2",
                "dim3",
                "dim4",
                "dim5",
                "dim6",
                "dim7",
                "dim8",
                "dim9",
                "dim10",
                "dim11",
                "dim12",
                "dim13",
                "dim14",
                "dim15",
                "dim16",
                "dim17",
                "dim18",
                "dim19",
                "dim20",
                "dim21",
                "dim22",
                "dim23",
                "dim24",
                "dim25",
                "dim26",
                "dim27",
                "dim28",
                "dim29",
                "dim30",
                "dim31",
                "dim32",
                "dim33",
                "dim34",
                "dim35",
                "dim36",
                "dim37",
                "dim38",
                "dim39",
                "dim40",
                "dim41",
                "dim42",
                "dim43",
                "dim44",
                "dim45",
                "dim46",
                "dim47",
                "dim48",
                "dim49",
                "dim50",
                "dim51",
                "dim52",
                "dim53",
                "dim54",
                "dim55",
                "dim56",
                "dim57",
                "dim58",
                "dim59",
                "dim60",
                "dim61",
                "dim62",
                "dim63",
                "dim64",
                "class",
            ]
        if accum == 1024:
            names = [
                "dim1",
                "dim2",
                "dim3",
                "dim4",
                "dim5",
                "dim6",
                "dim7",
                "dim8",
                "dim9",
                "dim10",
                "dim11",
                "dim12",
                "dim13",
                "dim14",
                "dim15",
                "dim16",
                "dim17",
                "dim18",
                "dim19",
                "dim20",
                "dim21",
                "dim22",
                "dim23",
                "dim24",
                "dim25",
                "dim26",
                "dim27",
                "dim28",
                "dim29",
                "dim30",
                "dim31",
                "dim32",
                "dim33",
                "dim34",
                "dim35",
                "dim36",
                "dim37",
                "dim38",
                "dim39",
                "dim40",
                "dim41",
                "dim42",
                "dim43",
                "dim44",
                "dim45",
                "dim46",
                "dim47",
                "dim48",
                "dim49",
                "dim50",
                "dim51",
                "dim52",
                "dim53",
                "dim54",
                "dim55",
                "dim56",
                "dim57",
                "dim58",
                "dim59",
                "dim60",
                "dim61",
                "dim62",
                "dim63",
                "dim64",
                "dim65",
                "dim66",
                "dim67",
                "dim68",
                "dim69",
                "dim70",
                "dim71",
                "dim72",
                "dim73",
                "dim74",
                "dim75",
                "dim76",
                "dim77",
                "dim78",
                "dim79",
                "dim80",
                "dim81",
                "dim82",
                "dim83",
                "dim84",
                "dim85",
                "dim86",
                "dim87",
                "dim88",
                "dim89",
                "dim90",
                "dim91",
                "dim92",
                "dim93",
                "dim94",
                "dim95",
                "dim96",
                "dim97",
                "dim98",
                "dim99",
                "dim100",
                "dim101",
                "dim102",
                "dim103",
                "dim104",
                "dim105",
                "dim106",
                "dim107",
                "dim108",
                "dim109",
                "dim110",
                "dim111",
                "dim112",
                "dim113",
                "dim114",
                "dim115",
                "dim116",
                "dim117",
                "dim118",
                "dim119",
                "dim120",
                "dim121",
                "dim122",
                "dim123",
                "dim124",
                "dim125",
                "dim126",
                "dim127",
                "dim128",
                "class",
            ]

        window = int(input("Select one of the following window sizes: \n4 \n5 \n6"))
        if window == 4:
            combo = int(
                input("Select one of the following combination sizes: \n2 \n3 \n4")
            )
        elif window == 5:
            combo = int(
                input("Select one of the following combination sizes: \n2 \n3 \n4 \n5")
            )
        elif window == 6:
            combo = int(
                input(
                    "Select one of the following combination sizes: \n2 \n3 \n4 \n5 \n6"
                )
            )

        if c_uc == 1:
            target_dir = f"{path_to_simhash}{accum}/win_{window}/comb_{combo}/cleaned/"
            csv_list = os.listdir(target_dir)
            csv_list = [f"{target_dir + i}" for i in csv_list]
            name = f"SimHash-{accum}-win_{window}-combo_{combo}-cleaned"
        elif c_uc == 2:
            target_dir = (
                f"{path_to_simhash}{accum}/win_{window}/comb_{combo}/uncleaned/"
            )
            csv_list = os.listdir(target_dir)
            csv_list = [f"{target_dir + i}" for i in csv_list]
            name = f"SimHash-{accum}-win_{window}-combo_{combo}-uncleaned"

        dataset = combine_csv(csv_list, names)

    elif experiment_type == 5:
        hash_alg = int(
            input(
                "Select on of the following hashing algorithms: \n1. Nilsimsa \n2.FlexHash"
            )
        )
        noise = int(
            input(
                "Select one of the following: \n1. Random \n2. IoT Cleaned \n3. IoT Uncleaned \n4. Network Cleaned \n5. Network Uncleaned"
            )
        )
        if hash_alg == 1:
            device = int(
                input("Select one of the following: \n1. Plug \n2. Bulb \n3. Cam")
            )
            i_ni = int(
                input(
                    "Select one of the following: \n1. Interaction \n2. No Interaction"
                )
            )
            device_num = int(input("Select a device number (1-8): "))

            if device == 1 and c_uc == 1 and i_ni == 1:
                csv_list = sorted(os.listdir(path_to_same_plug_cleaned_interaction))
                csv_list = [
                    path_to_same_plug_cleaned_interaction + csv_list[device_num - 1]
                ]

                name = f"plug-{device_num}-cleaned-interaction_100x"
            elif device == 1 and c_uc == 1 and i_ni == 2:
                csv_list = sorted(os.listdir(path_to_same_plug_cleaned_no_interaction))
                csv_list = [
                    path_to_same_plug_cleaned_no_interaction + csv_list[device_num - 1]
                ]

                name = f"plug-{device_num}-cleaned-no_interaction_100x"
            elif device == 1 and c_uc == 2 and i_ni == 1:
                csv_list = sorted(os.listdir(path_to_same_plug_uncleaned_interaction))
                csv_list = [
                    path_to_same_plug_uncleaned_interaction + csv_list[device_num - 1]
                ]

                name = f"plug-{device_num}-uncleaned-interaction_100x"
            elif device == 1 and c_uc == 2 and i_ni == 2:
                csv_list = sorted(
                    os.listdir(path_to_same_plug_uncleaned_no_interaction)
                )
                csv_list = [
                    path_to_same_plug_uncleaned_no_interaction
                    + csv_list[device_num - 1]
                ]

                name = f"plug-{device_num}-uncleaned-no_interaction_100x"
            elif device == 2 and c_uc == 1 and i_ni == 1:
                csv_list = sorted(os.listdir(path_to_same_bulb_cleaned_interaction))
                csv_list = [
                    path_to_same_bulb_cleaned_interaction + csv_list[device_num - 1]
                ]

                name = f"bulb-{device_num}-cleaned-interaction_100x"
            elif device == 2 and c_uc == 1 and i_ni == 2:
                csv_list = sorted(os.listdir(path_to_same_bulb_cleaned_no_interaction))
                csv_list = [
                    path_to_same_bulb_cleaned_no_interaction + csv_list[device_num - 1]
                ]

                name = f"bulb-{device_num}-cleaned-no_interaction_100x"
            elif device == 2 and c_uc == 2 and i_ni == 1:
                csv_list = sorted(os.listdir(path_to_same_bulb_uncleaned_interaction))
                csv_list = [
                    path_to_same_bulb_uncleaned_interaction + csv_list[device_num - 1]
                ]

                name = f"bulb-{device_num}-uncleaned-interaction_100x"
            elif device == 2 and c_uc == 2 and i_ni == 2:
                csv_list = sorted(
                    os.listdir(path_to_same_bulb_uncleaned_no_interaction)
                )
                csv_list = [
                    path_to_same_bulb_uncleaned_no_interaction
                    + csv_list[device_num - 1]
                ]

                name = f"bulb-{device_num}-uncleaned-no_interaction_100x"
            elif device == 3 and c_uc == 1 and i_ni == 1:
                csv_list = sorted(os.listdir(path_to_same_cam_cleaned_interaction))
                csv_list = [
                    path_to_same_cam_cleaned_interaction + csv_list[device_num - 1]
                ]

                name = f"cam-{device_num}-cleaned-interaction_100x"
            elif device == 3 and c_uc == 1 and i_ni == 2:
                csv_list = sorted(os.listdir(path_to_same_cam_cleaned_no_interaction))
                csv_list = [
                    path_to_same_cam_cleaned_no_interaction + csv_list[device_num - 1]
                ]

                name = f"cam-{device_num}-cleaned-no_interaction_100x"
            elif device == 3 and c_uc == 2 and i_ni == 1:
                csv_list = sorted(os.listdir(path_to_same_cam_uncleaned_interaction))
                csv_list = [
                    path_to_same_cam_uncleaned_interaction + csv_list[device_num - 1]
                ]

                name = f"cam-{device_num}-uncleaned-interaction_100x"
            elif device == 3 and c_uc == 2 and i_ni == 2:
                csv_list = sorted(os.listdir(path_to_same_cam_uncleaned_no_interaction))
                csv_list = [
                    path_to_same_cam_uncleaned_no_interaction + csv_list[device_num - 1]
                ]

                name = f"cam-{device_num}-uncleaned-no_interaction_100x"

            if noise == 2:
                csv_list.append(path_to_iot_noise_cleaned)
                name += "-cleaned_iot"
            elif noise == 3:
                csv_list.append(path_to_iot_noise_uncleaned)
                name += "-uncleaned_iot"
            elif noise == 4:
                csv_list.append(path_to_network_noise_cleaned)
                name += "-cleaned_network"
            elif noise == 5:
                csv_list.append(path_to_network_noise_uncleaned)
                name += "-uncleaned_network"

            dataset = combine_csv(csv_list, names)

            if noise == 1:
                dataset = noise_generator(dataset, names, False)
                name += "-random"

            print(csv_list, name)

    return dataset, name


dataset, name_of_current_data = get_dataset()
collected = gc.collect()
print("Garbage collector: collected %d objects." % (collected))

Garbage collector: collected 0 objects.


In [35]:
print(f"*** Total samples in {name_of_current_data}: {len(dataset.index)} ***")
for device_name in sorted(dataset["class"].unique()):
    num_samples = len((dataset[dataset["class"] == device_name]).index)
    print(
        f"*** Samples for device: {device_name} in {name_of_current_data}: {num_samples} ({num_samples/dataset.shape[0]}%) ***"
    )

# classes_to_remove = ["light-4", "light-5", "light-6", "light-7", "light-8",]
# for item in classes_to_remove:
#     dataset = remove_class(item, dataset)
#     dataset.dropna(inplace=True)

# Uncomment this line to take only a portion of the data
# dataset = dataset.head(len(dataset.index)//10)

# x is the entire dataframe except for the class column
x = dataset.drop(["class"], axis=1)

# y_original is an unaltered list of all values in the class column
y_original = dataset["class"].values.tolist()

# y is a dataframe of only the class column and the values have been converted to numeric representation
y = dataset["class"]
counter = 0
y_temp = dataset["class"].tolist()
for unique_value in sorted(y.unique()):
    for index, value in enumerate(y):
        if value == unique_value:
            y_temp[index] = counter
    counter += 1
dataset["class"] = y_temp
y = dataset["class"]
labels_numeric = dataset["class"].unique()

# x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=.2, stratify=y.values)

del dataset
# del x
# del y
del y_original
del y_temp
del labels_numeric
collected = gc.collect()
print("Garbage collector: collected %d objects." % (collected))

print("*** Dataset Loaded ***")

*** Total samples in cam-cleaned-no_interaction: 322434 ***
*** Samples for device: cam-1 in cam-cleaned-no_interaction: 40774 (0.12645688730096702%) ***
*** Samples for device: cam-2 in cam-cleaned-no_interaction: 40506 (0.12562570944751483%) ***
*** Samples for device: cam-3 in cam-cleaned-no_interaction: 40396 (0.12528455435841132%) ***
*** Samples for device: cam-4 in cam-cleaned-no_interaction: 39520 (0.12256771928518705%) ***
*** Samples for device: cam-5 in cam-cleaned-no_interaction: 40618 (0.12597306735642022%) ***
*** Samples for device: cam-6 in cam-cleaned-no_interaction: 40446 (0.1254396248534584%) ***
*** Samples for device: cam-7 in cam-cleaned-no_interaction: 40299 (0.12498371759802006%) ***
*** Samples for device: cam-8 in cam-cleaned-no_interaction: 39875 (0.12366871980002109%) ***
Garbage collector: collected 0 objects.
*** Dataset Loaded ***


In [36]:
# param_grid_HGBC = {"learning_rate": [0.01, 0.001, .1], "max_leaf_nodes": [None, 31, 50, 100], "max_depth": [None, 8, 16, 32, 64, 128], "min_samples_leaf": [5, 20, 100], "l2_regularization": [0, .1, .5, 1]}
# HGBC = ensemble.HistGradientBoostingClassifier()
# clf_HGBC = GridSearchCV(HGBC, param_grid_HGBC, n_jobs=20).fit(x.values, y.values)

In [37]:
# param_grid_RFC = {"n_estimators": [50, 100, 200, 500, 1000], "min_samples_leaf": [1, 5, 10, 20, 50, 100], "min_samples_split": [1, 2, 5, 10, 20, 50, 100], "l2_regularization": [.1, .3, .5, .7, 1], "max_depth": [5, 10, 30, 50, 100, 200]}
# RFC = ensemble.RandomForestClassifier()
# clf_RFC = GridSearchCV(RFC, param_grid_RFC, n_jobs=20).fit(x.values, y.values)

In [38]:
# Spot Check Algorithms
# x = [1000 for i in range(100)]
# x = (* x,)

models = []
# models.append((1, ensemble.BaggingClassifier(base_estimator=ensemble.RandomForestClassifier(max_depth=10), n_estimators=50, bootstrap_features=True, n_jobs=16)))
# models.append((1, ensemble.AdaBoostClassifier(base_estimator=ensemble.RandomForestClassifier(), n_estimators=50)))
# models.append((2, ensemble.AdaBoostClassifier(base_estimator=ensemble.RandomForestClassifier(max_depth=10), n_estimators=50)))
# models.append((2, MLPClassifier()))
models.append((1, ensemble.HistGradientBoostingClassifier()))
# models.append((2, ensemble.HistGradientBoostingClassifier(max_depth=32)))
# models.append((3, ensemble.HistGradientBoostingClassifier(max_depth=128)))
# models.append((2, ensemble.RandomForestClassifier()))
# models.append((3, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(), n_estimators=50)))
# models.append((4, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=8), n_estimators=50)))
# models.append((5, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=16), n_estimators=50)))
# models.append((6, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=24), n_estimators=50)))
# models.append((7, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=32), n_estimators=50)))
# models.append((8, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(), n_estimators=100)))
# models.append((9, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=8), n_estimators=100)))
# models.append((10, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=16), n_estimators=100)))
# models.append((11, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=24), n_estimators=100)))
# models.append((12, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=32), n_estimators=100)))
# models.append((4, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=128), n_estimators=50)))
# models.append((4, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=8), n_estimators=50)))
# models.append((5, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=32), n_estimators=50)))
# models.append((4, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=1), n_estimators=32)))
# models.append((4, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=1), n_estimators=500)))
# models.append((5, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=1), n_estimators=1000)))
# models.append((6, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(max_depth=1), n_estimators=10000)))
# models.append((4, ensemble.AdaBoostClassifier(base_estimator=ensemble.HistGradientBoostingClassifier(l2_regularization=0.2, learning_rate=0.2, min_samples_leaf=100), n_estimators=50)))
# models.append((1, ensemble.GradientBoostingClassifier(max_depth=10)))
# models.append((1, ensemble.GradientBoostingClassifier(max_depth=100)))

# evaluate each model
for model_name, model in models:
    print(f"*** Begin Training and Evaluating {model_name} ***")
    start_time = time.time()
    # print(y_train.shape)
    # model.fit(x_train, y_train)
    # print(f"*** {model_name} Trained ***")

    # y_pred = model.predict(x_test)
    # y_probas = model.predict_proba(x_test)

    # weighted_acc_dict = {}
    # for index, label in enumerate(y_test):
    #     weighted_acc_dict[f"{label}_count"] += 1
    #     if y_pred[index] == label:
    #         weighted_acc_dict[label] += 1

    # total_accuracy = accuracy_score(y_test, y_pred)
    # total_f1 = f1_score(y_test, y_pred, average='weighted')

    # print(f"Accuracy: {total_accuracy}")
    # print(f"F1: {total_f1}")

    # ******************** #
    # Cross Validation
    # ******************** #
    cross_val_results = cross_validate(
        model,
        x.values,
        y.values,
        cv=7,
        scoring=["accuracy", "balanced_accuracy", "f1_weighted"],
        n_jobs=7,
    )
    print(f"*** Finished Training and Evaluating {model_name} ***")
    print(f"Dataset Name: {name_of_current_data}")
    print(f"Runtime: {time.time() - start_time}")
    print(f"Accuracy: {mean(cross_val_results['test_accuracy'])}")
    print(f"Balanced Accuracy: {mean(cross_val_results['test_balanced_accuracy'])}")
    # print(f"F1: {mean(cross_val_results['test_f1'])}")
    print(f"Weighted F1: {mean(cross_val_results['test_f1_weighted'])}")

    # wandb.log({f"Total accuracy TSR on {name_of_current_data}": total_accuracy,
    #            "Dataset": name_of_current_data,
    #            "Num Samples": dataset.shape[0]})
    # wandb.log({f"Total precision TSR on {name_of_current_data}": total_precision,
    #            "Dataset": name_of_current_data,
    #            "Num Samples": dataset.shape[0]})
    # wandb.log({f"Total recall TSR on {name_of_current_data}": total_recall,
    #            "Dataset": name_of_current_data,
    #            "Num Samples": dataset.shape[0]})
    # wandb.log({f"Total f1 TSR on {name_of_current_data}": total_f1,
    #            "Dataset": name_of_current_data,
    #            "Num Samples": dataset.shape[0]})

*** Begin Training and Evaluating 1 ***
*** Finished Training and Evaluating 1 ***
Dataset Name: cam-cleaned-no_interaction
Runtime: 4.64546799659729
Accuracy: 0.12176445412084333
Balanced Accuracy: 0.12128887669066196
Weighted F1: 0.11282157223050782


In [39]:
# def draw_umap(data, n_neighbors, min_dist, n_components, metric, title, save_path):
#     umap_reducer = umap.UMAP(
#         n_neighbors=n_neighbors,
#         min_dist=min_dist,
#         n_components=n_components,
#         metric=metric
#     )
#
#     umap_embedding = umap_reducer.fit_transform(data)
#
#     fig = plt.figure(figsize=(5, 5))
#     if n_components == 1:
#         umap_df = pd.DataFrame(umap_embedding, columns=["dim1"])
#         umap_df["class"] = y_train
#
#         ax = fig.add_subplot(111)
#         ax.scatter(umap_df["dim1"].values, range(len(umap_df.index)), c=umap_df["class"].values, s=1)
#     elif n_components == 2:
#         umap_df = pd.DataFrame(umap_embedding, columns=["dim1", "dim2"])
#         umap_df["class"] = y_train
#
#         ax = fig.add_subplot(111)
#         ax.scatter(umap_df["dim1"].values, umap_df["dim2"].values, c=umap_df["class"].values, s=1)
#     else:
#         umap_df = pd.DataFrame(umap_embedding, columns=["dim1", "dim2", "dim3"])
#         umap_df["class"] = y_train
#         ax = fig.add_subplot(111, projection='3d')
#         ax.scatter(umap_df["dim1"].values, umap_df["dim2"].values,umap_df["dim3"].values, c=umap_df["class"].values, s=1)
#
#     plt.title(title, fontsize=8)
#
#     plt.savefig(save_path, dpi=1200)

In [40]:
# # n_neighbors adjusts the UMAP's attention to local structure vs. global relationships
# # min_dist adjusts how close umap is allowed to place points together
# if not os.path.isdir(f"../figures/{name_of_current_data}/"):
#     os.mkdir(f"../figures/{name_of_current_data}/")
#
# num_generations = 2
# for i in tqdm(range(3)):
#     for j in range(num_generations):
#         n_neighbors = 15
#         min_dist = 0.1
#         n_components = i+1
#         metric = "euclidean"
#         # metric = "minkowski"
#
#         title = f"{name_of_current_data}_{n_neighbors}_{min_dist}_{n_components}_{metric}"
#         save_path = f"../figures/{name_of_current_data}/{n_components}d_{j+1}.png"
#         # save_path = f"/home/nthom/Documents/nilsimsa_vis/{n_components}d_{j+1}.png"
#         draw_umap(x_train, n_neighbors, min_dist, n_components, metric, title, save_path)