In [1]:
import gc
from statistics import mean
import itertools

import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn import ensemble
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate

%matplotlib inline

from utils import get_dataset_parameterized_auto_select

In [2]:
device_input = int(input("Select one of the following: \n1. Plug \n2. Bulb \n3. Cam"))
if not device_input in [1, 2, 3]:
    raise ValueError(
        "'device' parameter must be one of the following values: 1, 2 or 3. 1 represents plugs, 2 represents light bulbs and 3 represents cameras."
    )
if device_input == 1:
    device_selection = "plug"
elif device_input == 2:
    device_selection = "light"
else:
    device_selection = "cam"

device_c_uc_input = int(
    input("Select one of the following: \n1. Cleaned \n 2. Uncleaned \n")
)
if not device_c_uc_input in [1, 2]:
    raise ValueError("'c_uc' selection must be one of the following values: 1 or 2.")
if device_c_uc_input == 1:
    device_c_uc_selection = "cleaned"
else:
    device_c_uc_selection = "uncleaned"

noise_input = int(
    input(
        "Select one of the following: \n1. IOT \n 2. Network \n3. Random \n4. None \n"
    )
)
if not noise_input in [1, 2, 3, 4]:
    raise ValueError(
        "'noise_type' selection must be one of the following values: 1, 2, 3 or 4."
    )
if noise_input == 1:
    noise_selection = "iot"
elif noise_input == 2:
    noise_selection = "network"
elif noise_input == 3:
    noise_selection = "random"
else:
    noise_selection = "none"

noise_c_uc_input = int(
    input("Select one of the following: \n1. Cleaned \n 2. Uncleaned \n")
)
if not noise_c_uc_input in [1, 2]:
    raise ValueError("'c_uc' selection must be one of the following values: 1 or 2.")
if noise_c_uc_input == 1:
    noise_c_uc_selection = "cleaned"
else:
    noise_c_uc_selection = "uncleaned"

csv_output_name = f"autogluon_parameter_select_{device_selection}_{device_c_uc_input}_{noise_selection}_{noise_c_uc_selection}.csv"
performance_df = pd.DataFrame(
    columns=["dataset name", "accum", "window", "combo", "acc", "bal_acc", "f1"]
)

device_params = list(itertools.repeat(device_input, 48))
device_c_uc_params = list(itertools.repeat(device_c_uc_input, 48))
noise_params = list(itertools.repeat(noise_input, 48))
noise_c_uc_params = list(itertools.repeat(noise_c_uc_input, 48))
accum_params = (
    list(itertools.repeat(128, 6))
    + list(itertools.repeat(256, 6))
    + list(itertools.repeat(512, 6))
    + list(itertools.repeat(1024, 6))
)
window_params = (
    list(itertools.repeat(4, 3))
    + list(itertools.repeat(5, 4))
    + list(itertools.repeat(6, 5))
    + list(itertools.repeat(4, 3))
    + list(itertools.repeat(5, 4))
    + list(itertools.repeat(6, 5))
    + list(itertools.repeat(4, 3))
    + list(itertools.repeat(5, 4))
    + list(itertools.repeat(6, 5))
    + list(itertools.repeat(4, 3))
    + list(itertools.repeat(5, 4))
    + list(itertools.repeat(6, 5))
)
combo_params = [
    2,
    3,
    4,
    2,
    3,
    4,
    5,
    2,
    3,
    4,
    5,
    6,
    2,
    3,
    4,
    2,
    3,
    4,
    5,
    2,
    3,
    4,
    5,
    6,
    2,
    3,
    4,
    2,
    3,
    4,
    5,
    2,
    3,
    4,
    5,
    6,
    2,
    3,
    4,
    2,
    3,
    4,
    5,
    2,
    3,
    4,
    5,
    6,
]

dataset_params = zip(
    device_params,
    device_c_uc_params,
    noise_params,
    noise_c_uc_params,
    accum_params,
    window_params,
    combo_params,
)

dataset_dict = {}
if noise_selection == "none":
    total = 24
else:
    total = 48
for device, device_c_uc, noise, noise_c_uc, accum, window, combo in tqdm(
    dataset_params, total=total
):
    # dataset_dict[f"{device}_{device_c_uc}_{noise}_{noise_c_uc}_{accum}_{window}_{combo}"] = get_dataset_parameterized_auto_select(device, device_c_uc, noise, noise_c_uc, accum, window, combo)
    temp_dataset, temp_name = get_dataset_parameterized_auto_select(
        device, device_c_uc, noise, noise_c_uc, accum, window, combo
    )

    # x is the entire dataframe except for the class column
    x = temp_dataset.drop(["class"], axis=1)
    # y_original is an unaltered list of all values in the class column
    y_original = temp_dataset["class"].values.tolist()
    # y is a dataframe of only the class column and the values have been converted to numeric representation
    y = temp_dataset["class"]
    counter = 0
    y_temp = temp_dataset["class"].tolist()
    for unique_value in sorted(y.unique()):
        for index, value in enumerate(y):
            if value == unique_value:
                y_temp[index] = counter
        counter += 1
    temp_dataset["class"] = y_temp
    y = temp_dataset["class"]
    x_train, x_test, y_train, y_test = train_test_split(
        x.values, y.values, test_size=0.2, stratify=y.values
    )

    dataset_dict[temp_name] = [x_train, x_test, y_train, y_test]

    # print(temp_dataset[temp_dataset["class"]==0])
    # break

    del temp_dataset, y_original, y_temp, x, y
    collected = gc.collect()

100%|██████████| 24/24 [00:16<00:00,  1.46it/s]


In [3]:
max_dict = {0: [0, ""], 1: [0, ""], 2: [0, ""]}

for dataset_name in tqdm(dataset_dict.keys(), total=len(list(dataset_dict.keys()))):
    _, x_test, _, y_test = dataset_dict[dataset_name]
    # print(x_test, y_test)
    # c = collections.Counter(y_test)
    # print(c.items())
    # print(dataset_name)
    # break

    temp_model = ensemble.HistGradientBoostingClassifier()
    temp_cross_val_results = cross_validate(
        temp_model, x_test, y_test, cv=7, scoring=["accuracy"], n_jobs=-1
    )
    temp_average_accuracy = mean(temp_cross_val_results["test_accuracy"])

    if temp_average_accuracy > max_dict[0][0]:
        max_average_accuracy = temp_average_accuracy
        max_average_dataset_name = dataset_name

    if temp_average_accuracy > max_dict[0][0]:
        max_dict[2][0] = max_dict[1][0]
        max_dict[2][1] = max_dict[1][1]

        max_dict[1][0] = max_dict[0][0]
        max_dict[1][1] = max_dict[0][1]

        max_dict[0][0] = temp_average_accuracy
        max_dict[0][1] = dataset_name

print(max_dict)

100%|██████████| 24/24 [02:23<00:00,  5.96s/it]

{0: [0.6120318658946895, 'SimHash-plug_cleaned-none-accum_1024-window_5-combo_5'], 1: [0.5878562563034814, 'SimHash-plug_cleaned-none-accum_512-window_4-combo_4'], 2: [0.5777910932463632, 'SimHash-plug_cleaned-none-accum_512-window_4-combo_3']}





In [4]:
# for key in max_dict.keys():
#     x_train,_,y_train,_ = dataset_dict[max_dict[key][1]]
#     final_model = ensemble.HistGradientBoostingClassifier()
#     cross_val_results = cross_validate(final_model, x_train, y_train, cv=7, scoring=['accuracy'], n_jobs=-1)
#     average_accuracy = mean(cross_val_results["test_accuracy"])
#     print(average_accuracy)

In [11]:
for key in tqdm(max_dict.keys()):
    x_train, _, y_train, _ = dataset_dict[max_dict[key][1]]

    param_model = ensemble.HistGradientBoostingClassifier()
    # param_grid = {"max_depth": [32, 64, 128, None], "l2_regularization": [0, 1, 3, 5, 7,], "min_samples_leaf": [10, 20, 40, 60, 80, 100, 1000], "scoring": ["accuracy"]}
    param_grid = {
        "learning_rate": [0.1, 0.5, 1],
        "max_leaf_nodes": [7, 31, None],
        "max_depth": [1, 32, None],
        "l2_regularization": [0, 0.5, 1, 5],
        "min_samples_leaf": [10, 20, 100],
        "scoring": ["loss", "accuracy"],
    }
    # param_grid = {"n_neighbors": [3, 5, 9, 51, 101, 501], "weights": ["uniform", "distance"], "leaf_size": [10, 30, 50, 100], "p": [1, 2]}
    # search = HalvingGridSearchCV(param_model, param_grid, resource="n_samples", scoring="accuracy", cv=3, aggressive_elimination=True, n_jobs=-1, verbose=1).fit(x_train, y_train)
    search = GridSearchCV(
        estimator=param_model,
        param_grid=param_grid,
        n_jobs=-1,
        cv=3,
        verbose=1,
        scoring="accuracy",
    ).fit(x_train, y_train)
    print("##########")
    print(
        f"{key}. Optimal Parameters: {search.best_params_} Score: {search.best_score_}"
    )
    print("__________")
    print(search.cv_results_)
    print("##########")

  0%|          | 0/3 [00:00<?, ?it/s]

Fitting 3 folds for each of 140 candidates, totalling 420 fits


 33%|███▎      | 1/3 [22:13<44:26, 1333.06s/it]

##########
0. Optimal Parameters: {'l2_regularization': 3, 'max_depth': 64, 'min_samples_leaf': 100, 'scoring': 'accuracy'} Score: 0.6237223130206542
__________
{'mean_fit_time': array([ 51.32644963,  47.56674735,  51.71424445,  51.4629132 ,
        61.87095308,  56.1330208 , 119.57413793,  45.25846918,
        42.81610958,  56.65178299,  48.4144911 ,  57.44508751,
        53.36347103,  98.39146725,  46.72155825,  49.92313329,
        54.41178544,  55.34295495,  55.29589279,  62.50738033,
        80.52113605,  46.41901739,  55.31670117,  52.94281832,
        57.47008657,  53.31559833,  56.39796686,  81.00670012,
        52.51008503,  52.58714795,  64.6497163 ,  54.08738867,
        60.37692769,  46.49261165,  73.13639625,  43.117709  ,
        54.14882509,  57.95182395,  54.93700282,  57.81856108,
        58.48899921, 104.37857064,  53.29617898,  54.50924929,
        53.45428419,  55.90010675,  57.59171653,  53.68988609,
        83.46269457,  58.16214204,  57.9467059 ,  57.04862905,
  

 67%|██████▋   | 2/3 [36:52<17:46, 1066.29s/it]

##########
1. Optimal Parameters: {'l2_regularization': 7, 'max_depth': None, 'min_samples_leaf': 80, 'scoring': 'accuracy'} Score: 0.6075061261047606
__________
{'mean_fit_time': array([31.31134637, 31.81156182, 36.62518811, 41.04349621, 48.09518878,
       36.27365446, 61.71374846, 28.49168372, 30.62460423, 35.64691687,
       36.76408251, 36.08219401, 31.44455099, 56.59500655, 31.76028442,
       28.75799378, 34.83445319, 42.23196514, 40.92249091, 36.3296446 ,
       53.98178744, 27.13944594, 38.7584428 , 31.63577787, 32.3548828 ,
       29.39155753, 43.11211212, 49.06249603, 36.69990547, 38.81168874,
       39.31777231, 29.11222402, 42.01829259, 39.11478639, 83.36371922,
       32.2184302 , 33.23186358, 30.05672685, 37.2593921 , 39.88713082,
       38.04315988, 69.70284828, 32.89666605, 41.73648636, 42.14216145,
       33.61748401, 44.11251799, 36.16017063, 81.19491895, 40.79110567,
       28.93316658, 33.39681633, 36.24720867, 40.40339351, 36.01443497,
       63.76877332, 34.88583

 67%|██████▋   | 2/3 [49:33<24:46, 1486.60s/it]


KeyboardInterrupt: 

In [10]:
x_train, x_test, y_train, y_test = dataset_dict[max_dict[0][1]]
param_model = ensemble.HistGradientBoostingClassifier(
    l2_regularization=5,
    learning_rate=0.1,
    max_depth=32,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    scoring="accuracy",
)
cross_val_results = cross_validate(
    param_model,
    np.concatenate((x_train, x_test)),
    np.concatenate((y_train, y_test)),
    cv=7,
    scoring=["accuracy"],
    n_jobs=-1,
)
average_accuracy = mean(cross_val_results["test_accuracy"])
print(average_accuracy)

0.6247130495151173
