In [1]:
import gc
from socket import gethostname

import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

import utils

In [2]:
dataset, name_of_current_data = utils.get_dataset()
dataset.reset_index(drop=True, inplace=True)
collected = gc.collect()
print("Garbage collector: collected %d objects." % (collected))

Garbage collector: collected 0 objects.


In [3]:
print(f"*** Total samples in {name_of_current_data}: {len(dataset.index)} ***")
for device_name in sorted(dataset["class"].unique()):
    num_samples = len((dataset[dataset["class"] == device_name]).index)
    print(f"*** Samples for device: {device_name} in {name_of_current_data}: {num_samples} ({num_samples/dataset.shape[0]}%) ***")

# x is the entire dataframe except for the class column
x = dataset.drop(['class'], axis=1)

# y_original is an unaltered list of all values in the class column
y_original = dataset['class'].values.tolist()

# y is a dataframe of only the class column and the values have been converted to numeric representation
y = dataset['class']
counter = 0
y_temp = dataset['class'].tolist()
for unique_value in sorted(y.unique()):
    for index, value in enumerate(y):
        if value == unique_value:
            y_temp[index] = counter
    counter += 1
dataset["class"] = y_temp
y = dataset['class']
labels_numeric = dataset['class'].unique()

x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=.8, stratify=y.values)


names = list(range(x_train.shape[1]))
train_dataset_df = pd.DataFrame(x_train, columns=names)
train_dataset_df.insert(train_dataset_df.shape[1], "class", y_train)

names = list(range(x_test.shape[1]))
test_dataset_df = pd.DataFrame(x_test, columns=names)
test_dataset_df.insert(test_dataset_df.shape[1], "class", y_test)

del x, y, y_original, y_temp, labels_numeric, x_train, y_train, x_test, y_test, dataset, names
collected = gc.collect()
print("Garbage collector: collected %d objects." % (collected))

print("*** Dataset Loaded ***")

*** Total samples in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 152009 ***
*** Samples for device: plug-1 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 16415 (0.10798702708392266%) ***
*** Samples for device: plug-2 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 15192 (0.09994145083514792%) ***
*** Samples for device: plug-3 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 17811 (0.11717069384049629%) ***
*** Samples for device: plug-4 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 21993 (0.14468222276312587%) ***
*** Samples for device: plug-5 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 19964 (0.1313343288884211%) ***
*** Samples for device: plug-6 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 14779 (0.09722450644369741%) ***
*** Samples for device: plug-7 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 30078 (0.19786986296864%) ***
*** Samples for device: plug-8 in FlexHash-plug-accum_1024-win_5-combo_4-cleaned: 15777 (0.10378990717654876%) ***
Garb

In [4]:
model_save_path=f"agModels-{name_of_current_data}_{gethostname()}"

train_dataset_td = TabularDataset(train_dataset_df)
label = "class"
print("Summary of class variable: \n", train_dataset_td[label].describe())

Summary of class variable: 
 count    30401.000000
mean         3.693497
std          2.229232
min          0.000000
25%          2.000000
50%          4.000000
75%          6.000000
max          7.000000
Name: class, dtype: float64


In [None]:
excluded_model_types = ['NN_TORCH', "FASTAI"]
predictor = TabularPredictor(eval_metric="f1_micro", label="class", path=model_save_path).fit(train_dataset_td, presets="medium_quality", excluded_model_types=excluded_model_types)

Presets specified: ['medium_quality']
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "agModels-FlexHash-plug-accum_1024-win_5-combo_4-cleaned-3_1/"
AutoGluon Version:  0.7.0
Python Version:     3.10.9
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #33~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 30 17:03:34 UTC 2
Train Data Rows:    30401
Train Data Columns: 128
Label Column: class
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	8 unique label values:  [0, 3, 2, 6, 7, 4, 1, 5]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one 

In [None]:
results = predictor.fit_summary(show_plot=True)

In [None]:
predictor = TabularPredictor.load(model_save_path)

In [None]:
test_dataset_td = TabularDataset(test_dataset_df)
y_test = test_dataset_td[label]
test_data_noLabel = test_dataset_td.drop(columns=[label])

In [None]:
y_pred = predictor.predict(test_data_noLabel)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

In [None]:
leaderboard_df = predictor.leaderboard(test_dataset_td, silent=True)
leaderboard_df.to_csv(f"autogluon_leaderboard_{name_of_current_data}_{gethostname()}.csv")

In [None]:
feature_importance_df = predictor.feature_importance(test_dataset_td)
feature_importance_df["p_value"].mean()
feature_importance_df["p_value"]