In [1]:
import utils

import gc

import pandas as pd

from sklearn.model_selection import train_test_split

from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
dataset, name_of_current_data = utils.get_dataset()
dataset.reset_index(drop=True, inplace=True)
collected = gc.collect()
print("Garbage collector: collected %d objects." % (collected))

In [16]:
print(f"*** Total samples in {name_of_current_data}: {len(dataset.index)} ***")
for device_name in sorted(dataset["class"].unique()):
    num_samples = len((dataset[dataset["class"] == device_name]).index)
    print(f"*** Samples for device: {device_name} in {name_of_current_data}: {num_samples} ({num_samples/dataset.shape[0]}%) ***")

# x is the entire dataframe except for the class column
x = dataset.drop(['class'], axis=1)

# y_original is an unaltered list of all values in the class column
y_original = dataset['class'].values.tolist()

# y is a dataframe of only the class column and the values have been converted to numeric representation
y = dataset['class']
counter = 0
y_temp = dataset['class'].tolist()
for unique_value in sorted(y.unique()):
    for index, value in enumerate(y):
        if value == unique_value:
            y_temp[index] = counter
    counter += 1
dataset["class"] = y_temp
y = dataset['class']
labels_numeric = dataset['class'].unique()

x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=.2, stratify=y.values)


names = list(range(x_train.shape[1]))
train_dataset_df = pd.DataFrame(x_train, columns=names)
train_dataset_df.insert(train_dataset_df.shape[1], "class", y_train)

names = list(range(x_test.shape[1]))
test_dataset_df = pd.DataFrame(x_test, columns=names)
test_dataset_df.insert(test_dataset_df.shape[1], "class", y_test)

del x, y, y_original, y_temp, labels_numeric, x_train, y_train, x_test, y_test, dataset, names
collected = gc.collect()
print("Garbage collector: collected %d objects." % (collected))

print("*** Dataset Loaded ***")

*** Total samples in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 322434 ***
*** Samples for device: cam-1 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 40774 (0.12645688730096702%) ***
*** Samples for device: cam-2 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 40506 (0.12562570944751483%) ***
*** Samples for device: cam-3 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 40396 (0.12528455435841132%) ***
*** Samples for device: cam-4 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 39520 (0.12256771928518705%) ***
*** Samples for device: cam-5 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 40618 (0.12597306735642022%) ***
*** Samples for device: cam-6 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 40446 (0.1254396248534584%) ***
*** Samples for device: cam-7 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 40299 (0.12498371759802006%) ***
*** Samples for device: cam-8 in SimHash-cam-accum_1024-win_5-combo_4-cleaned: 39875 (0.12366871980002109%) ***
Garbage collector: collecte

In [26]:
model_save_path=f"agModels-{name_of_current_data}"

train_dataset_td = TabularDataset(train_dataset_df)
label = "class"
print("Summary of class variable: \n", train_dataset_td[label].describe())

Summary of class variable: 
 count    257947.000000
mean          3.490570
std           2.292715
min           0.000000
25%           1.000000
50%           4.000000
75%           5.000000
max           7.000000
Name: class, dtype: float64


In [18]:
excluded_model_types = ['NN_TORCH', "FASTAI"]
predictor = TabularPredictor(label="class", path=model_save_path).fit(train_dataset_td, presets="best_quality", num_gpus=1, excluded_model_types=excluded_model_types)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "agModels-predictDevice/"
AutoGluon Version:  0.6.2
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #64-Ubuntu SMP Thu Jan 5 11:43:13 UTC 2023
Train Data Rows:    257947
Train Data Columns: 128
Label Column: class
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	8 unique label values:  [0, 3, 1, 6, 4, 7, 2, 5]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You m

In [19]:
results = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.615878    3906.586901  12426.258812                0.016966          20.355004            3       True         19
1           LightGBM_BAG_L2   0.606303    1034.203860   4551.722049              984.593742        4337.725071            2       True         11
2      LightGBMLarge_BAG_L2   0.604981    1136.267698   4063.576134             1086.657580        3849.579156            2       True         18
3         LightGBMXT_BAG_L2   0.595917    1835.318613   4218.599582             1785.708495        4004.602603            2       True         10
4            XGBoost_BAG_L2   0.525972     418.834451   8151.637550              369.224333        7937.640571            2       True         17
5     ExtraTreesEntr_BAG_L2   0.335604      63.484051    236.7



In [28]:
predictor = TabularPredictor.load(model_save_path)

In [29]:
test_dataset_td = TabularDataset(test_dataset_df)
y_test = test_dataset_td[label]
test_data_noLabel = test_dataset_td.drop(columns=[label])

In [30]:
y_pred = predictor.predict(test_data_noLabel)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.325910648657869
Evaluations on test data:
{
    "accuracy": 0.325910648657869,
    "balanced_accuracy": 0.32590317986445605,
    "mcc": 0.22960156408034707
}


In [None]:
leaderboard_df = predictor.leaderboard(test_dataset_td, silent=True)
leaderboard_df.to_csv(f"autogluon_leaderboard_{name_of_current_data}.csv")

In [32]:
feature_importance_df = predictor.feature_importance(test_dataset_td)
feature_importance_df["p_value"].mean()
feature_importance_df["p_value"]

These features in provided data are not utilized by the predictor and will be ignored: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]
Computing feature importance via permutation shuffling for 0 features using 5000 rows with 5 shuffle sets...


KeyError: "128 required columns are missing from the provided dataset to transform using AutoMLPipelineFeatureGenerator. Missing columns: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127']"