In [3]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
label = 'class'  # name of the column the models should learn to predict
save_path = 'abalone_models/'  # directory in which trained models are stored

# Read the training CSV from a local path (TabularDataset gives back a Pandas DataFrame)
# and keep only the first 500 rows so the demo trains quickly.
train_data = TabularDataset('../../modified/abalone-train.csv').head(500)
print(train_data.head())



Loaded data from: ../../modified/abalone-train.csv | Columns = 9 / 9 | Rows = 3340 -> 3340


   0      1      2      3       4       5       6       7  class
0  M  0.650  0.525  0.185  1.4880  0.6650  0.3370  0.3780     11
1  M  0.595  0.475  0.170  1.0965  0.4190  0.2290  0.3500     17
2  M  0.575  0.470  0.140  0.8375  0.3485  0.1735  0.2400     11
3  F  0.550  0.450  0.150  0.8750  0.3620  0.1755  0.2765     10
4  M  0.505  0.385  0.145  0.6775  0.2360  0.1790  0.2000     15


In [2]:
# Build the predictor for the chosen label column, then train it with a 3600-second budget.
# `predictor` ends up bound to whatever fit() returns, exactly as in a chained call.
predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(train_data, time_limit=3600)
# NOTE: The default settings used above aim for a reasonable runtime at the cost of accuracy.
# To maximize predictive accuracy, use this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')

# Print the training summary and keep the returned value for later inspection.
results = predictor.fit_summary()


Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "abalone_models/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 8
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	First 10 (of 20) unique label values:  [11, 17, 10, 15, 8, 13, 6, 5, 12, 9]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 6 examples that will be kept for training models: 0.982
Train Data Class Count: 15
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    526079.31 MB
	Train Data (Original)  Memory Usage: 0.06 MB (0.0% of available memory)
	Inferring data type of each feature based on column valu

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.353535       0.719679   13.348423                0.000476           0.407246            2       True         14
1            LightGBMXT   0.343434       0.006627    1.235220                0.006627           1.235220            1       True          4
2         LightGBMLarge   0.313131       0.009436    6.291453                0.009436           6.291453            1       True         13
3        NeuralNetMXNet   0.313131       0.131122    3.977715                0.131122           3.977715            1       True         12
4              CatBoost   0.303030       0.005111    1.562947                0.005111           1.562947            1       True          8
5              LightGBM   0.303030       0.005962    1.411917                0.005962           1.

In [4]:
# Inference time:
test_data = TabularDataset('../../modified/abalone-test.csv')  # another Pandas DataFrame
y_test = test_data[label]  # keep the true labels aside for scoring the predictions
# Remove the label column from the features, since real inference data would not include it.
test_data = test_data.drop(columns=[label])
print(test_data.head())



Loaded data from: ../../modified/abalone-test.csv | Columns = 9 / 9 | Rows = 836 -> 836


   0      1      2      3       4       5       6      7
0  F  0.440  0.355  0.115  0.4150  0.1585  0.0925  0.131
1  M  0.680  0.515  0.170  1.6115  0.8415  0.3060  0.395
2  F  0.475  0.360  0.125  0.4470  0.1695  0.0810  0.140
3  M  0.620  0.480  0.160  1.0765  0.4120  0.2530  0.300
4  M  0.515  0.435  0.145  0.8815  0.2920  0.2060  0.255


In [5]:
# Reloading is not required here; it only demonstrates how a previously trained
# predictor can be restored from the directory it was saved to.
predictor = TabularPredictor.load(save_path)

# Predict labels for the unlabeled test features, then score them against the held-out labels.
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,  # report additional metrics alongside the main one
)


Evaluation: accuracy on test data: 0.22966507177033493
Evaluations on test data:
{
    "accuracy": 0.22966507177033493,
    "balanced_accuracy": 0.10508099528629468,
    "mcc": 0.12106042409142431
}


In [7]:
# Convert the accuracy fraction into a percentage and show it with two decimals.
accuracy_pct = perf["accuracy"] * 100
print('Accuracy = {:.2f}%'.format(accuracy_pct))

Accuracy = 22.97%
