In [1]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
label = 'class'  # name of the column to predict
save_path = 'airline_passengers_ms/'  # directory where trained models are written

# Read the training CSV (TabularDataset returns a pandas DataFrame) and keep at most the
# first 500 rows so the demo runs faster.
train_data = TabularDataset('../../modified/airline-passengers-train.csv').head(500)
print(train_data.head())



         0  class
0  1950-07    170
1  1951-01    145
2  1950-03    141
3  1957-03    356
4  1952-05    183


In [2]:
# Train with the 'best_quality' preset, capped at one hour (time_limit is in seconds).
# problem_type is pinned to 'regression': the label column is named 'class' but holds
# integer values, and AutoGluon would otherwise have to infer the task from the data.
# To optimize a specific metric, additionally pass eval_metric=... to TabularPredictor.
predictor = TabularPredictor(label=label, problem_type='regression', path=save_path)
predictor.fit(train_data, time_limit=3600, presets='best_quality')
results = predictor.fit_summary()


Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "airline_passengers_ms/"
AutoGluon Version:  0.2.0
Train Data Rows:    115
Train Data Columns: 1
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (559, 104, 265.86087, 114.02069)
	If 'regression' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    518466.9 MB
	Train Data (Original)  Memory Usage: 0.01 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGen

[1000]	train_set's rmse: 38.0432	valid_set's rmse: 54.7317
[2000]	train_set's rmse: 37.3611	valid_set's rmse: 54.4316


	-281.2742	 = Validation root_mean_squared_error score
	87.39s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 3512.51s of the 3512.51s of remaining time.


[1000]	train_set's rmse: 35.3204	valid_set's rmse: 53.7288
[2000]	train_set's rmse: 34.9314	valid_set's rmse: 53.2433
[3000]	train_set's rmse: 34.8081	valid_set's rmse: 53.0319
[4000]	train_set's rmse: 34.7684	valid_set's rmse: 52.9668
[5000]	train_set's rmse: 34.7547	valid_set's rmse: 52.9367
[6000]	train_set's rmse: 34.7498	valid_set's rmse: 52.9131
[7000]	train_set's rmse: 34.7481	valid_set's rmse: 52.8995
[8000]	train_set's rmse: 34.7475	valid_set's rmse: 52.8917
[9000]	train_set's rmse: 34.7473	valid_set's rmse: 52.8866
[10000]	train_set's rmse: 34.7472	valid_set's rmse: 52.8838


	-52.8838	 = Validation root_mean_squared_error score
	597.78s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 2914.39s of the 2914.39s of remaining time.
	-29.9668	 = Validation root_mean_squared_error score
	0.88s	 = Training runtime
	0.14s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 2913.36s of the 2913.36s of remaining time.
	-28.546	 = Validation root_mean_squared_error score
	0.38s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 2912.97s of the 2912.96s of remaining time.
	-32.1744	 = Validation root_mean_squared_error score
	0.93s	 = Training runtime
	0.15s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 2911.88s of the 2911.87s of remaining time.
	-285.6674	 = Validation root_mean_squared_error score
	5.18s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ... Training model

[1000]	train_set's rmse: 24.6863	valid_set's rmse: 39.2697
[2000]	train_set's rmse: 24.6356	valid_set's rmse: 39.1745


	-39.1741	 = Validation root_mean_squared_error score
	555.23s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 2296.93s of remaining time.
	-27.775	 = Validation root_mean_squared_error score
	0.43s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1303.53s ...
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("airline_passengers_ms/")


*** Summary of fit() ***
Estimated performance of each model:
                 model   score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  WeightedEnsemble_L2  -27.774987       0.263192    8.049107                0.000599           0.432174            2       True         10
1             CatBoost  -28.545953       0.005719    0.384898                0.005719           0.384898            1       True          4
2      RandomForestMSE  -29.966824       0.138103    0.875215                0.138103           0.875215            1       True          3
3              XGBoost  -30.152825       0.019234   47.371173                0.019234          47.371173            1       True          7
4        ExtraTreesMSE  -32.174355       0.148063    0.926887                0.148063           0.926887            1       True          5
5        LightGBMLarge  -39.174076       0.012766  555.234303                0.012766         555.

In [3]:
# Inference time:
# Read the test CSV (comes back as a pandas DataFrame) and set its labels aside.
test_data = TabularDataset('../../modified/airline-passengers-test.csv')
y_test = test_data[label]
# Strip the label column, since real inference inputs would not carry it.
test_data = test_data.drop(columns=[label])
print(test_data.head())



Loaded data from: ../../modified/airline-passengers-test.csv | Columns = 2 / 2 | Rows = 29 -> 29


         0
0  1953-01
1  1956-01
2  1958-08
3  1955-07
4  1957-07


In [4]:
# Reload the predictor from disk. Not required within this session; done only to
# demonstrate how a previously trained predictor is restored from its save directory.
predictor = TabularPredictor.load(save_path)

y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_pred=y_pred,
    y_true=y_test,
    auxiliary_metrics=True,
)


Evaluation: root_mean_squared_error on test data: -50.145914936318725
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -50.145914936318725,
    "mean_squared_error": -2514.612784800513,
    "mean_absolute_error": -38.782122907967405,
    "r2": 0.8404053495103291,
    "pearsonr": 0.9388929239610186,
    "median_absolute_error": -28.656829833984375
}


In [5]:
# The task was treated as regression, so `perf` holds regression metrics and has no
# "accuracy" key (looking it up raised KeyError). AutoGluon reports scores in
# higher-is-better form, so RMSE is stored negated; flip the sign to get the usual
# non-negative value.
print('RMSE = {:.2f}'.format(-perf["root_mean_squared_error"]))
print('R^2 = {:.2f}'.format(perf["r2"]))

In [None]:
# airline-passengers  (kept as a comment: as bare code this evaluates `airline - passengers`
# and raises NameError when the notebook is run top to bottom)