In [1]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
label = 'class'  # name of the column the predictor is asked to predict
save_path = 'auto_imports_ms/'  # directory handed to the predictor for saving trained models

# Load the training CSV from a local path and keep only the first 500 rows
# so the demo trains quickly.
train_data = TabularDataset('../../modified/auto_imports-train.csv').head(500)
print(train_data.head())



    0        1       2             3   4                   5  \
0  38  Private  181705     Assoc-voc  11  Married-civ-spouse   
1  17  Private  121425          11th   7       Never-married   
2  25  Private  460322     Bachelors  13       Never-married   
3  25  Private  161007       HS-grad   9       Never-married   
4  35  Private  204163  Some-college  10            Divorced   

                   6          7      8       9  10  11  12             13  \
0    Exec-managerial    Husband  White    Male   0   0  40  United-States   
1       Adm-clerical  Own-child  White  Female   0   0  16  United-States   
2      Other-service  Own-child  White    Male   0   0  43  United-States   
3  Machine-op-inspct  Own-child  White    Male   0   0  40  United-States   
4  Machine-op-inspct  Unmarried  Black  Female   0   0  55  United-States   

   class  
0  <=50K  
1  <=50K  
2  <=50K  
3  <=50K  
4  <=50K  


In [2]:
# Construct the predictor, then train it on the subsample with time_limit=3600.
predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(train_data, time_limit=3600)
# NOTE: The default settings above trade accuracy for a reasonable runtime.
# To maximize predictive accuracy, use this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')

# Summarize the fit and keep the returned summary for later inspection.
results = predictor.fit_summary()


Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "adult_ms/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 14
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['<=50K', '>50K']
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = >50K, class 0 = <=50K
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (>50K) vs negative (<=50K) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    520855.93 MB
	Trai

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         LightGBMLarge       0.89       0.026200  267.580235                0.026200         267.580235            1       True         13
1   WeightedEnsemble_L2       0.89       0.026847  268.036947                0.000647           0.456712            2       True         14
2               XGBoost       0.86       0.020559   27.883711                0.020559          27.883711            1       True         11
3              LightGBM       0.86       0.044000   65.081026                0.044000          65.081026            1       True          4
4              CatBoost       0.85       0.011787    1.142910                0.011787           1.142910            1       True          7
5            LightGBMXT       0.83       0.030439   29.710629                0.030439          29.

In [3]:
# Inference time:
# Load the held-out CSV, set the label column aside as ground truth, and
# remove it from the features because labels would not be available in practice.
test_data = TabularDataset('../../modified/auto_imports-test.csv')
y_test = test_data[label]
test_data = test_data.drop(columns=[label])
print(test_data.head())



Loaded data from: ../../modified/adult-all-test.csv | Columns = 15 / 15 | Rows = 9769 -> 9769


    0            1       2             3   4                   5  \
0  42      Private  350550  Some-college  10  Married-civ-spouse   
1  24      Private  163053          11th   7       Never-married   
2  49  Federal-gov   61885     Bachelors  13  Married-civ-spouse   
3  21      Private  391312       HS-grad   9       Never-married   
4  43      Private  133584  Some-college  10  Married-civ-spouse   

                   6              7      8       9  10  11  12             13  
0  Machine-op-inspct        Husband  White    Male   0   0  45  United-States  
1              Sales  Not-in-family  White  Female   0   0  36  United-States  
2    Exec-managerial        Husband  White    Male   0   0  45  United-States  
3      Other-service  Not-in-family  Black  Female   0   0  30  United-States  
4  Machine-op-inspct        Husband  White    Male   0   0  40  United-States  


In [4]:
# Reloading is not required here; it only demonstrates how a previously
# trained predictor is loaded back from save_path.
predictor = TabularPredictor.load(save_path)

# Predict on the label-free test features, then score against the held-out labels.
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)


Evaluation: accuracy on test data: 0.8177909714402702
Evaluations on test data:
{
    "accuracy": 0.8177909714402702,
    "balanced_accuracy": 0.7089894016101054,
    "mcc": 0.46550692313097847,
    "f1": 0.5700483091787439,
    "precision": 0.667420814479638,
    "recall": 0.4974704890387858
}


In [5]:
# Report the test accuracy from the evaluation dict as a percentage with two decimals.
print('Accuracy = {:.2f}%'.format(100 * perf["accuracy"]))

Accuracy = 81.78%


In [None]:
# NOTE(review): leftover scratch text "airline-passengers". As a bare expression it
# parses as `airline - passengers` and raises NameError under Restart & Run All,
# so it is kept only as a comment. Remove this cell if the note is no longer needed.