In [4]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
label = 'class'  # name of the column the model should learn to predict
save_path = 'spam_message_ms/'  # directory in which trained models are stored

# Read the local training CSV (returns a Pandas DataFrame) and keep only the
# first 500 rows so that the demo trains faster.
train_data = TabularDataset('./datasets/spam-message-train.csv').head(500)
print(train_data.head())



  class                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
# Train with the 'best_quality' preset, capped at one hour (time_limit is in seconds).
# NOTE: these settings favour predictive accuracy over runtime. For a quicker demo,
# lower time_limit or omit presets. To optimise a specific metric, additionally pass
# eval_metric=YOUR_METRIC_NAME to TabularPredictor.
predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data,
    time_limit=3600,
    presets='best_quality',
)
results = predictor.fit_summary()  # prints the per-model summary shown below


Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "spam_message_ms/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 1
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['ham', 'spam']
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = spam, class 0 = ham
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (spam) vs negative (ham) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:  

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      LightGBMLarge_BAG_L1      0.982       0.072572  1011.694583                0.072572        1011.694583            1       True         11
1       WeightedEnsemble_L2      0.982       0.073705  1012.196015                0.001133           0.501431            2       True         12
2           CatBoost_BAG_L1      0.980       0.038029     2.526551                0.038029           2.526551            1       True          5
3         LightGBMXT_BAG_L1      0.980       0.072485   277.274959                0.072485         277.274959            1       True          1
4           LightGBM_BAG_L1      0.978       0.069327   308.118771                0.069327         308.118771            1       True          2
5            XGBoost_BAG_L1      0.978       0.071179    67.689072  

In [6]:
# Inference time:
test_data = TabularDataset('./datasets/spam-message-test.csv')  # held-out messages, another Pandas DataFrame
y_test = test_data[label]  # keep the true labels aside for evaluation
# Remove the label column from the features, since it would not be available in practice.
test_data = test_data.drop(columns=[label])
print(test_data.head())



Loaded data from: ./datasets/spam-message-test.csv | Columns = 2 / 2 | Rows = 1572 -> 1572


                                                                                                                                                          Message
0                                                                                                                               K...k...when will you give treat?
1  This is the 2nd time we have tried to contact u. U have won the £400 prize. 2 claim is easy, just call 087104711148 NOW! Only 10p per minute. BT-national-rate
2                                                                                      He's just gonna worry for nothing. And he won't give you money its no use.
3                                                                                                    Did you get any gift? This year i didnt get anything. So bad
4                              somewhere out there beneath the pale moon light someone think in of u some where out there where dreams come true... goodnite &amp


In [7]:
# Reloading is not required here; it only demonstrates how a previously
# trained predictor is restored from its save directory.
predictor = TabularPredictor.load(save_path)

# Predict labels for the unlabeled test rows, then score them against the
# held-out ground truth (auxiliary_metrics=True reports more than one metric).
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)


Evaluation: accuracy on test data: 0.9720101781170484
Evaluations on test data:
{
    "accuracy": 0.9720101781170484,
    "balanced_accuracy": 0.9165086175626238,
    "mcc": 0.8766203031349709,
    "f1": 0.8905472636815921,
    "precision": 0.9470899470899471,
    "recall": 0.8403755868544601
}


In [5]:
# Report test accuracy as a percentage with two decimal places.
print(f'Accuracy = {perf["accuracy"] * 100:.2f}%')

Accuracy = 97.20%


In [6]:
# Show the last five rows of the label-free test frame.
test_data.tail(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
9764,30,Private,151868,9th,5,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
9765,32,State-gov,104509,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,25,United-States
9766,22,Private,187592,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,30,United-States
9767,32,Private,49539,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,3674,0,40,United-States
9768,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States
