In [1]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
train_data = TabularDataset('./datasets/Womens-clothing-E-CommerceReviews-train.csv')  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label = 'class'  # specifies which column do we want to predict
save_path = 'adult_bsq_ms/'  # where to save trained models



   Clothing ID  Age                    Title  \
0          767   33                      NaN   
1         1080   34                      NaN   
2         1077   60  Some major design flaws   
3         1049   50         My favorite buy!   
4          847   47         Flattering shirt   

                                         Review Text Rating Recommended IND  \
0  Absolutely wonderful - silky and sexy and comf...      4               1   
1  Love this dress!  it's sooo pretty.  i happene...      5               1   
2  I had such high hopes for this dress and reall...      3               0   
3  I love, love, love this jumpsuit. it's fun, fl...      5               1   
4  This shirt is very flattering to all due to th...      5               1   

  Positive Feedback Count   Division Name Department Name      class  
0                       0       Initmates        Intimate  Intimates  
1                       4         General         Dresses    Dresses  
2                      

In [2]:
predictor = TabularPredictor(label=label, path=save_path).fit(train_data, time_limit=3600, presets='best_quality')
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary()


Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "adult_bsq_ms/"
AutoGluon Version:  0.2.0
Train Data Rows:    500
Train Data Columns: 9
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	First 10 (of 18) unique label values:  ['Intimates', 'Dresses', 'Pants', 'Blouses', 'Knits', 'Outerwear', 'Lounge', 'Sweaters', 'Skirts', 'Fine gauge']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 4 examples that will be kept for training models: 0.982
Train Data Class Count: 13
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    492244.19 MB
	Train Data (Original)  Memory Usage: 0.37 MB (0.0% of 

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.985743       0.364204   415.281328                0.000572           0.528380            2       True         14
1            XGBoost_BAG_L1   0.977597       0.251536   414.747516                0.251536         414.747516            1       True         11
2     KNeighborsDist_BAG_L1   0.969450       0.112095     0.005432                0.112095           0.005432            1       True          2
3           LightGBM_BAG_L1   0.963340       0.248578   207.083251                0.248578         207.083251            1       True          5
4     KNeighborsUnif_BAG_L1   0.961303       0.114821     0.006247                0.114821           0.006247            1       True          1
5           CatBoost_BAG_L1   0.961303       0.399945    86.722846  

In [4]:
# Inference time:
test_data = TabularDataset('./datasets/Womens-clothing-E-CommerceReviews-test.csv')  # another Pandas DataFrame
y_test = test_data[label]
test_data = test_data.drop(labels=[label], axis=1)  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())



Loaded data from: ./datasets/Womens-clothing-E-CommerceReviews-test.csv | Columns = 10 / 10 | Rows = 1689 -> 1689


   Clothing ID  Age                   Title  \
0          863   65             Thin & soft   
1          863   42            Poor quality   
2         1024   34               Run small   
3          831   48  Super cute peasant top   
4         1121   29         A wrinkled mess   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Review Text  \
0                                                                                                                                                The knit is thin.  it's more for a chilly spring or warm fall d

In [5]:
predictor = TabularPredictor.load(save_path)  # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)


ValueError: Labels cannot contain missing (nan) values. Found 3 missing label values.

In [5]:
print('Accuracy = {:.2f}%'.format(perf["accuracy"] * 100))

Accuracy = 83.49%


In [6]:
test_data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
9764,30,Private,151868,9th,5,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
9765,32,State-gov,104509,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,25,United-States
9766,22,Private,187592,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,30,United-States
9767,32,Private,49539,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,3674,0,40,United-States
9768,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States
