In [5]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training-time setup.
label = 'target'  # name of the column the predictor should learn to predict
save_path = 'sentiment_analysis_all_instances/'  # directory where trained models are written

# Read the labelled CSV from local disk; per the original note, TabularDataset hands back a pandas DataFrame.
data = TabularDataset('./datasets/nlp-getting-started/train.csv')
# data = data.head(500)  # optional: subsample for a faster demo
print(data.head(n=5))



   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [9]:
# Total number of rows in the loaded table; used to compute the train/test split point.
rows = data.shape[0]

In [10]:
# Training split: the first 80% of rows, selected by position.
train_data = data.iloc[:int(rows * 0.8)]

In [11]:
# Hold-out split: the remaining 20% of rows, selected by position.
# NOTE(review): the split is positional with no shuffling, and the hold-out rows displayed
# later all share the same keyword, so the rows look grouped by keyword. A shuffled or
# stratified split may be more representative - TODO confirm.
test_data = data.iloc[int(rows * 0.8):]

In [12]:
# Show the last five hold-out rows as a quick sanity check of the split.
test_data.tail(5)

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [13]:
# Fit on the training split with the 'best_quality' preset, capping fitting at 3600 seconds.
# No eval_metric is passed here; to optimise a specific metric, add
# eval_metric=YOUR_METRIC_NAME to the TabularPredictor(...) call.
predictor = (
    TabularPredictor(path=save_path, label=label)
    .fit(train_data, presets='best_quality', time_limit=3600)
)
# Print the post-fit summary and keep what fit_summary() returns.
results = predictor.fit_summary()


Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "sentiment_analysis/"
AutoGluon Version:  0.2.0
Train Data Rows:    6090
Train Data Columns: 4
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    430915.68 MB
	Train Data (Original)  Memory Usage: 1.74 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitt

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           CatBoost_BAG_L2   0.804926       4.213077  2420.577271                0.467541          28.306417            2       True         15
1       WeightedEnsemble_L3   0.804926       4.222094  2422.437293                0.009017           1.860021            3       True         19
2            XGBoost_BAG_L2   0.802135       4.045090  2411.830435                0.299554          19.559580            2       True         18
3         LightGBMXT_BAG_L2   0.800657       3.977075  3514.814481                0.231538        1122.543626            2       True         12
4       WeightedEnsemble_L2   0.798686       2.921751  2392.569174                0.009828           1.752869            2       True          9
5         LightGBMXT_BAG_L1   0.798522       0.224371  2082.322057  

In [14]:
# Inference time:
# The hold-out rows split off from the labelled CSV serve as the test set
# (no separate test.csv is loaded here).
if label in test_data.columns:
    # Keep the true labels for scoring, then remove them from the features,
    # since labels would not be available at prediction time in practice.
    y_test = test_data[label]
    test_data = test_data.drop(columns=[label])
elif 'y_test' not in globals():
    # The label column is already gone and was never captured: fail the same way
    # the direct column lookup would have.
    raise KeyError(label)
# When the label was already removed by an earlier run of this cell, y_test and
# test_data are left as they are, so re-running the cell is safe.
print(test_data.head())



        id  keyword                   location  \
6090  8697  sinking  North East Unsigned Radio   
6091  8698  sinking   Every Where in the World   
6092  8699  sinking                    Memphis   
6093  8700  sinking                        NaN   
6094  8702  sinking                        NaN   

                                                                                                                                           text  
6090                                      #nowplaying Sinking Fast - Now or Never on North East Unsigned Radio listen at http://t.co/QymAlttvZp  
6091          that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time  
6092                                                     Nigga car sinking but he snapping it up for fox 13. #priorities http://t.co/9StLKH59Fb  
6093  @abandonedpics You should delete this one it's not an abbandoned nor sinking. ThatÛªs the darsena of the Cas

In [15]:
# Reload the trained predictor from save_path. This is not needed here; it only
# demonstrates how a previously trained predictor is restored from disk.
predictor = TabularPredictor.load(save_path)

# Predict labels for the hold-out features.
y_pred = predictor.predict(test_data)

# Score the predictions against the held-out true labels, including auxiliary metrics.
perf = predictor.evaluate_predictions(
    auxiliary_metrics=True,
    y_pred=y_pred,
    y_true=y_test,
)


Evaluation: accuracy on test data: 0.7334208798424163
Evaluations on test data:
{
    "accuracy": 0.7334208798424163,
    "balanced_accuracy": 0.7211406174734807,
    "mcc": 0.47845757438928815,
    "f1": 0.6547619047619048,
    "precision": 0.8244111349036403,
    "recall": 0.5430183356840621
}


In [16]:
# Report hold-out accuracy as a percentage with two decimal places.
print(f'Accuracy = {perf["accuracy"] * 100:.2f}%')

Accuracy = 73.34%


In [6]:
# Show the last five rows of test_data.
# NOTE(review): this cell's execution count is out of sequence with the cells before it,
# and its recorded output shows a table with different columns from the one loaded in
# this notebook. It looks like a stale leftover - re-run or remove. TODO confirm.
test_data.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
9764,30,Private,151868,9th,5,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
9765,32,State-gov,104509,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,25,United-States
9766,22,Private,187592,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,30,United-States
9767,32,Private,49539,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,3674,0,40,United-States
9768,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States
