In [2]:
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
label = 'target'  # name of the column the models learn to predict
save_path = 'sentiment_analysis/'  # directory where trained models are written

# Read the local training CSV (TabularDataset behaves like a pandas DataFrame)
# and keep only the first 500 rows so the demo runs quickly.
train_data = TabularDataset('./datasets/nlp-getting-started/train.csv').head(500)
print(train_data.head())



   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [4]:
# Shuffle every row (frac=1) with a fixed seed so the row order is reproducible;
# the shuffled frame is what later cells slice into train and test parts.
df_split = train_data.sample(
    frac=1,
    random_state=200,
)
# Display only: reset_index() returns a new frame and the result is not assigned,
# so df_split itself keeps its original row labels.
df_split.reset_index()

Unnamed: 0,index,id,keyword,location,text,target
0,157,225,airplane%20accident,"Lehigh Valley, PA",Strict liability in the context of an airplane...,1
1,88,130,accident,"Manchester, NH",Accident left lane blocked in #Manchester on R...,1
2,383,552,arson,"Charlotte, NC",Add Familia to the arson squad.,0
3,125,180,aftershock,304,Sometimes you face difficulties not because yo...,0
4,445,644,arsonist,,Big Top Burning The True Story Of An Arsonist ...,1
...,...,...,...,...,...,...
495,298,438,apocalypse,Las Vegas,I know where to go when the zombies take over!...,0
496,324,470,armageddon,Nowhere. Everywhere.,@RohnertParkDPS You're another one for the his...,0
497,272,396,apocalypse,ColoRADo,I'm gonna fight Taylor as soon as I get there.,0
498,105,153,aftershock,304,'There is no victory at bargain basement price...,0


In [5]:
# Training split: the first 400 rows of the shuffled frame (this rebinds train_data).
train_data = df_split.iloc[:400]

In [6]:
# Show the last five rows of the training split.
train_data.tail(5)


Unnamed: 0,id,keyword,location,text,target
337,484,armageddon,,#Christians United for #Israel (#CUFI): Jews s...,0
477,687,attack,,Heart disease prevention: What about secondhan...,0
234,334,annihilated,,@TomcatArts thus explaining why you were all a...,1
220,313,annihilated,,Cop pulls drunk driver to safety SECONDS befor...,1
382,551,arson,USA,Thousands attend a rally organized by Peace No...,1


In [8]:
# Hold-out split: every shuffled row from position 400 onwards.
test_data = df_split.iloc[400:]

In [9]:
# Show the last five rows of the hold-out split.
test_data.tail(5)

Unnamed: 0,id,keyword,location,text,target
298,438,apocalypse,Las Vegas,I know where to go when the zombies take over!...,0
324,470,armageddon,Nowhere. Everywhere.,@RohnertParkDPS You're another one for the his...,0
272,396,apocalypse,ColoRADo,I'm gonna fight Taylor as soon as I get there.,0
105,153,aftershock,304,'There is no victory at bargain basement price...,0
282,412,apocalypse,Oakland,Julie + R is the apocalypse version of Romeo +...,0


In [10]:
# Train with the 'best_quality' preset under a time limit of 3600 (reported as
# seconds in the fit log); models are saved under save_path.
predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(
    train_data,
    time_limit=3600,
    presets='best_quality',
)
# NOTE: no eval_metric is passed above; to optimise a specific metric, construct the
# predictor with TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).
results = predictor.fit_summary()


Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "sentiment_analysis/"
AutoGluon Version:  0.2.0
Train Data Rows:    400
Train Data Columns: 4
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    491054.64 MB
	Train Data (Original)  Memory Usage: 0.12 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitti

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2     0.8075       0.214711     9.916116                0.001120           0.581162            2       True         14
1           CatBoost_BAG_L1     0.8050       0.097949     9.329972                0.097949           9.329972            1       True          7
2      LightGBMLarge_BAG_L1     0.7725       0.166701  1586.481089                0.166701        1586.481089            1       True         13
3         LightGBMXT_BAG_L1     0.7650       0.170517   392.634122                0.170517         392.634122            1       True          3
4           LightGBM_BAG_L1     0.7600       0.160803   368.991084                0.160803         368.991084            1       True          4
5   RandomForestGini_BAG_L1     0.7500       0.114062     1.089460  

In [11]:
# Inference time:
# test_data = TabularDataset('./datasets/nlp-getting-started/test.csv')  # another Pandas DataFrame
# Split the labels off only while they are still present: this cell rebinds
# test_data without the label column, so an unguarded second run would raise
# KeyError on test_data[label]. With the guard the cell can be re-run safely.
if label in test_data.columns:
    y_test = test_data[label]  # ground-truth labels kept aside for evaluation
    test_data = test_data.drop(columns=[label])  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())



      id              keyword              location  \
155  221  airplane%20accident    Not a U.S resident   
293  429           apocalypse         Harlingen, TX   
38    56               ablaze                   NaN   
82   119             accident                   NaN   
173  248            ambulance  New York / Worldwide   

                                                                                                                                             text  
155                                                                        Usama bin Ladins family dead in airplane crash. Naturally no accident.  
293                                                            My niece just asked me 'would you be scared if there was an apocalypse here?' ????  
38   Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J  
82                                                               Can wait to 

In [12]:
# Reloading is not required here; it only demonstrates how to restore a
# previously-trained predictor from the directory it was saved to.
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data)
# Score the predictions against the held-out labels; auxiliary_metrics=True
# reports additional metrics alongside accuracy.
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)


Evaluation: accuracy on test data: 0.81
Evaluations on test data:
{
    "accuracy": 0.81,
    "balanced_accuracy": 0.8087113608992373,
    "mcc": 0.6182920693179746,
    "f1": 0.795698924731183,
    "precision": 0.8043478260869565,
    "recall": 0.7872340425531915
}


In [5]:
# Report accuracy as a percentage with two decimals.
# NOTE(review): the saved output of this cell (83.49%) does not match the 0.81
# accuracy logged by the evaluation cell -- it looks stale; re-run to refresh.
print(
    'Accuracy = {:.2f}%'.format(100 * perf["accuracy"])
)

Accuracy = 83.49%


In [6]:
# Show the last five rows of the (label-free) test frame.
# NOTE(review): the saved output of this cell shows unrelated columns 0-13, so it
# appears to come from a different dataset/run; re-run to refresh.
test_data.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
9764,30,Private,151868,9th,5,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
9765,32,State-gov,104509,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,25,United-States
9766,22,Private,187592,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,30,United-States
9767,32,Private,49539,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,3674,0,40,United-States
9768,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States
