In [1]:
#!pip install tpot

In [2]:
#!pip install ipywidgets jupyter nbextension enable --py widgetsnbextension

In [3]:
from tpot import TPOTClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
dataset = pd.read_csv('dataset.csv')

In [5]:
standardizer = StandardScaler()

### Using all columns

In [6]:
X = dataset.drop(axis=1, columns=['Y'])
y = dataset['Y']

In [7]:
X_std = standardizer.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8878, 64), (3806, 64), (8878,), (3806,))

In [9]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=30)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


31.94 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=6, max_features=0.9000000000000001, min_samples_leaf=16, min_samples_split=6, n_estimators=100, subsample=1.0)
0.755386232264845


In [10]:
tpot.fitted_pipeline_

Pipeline(steps=[('gradientboostingclassifier',
                 GradientBoostingClassifier(learning_rate=0.5, max_depth=6,
                                            max_features=0.9000000000000001,
                                            min_samples_leaf=16,
                                            min_samples_split=6))])

In [11]:
print(tpot.score(X_test, y_test))

0.755386232264845


In [12]:
tpot.export('tpot_coupon_dataset_all_columns.py')

### Using columns based on feature importance - with demographics data

In part-1 of this course, feature importance was selected based on RFE, LASSO and Random Forest.

Best results were obtained with feature from Random Forest. Below is the list of best columns based on Random Forest feature importance.

In [13]:
imp_features = ['CoffeeHouse_ord','age','income_lb','income_ub','Bar_ord','education','time','CarryAway_ord','Rest20To50_ord','RestLT20_ord','expiration','coupon_Carry out & Take away','temperature','coupon_Restaurant(<20)','coupon_Bar','coupon_Coffee House','toCoupon_GEQ15min','gender','has_children','maritalStatus_Single','maritalStatus_Married partner','occupation_Unemployed','coupon_Restaurant(20-50)','passanger_Friend(s)']

In [14]:
X = dataset[imp_features]
y = dataset['Y']

In [15]:
X

Unnamed: 0,CoffeeHouse_ord,age,income_lb,income_ub,Bar_ord,education,time,CarryAway_ord,Rest20To50_ord,RestLT20_ord,...,coupon_Bar,coupon_Coffee House,toCoupon_GEQ15min,gender,has_children,maritalStatus_Single,maritalStatus_Married partner,occupation_Unemployed,coupon_Restaurant(20-50),passanger_Friend(s)
0,0.0,21,37500,49999,0.0,3,14,2.0,2.0,3.0,...,0,0,0,1,1,0,0,1,0,0
1,0.0,21,37500,49999,0.0,3,10,2.0,2.0,3.0,...,0,1,0,1,1,0,0,1,0,1
2,0.0,21,37500,49999,0.0,3,10,2.0,2.0,3.0,...,0,0,1,1,1,0,0,1,0,1
3,0.0,21,37500,49999,0.0,3,14,2.0,2.0,3.0,...,0,1,1,1,1,0,0,1,0,1
4,0.0,21,37500,49999,0.0,3,14,2.0,2.0,3.0,...,0,1,1,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,0.0,26,75000,87499,0.0,5,18,2.0,2.0,3.0,...,0,0,0,0,0,1,0,0,0,0
12680,0.0,26,75000,87499,0.0,5,7,2.0,2.0,3.0,...,0,0,0,0,0,1,0,0,0,0
12681,0.0,26,75000,87499,0.0,5,7,2.0,2.0,3.0,...,0,1,0,0,0,1,0,0,0,0
12682,0.0,26,75000,87499,0.0,5,7,2.0,2.0,3.0,...,1,0,1,0,0,1,0,0,0,0


In [16]:
X_std = standardizer.fit_transform(X)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8878, 24), (3806, 24), (8878,), (3806,))

In [18]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=30)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


30.23 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestClassifier(MaxAbsScaler(input_matrix), bootstrap=False, criterion=entropy, max_features=0.25, min_samples_leaf=2, min_samples_split=7, n_estimators=100)
0.7574881765633211


In [19]:
tpot.fitted_pipeline_

Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False, criterion='entropy',
                                        max_features=0.25, min_samples_leaf=2,
                                        min_samples_split=7))])

In [20]:
print(tpot.score(X_test, y_test))

0.7574881765633211


In [21]:
tpot.export('tpot_coupon_dataset_feature_imp_demographics.py')

### Using columns based on feature importance - without demographics data

In [25]:
imp_features = ['CoffeeHouse_ord', 'Bar_ord', 'CarryAway_ord', 'RestLT20_ord', 'Rest20To50_ord', 'time', 'expiration', 'coupon_Carry out & Take away', 'temperature', 'toCoupon_GEQ15min', 'coupon_Restaurant(<20)', 'coupon_Bar','coupon_Coffee House','destination_No Urgent Place', 'coupon_Restaurant(20-50)', 'weather_Sunny', 'toCoupon_GEQ25min', 'destination_Home', 'direction_same', 'direction_opp', 'destination_Work', 'weather_Rainy', 'weather_Snowy']

In [26]:
X = dataset[imp_features]
y = dataset['Y']

In [27]:
X

Unnamed: 0,CoffeeHouse_ord,Bar_ord,CarryAway_ord,RestLT20_ord,Rest20To50_ord,time,expiration,coupon_Carry out & Take away,temperature,toCoupon_GEQ15min,...,destination_No Urgent Place,coupon_Restaurant(20-50),weather_Sunny,toCoupon_GEQ25min,destination_Home,direction_same,direction_opp,destination_Work,weather_Rainy,weather_Snowy
0,0.0,0.0,2.0,3.0,2.0,14,24,0,55,0,...,1,0,1,0,0,0,1,0,0,0
1,0.0,0.0,2.0,3.0,2.0,10,2,0,80,0,...,1,0,1,0,0,0,1,0,0,0
2,0.0,0.0,2.0,3.0,2.0,10,2,1,80,1,...,1,0,1,0,0,0,1,0,0,0
3,0.0,0.0,2.0,3.0,2.0,14,2,0,80,1,...,1,0,1,0,0,0,1,0,0,0
4,0.0,0.0,2.0,3.0,2.0,14,24,0,80,1,...,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,0.0,0.0,2.0,3.0,2.0,18,24,1,55,0,...,0,0,0,0,1,1,0,0,1,0
12680,0.0,0.0,2.0,3.0,2.0,7,24,1,55,0,...,0,0,0,0,0,0,1,1,1,0
12681,0.0,0.0,2.0,3.0,2.0,7,24,0,30,0,...,0,0,0,0,0,1,0,1,0,1
12682,0.0,0.0,2.0,3.0,2.0,7,24,0,30,1,...,0,0,0,1,0,0,1,1,0,1


In [28]:
X_std = standardizer.fit_transform(X)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8878, 23), (3806, 23), (8878,), (3806,))

In [30]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=30)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


30.28 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.4, verbosity=0)
0.7341040462427746


In [31]:
tpot.fitted_pipeline_

Pipeline(steps=[('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=6,
                               min_child_weight=4, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=1, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=0.4,
                               tree_method='exact', validate_parameters=1,
                               verbosity=0))])

In [32]:
print(tpot.score(X_test, y_test))

0.7341040462427746


In [33]:
tpot.export('tpot_coupon_dataset_feature_imp.py')