In [1]:
#!pip install hyperopt

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
dataset = pd.read_csv('dataset.csv')

In [4]:
standardizer = StandardScaler()

## Using columns based on feature importance - with demographics data

In part-1 of this course, feature importance was selected based on RFE, LASSO and Random Forest.

Best results were obtained with feature from Random Forest. Below is the list of best columns based on Random Forest feature importance.

In [5]:
imp_features = ['CoffeeHouse_ord','age','income_lb','income_ub','Bar_ord','education','time','CarryAway_ord','Rest20To50_ord','RestLT20_ord','expiration','coupon_Carry out & Take away','temperature','coupon_Restaurant(<20)','coupon_Bar','coupon_Coffee House','toCoupon_GEQ15min','gender','has_children','maritalStatus_Single','maritalStatus_Married partner','occupation_Unemployed','coupon_Restaurant(20-50)','passanger_Friend(s)']

In [6]:
X = dataset[imp_features]
y = dataset['Y']

In [7]:
X

Unnamed: 0,CoffeeHouse_ord,age,income_lb,income_ub,Bar_ord,education,time,CarryAway_ord,Rest20To50_ord,RestLT20_ord,...,coupon_Bar,coupon_Coffee House,toCoupon_GEQ15min,gender,has_children,maritalStatus_Single,maritalStatus_Married partner,occupation_Unemployed,coupon_Restaurant(20-50),passanger_Friend(s)
0,0.0,21,37500,49999,0.0,3,14,2.0,2.0,3.0,...,0,0,0,1,1,0,0,1,0,0
1,0.0,21,37500,49999,0.0,3,10,2.0,2.0,3.0,...,0,1,0,1,1,0,0,1,0,1
2,0.0,21,37500,49999,0.0,3,10,2.0,2.0,3.0,...,0,0,1,1,1,0,0,1,0,1
3,0.0,21,37500,49999,0.0,3,14,2.0,2.0,3.0,...,0,1,1,1,1,0,0,1,0,1
4,0.0,21,37500,49999,0.0,3,14,2.0,2.0,3.0,...,0,1,1,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,0.0,26,75000,87499,0.0,5,18,2.0,2.0,3.0,...,0,0,0,0,0,1,0,0,0,0
12680,0.0,26,75000,87499,0.0,5,7,2.0,2.0,3.0,...,0,0,0,0,0,1,0,0,0,0
12681,0.0,26,75000,87499,0.0,5,7,2.0,2.0,3.0,...,0,1,0,0,0,1,0,0,0,0
12682,0.0,26,75000,87499,0.0,5,7,2.0,2.0,3.0,...,1,0,1,0,0,1,0,0,0,0


In [8]:
X_std = standardizer.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8878, 24), (3806, 24), (8878,), (3806,))

### Bayesian Optimization

In [10]:
#Space set close to values estimated by TPOT classifier
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'bootstrap': hp.choice('bootstrap', [True, False]),
        'max_features': hp.choice('max_features', [0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.choice ('min_samples_split', [5, 6, 7, 8, 9]),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750])
    }
space

{'criterion': <hyperopt.pyll.base.Apply at 0x1f0c021ba30>,
 'bootstrap': <hyperopt.pyll.base.Apply at 0x1f0c021bb80>,
 'max_features': <hyperopt.pyll.base.Apply at 0x1f0c021bcd0>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x1f0c021bf70>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x1f0c021a0d0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x1f0c021a2e0>}

In [11]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], bootstrap = space['bootstrap'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [12]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|███████████████████████████████████████████████| 80/80 [22:27<00:00, 16.84s/trial, best loss: -0.7414965105951022]


{'bootstrap': 1,
 'criterion': 1,
 'max_features': 3,
 'min_samples_leaf': 0.0003611725493302344,
 'min_samples_split': 0,
 'n_estimators': 2}

In [13]:
crit = {0: 'entropy', 1: 'gini'}
bootstrap = {0: True, 1: False}
feat = {0: 0.15, 1: 0.20, 2: 0.25, 3: 0.3, 4: 0.35, 5: 0.4, 6: 0.45, 7: 0.5}
split = {0: 5, 1: 6, 2: 7, 3: 8, 4: 9}
est = {0: 10, 1: 50, 2: 300, 3: 750}
#leaf = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}


print("criterion:",crit[best['criterion']])
print("bootstrap:",bootstrap[best['bootstrap']])
print("max_features:",feat[best['max_features']])
print("n_estimators:",est[best['n_estimators']])
print("min_samples_split:",split[best['min_samples_split']])
print("min_samples_leaf:",best['min_samples_leaf'])

criterion: gini
bootstrap: False
max_features: 0.3
n_estimators: 300
min_samples_split: 5
min_samples_leaf: 0.0003611725493302344


In [14]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], bootstrap = bootstrap[best['bootstrap']], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = split[best['min_samples_split']], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)

In [15]:
predictionforest = trainedforest.predict(X_test)

print("Accuracy:",accuracy_score(y_test,predictionforest)) 
print("F1:",f1_score(y_test,predictionforest))

Accuracy: 0.7524960588544404
F1: 0.7928759894459102


In [16]:
print(classification_report(y_test,predictionforest))

              precision    recall  f1-score   support

           0       0.74      0.65      0.69      1622
           1       0.76      0.83      0.79      2184

    accuracy                           0.75      3806
   macro avg       0.75      0.74      0.74      3806
weighted avg       0.75      0.75      0.75      3806



## Using columns based on feature importance - without demographics data

In [17]:
imp_features = ['CoffeeHouse_ord', 'Bar_ord', 'CarryAway_ord', 'RestLT20_ord', 'Rest20To50_ord', 'time', 'expiration', 'coupon_Carry out & Take away', 'temperature', 'toCoupon_GEQ15min', 'coupon_Restaurant(<20)', 'coupon_Bar','coupon_Coffee House','destination_No Urgent Place', 'coupon_Restaurant(20-50)', 'weather_Sunny', 'toCoupon_GEQ25min', 'destination_Home', 'direction_same', 'direction_opp', 'destination_Work', 'weather_Rainy', 'weather_Snowy']

In [18]:
X = dataset[imp_features]
y = dataset['Y']

In [19]:
X

Unnamed: 0,CoffeeHouse_ord,Bar_ord,CarryAway_ord,RestLT20_ord,Rest20To50_ord,time,expiration,coupon_Carry out & Take away,temperature,toCoupon_GEQ15min,...,destination_No Urgent Place,coupon_Restaurant(20-50),weather_Sunny,toCoupon_GEQ25min,destination_Home,direction_same,direction_opp,destination_Work,weather_Rainy,weather_Snowy
0,0.0,0.0,2.0,3.0,2.0,14,24,0,55,0,...,1,0,1,0,0,0,1,0,0,0
1,0.0,0.0,2.0,3.0,2.0,10,2,0,80,0,...,1,0,1,0,0,0,1,0,0,0
2,0.0,0.0,2.0,3.0,2.0,10,2,1,80,1,...,1,0,1,0,0,0,1,0,0,0
3,0.0,0.0,2.0,3.0,2.0,14,2,0,80,1,...,1,0,1,0,0,0,1,0,0,0
4,0.0,0.0,2.0,3.0,2.0,14,24,0,80,1,...,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,0.0,0.0,2.0,3.0,2.0,18,24,1,55,0,...,0,0,0,0,1,1,0,0,1,0
12680,0.0,0.0,2.0,3.0,2.0,7,24,1,55,0,...,0,0,0,0,0,0,1,1,1,0
12681,0.0,0.0,2.0,3.0,2.0,7,24,0,30,0,...,0,0,0,0,0,1,0,1,0,1
12682,0.0,0.0,2.0,3.0,2.0,7,24,0,30,1,...,0,0,0,1,0,0,1,1,0,1


In [20]:
X_std = standardizer.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8878, 23), (3806, 23), (8878,), (3806,))

### Bayesian Optimization

In [22]:
#Space set close to values estimated by TPOT classifier
space = {'subsample': hp.choice('subsample', [0.4, 0.6, 0.8, 1.0]),
        'colsample_bytree': hp.choice('colsample_bytree', [0.5, 0.75, 1.0]),
        'learning_rate': hp.choice('learning_rate', [0.5, 0.1, 0.05, 0.01]),
        'max_depth' : hp.choice ('max_depth', [3, 4, 5, 6, 7, 8, 9]),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750])
    }
space

{'subsample': <hyperopt.pyll.base.Apply at 0x1f0c09e0580>,
 'colsample_bytree': <hyperopt.pyll.base.Apply at 0x1f0c021acd0>,
 'learning_rate': <hyperopt.pyll.base.Apply at 0x1f0c09e0e20>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x1f0c09e0fd0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x1f0c09d4f40>}

In [23]:
def objective(space):
    model = XGBClassifier(subsample = space['subsample'], colsample_bytree = space['colsample_bytree'],
                                 learning_rate = space['learning_rate'],
                                 max_depth = space['max_depth'],
                                 n_estimators = space['n_estimators'], use_label_encoder = False
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [24]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best





























100%|███████████████████████████████████████████████| 80/80 [17:12<00:00, 12.91s/trial, best loss: -0.7315848242608807]


{'colsample_bytree': 2,
 'learning_rate': 2,
 'max_depth': 1,
 'n_estimators': 2,
 'subsample': 2}

In [25]:
subsample = {0: 0.4, 1: 0.6, 2: 0.8, 3: 1.0}
colsample_bytree = {0: 0.5, 1: 0.75, 2: 1.0}
depth = {0: 3, 1: 4, 2: 5, 3: 6, 4: 7, 5: 8, 6: 9}
est = {0: 10, 1: 50, 2: 300, 3: 750}
lr = {0: 0.5, 1: 0.1, 2: 0.05, 3: 0.01}


print("subsample:",subsample[best['subsample']])
print("colsample_bytree:",colsample_bytree[best['colsample_bytree']])
print("n_estimators:",est[best['n_estimators']])
print("max_depth:",depth[best['max_depth']])
print("learning_rate:",lr[best['learning_rate']])

subsample: 0.8
colsample_bytree: 1.0
n_estimators: 300
max_depth: 4
learning_rate: 0.05


In [26]:
trainedxgb = XGBClassifier(subsample = subsample[best['subsample']], colsample_bytree = colsample_bytree[best['colsample_bytree']], 
                                       learning_rate = lr[best['learning_rate']], 
                                       max_depth = depth[best['max_depth']], 
                                       n_estimators = est[best['n_estimators']], use_label_encoder = False).fit(X_train,y_train)



In [27]:
predictionxgb = trainedxgb.predict(X_test)

print("Accuracy:",accuracy_score(y_test,predictionxgb)) 
print("F1:",f1_score(y_test,predictionxgb))

Accuracy: 0.7341040462427746
F1: 0.7751111111111112


In [28]:
print(classification_report(y_test,predictionxgb))

              precision    recall  f1-score   support

           0       0.72      0.63      0.67      1654
           1       0.74      0.81      0.78      2152

    accuracy                           0.73      3806
   macro avg       0.73      0.72      0.72      3806
weighted avg       0.73      0.73      0.73      3806

