## Imports

In [6]:
import pandas as pd

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

## Initialize TPOT optimizer

In [7]:
# Configure the TPOT search: 5 generations with a population of 20 candidate
# pipelines, each scored with 5-fold cross-validation. The fixed random_state
# keeps the search reproducible; verbosity=2 prints per-generation progress.
pipeline_optimizer = TPOTClassifier(
    generations=5,
    population_size=20,
    cv=5,
    random_state=257,
    verbosity=2,
)

## Load dataset

In [8]:
# Read the raw coupon-recommendation data.
df = pd.read_csv('in-vehicle-coupon-recommendation.csv')

# Show the column names exactly as they appear in the file
# (this listing still contains the misspelled 'passanger').
display(list(df.columns))

# Correct the misspelled column name for all later cells.
df = df.rename(columns={"passanger": "passenger"})
df

['destination',
 'passanger',
 'weather',
 'temperature',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'has_children',
 'education',
 'occupation',
 'income',
 'car',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp',
 'Y']

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,1
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,0,1,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,1,1,0,1,0


## Fill missing values in data
- Count NAs per column
- Make a list of columns containing NAs
- Skip columns in which half or more of the values are NA<br>(insufficient data for a reliable replacement)
- Replace NAs with the column mode in all listed columns

(Mode imputation is only suitable for categorical features;<br>numerical features would typically be filled with the mean instead.)

In [9]:
# Count the missing values in every column.
df.isna().sum()

destination                 0
passenger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

In [10]:
# Select the columns that contain at least one missing value but are still
# mostly populated (fewer than half of the rows missing). Columns at or above
# that threshold are left untouched by this cell.
na_counts = df.isna().sum()
columns_withNA = [
    column
    for column, n_missing in na_counts.items()
    if 0 < n_missing < (df.shape[0] / 2)
]
print(columns_withNA)

# Fill the gaps of every selected column with that column's mode.
for column in columns_withNA:
    # Compute the mode once and reuse it for both the report and the fill.
    column_mode = df[column].mode()[0]
    # f-strings instead of '+' so the report also works when the column name
    # or the mode is not a string (e.g. a numeric column).
    print(f'Column is: {column}')
    print(f'Mode is: {column_mode}')
    print()
    df[column] = df[column].fillna(column_mode)

# Re-count the missing values to verify the result of the replacement.
df.isna().sum()

['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
Column is: Bar
Mode is: never

Column is: CoffeeHouse
Mode is: less1

Column is: CarryAway
Mode is: 1~3

Column is: RestaurantLessThan20
Mode is: 1~3

Column is: Restaurant20To50
Mode is: less1



destination                 0
passenger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                         0
CoffeeHouse                 0
CarryAway                   0
RestaurantLessThan20        0
Restaurant20To50            0
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

## One-hot-encoding
- make a list of all categorical features
- get an overview of occurring values
- exclude all binary-labelled features (already encoded as 0/1)
- for each remaining feature, create one new column per distinct value using `pd.get_dummies()`,<br>
named 'FEATURE_IS_VALUE' (e.g. 'destination_IS_Home')

In [11]:
# The frame to be encoded starts out as the imputed data.
dfOHE = df

# Every column except 'temperature' is a candidate for one-hot encoding.
featuresToBeOHE = df.columns.drop('temperature')

# Overview: print the distinct values (NaN included) of each candidate column,
# in the order returned by value_counts().
for feature in featuresToBeOHE:
    print('Current feature: ' + feature)
    valueArray = df[feature].value_counts(dropna=False).index
    for value in valueArray:
        print(value)
    print()

# These columns already hold 0/1 labels, so they are excluded from encoding.
binaryFeatures = [
    'has_children',
    'toCoupon_GEQ5min',
    'toCoupon_GEQ15min',
    'toCoupon_GEQ25min',
    'direction_same',
    'direction_opp',
    'Y',
]
featuresToBeOHE = featuresToBeOHE.drop(labels=binaryFeatures)
print(featuresToBeOHE)

Current feature: destination
No Urgent Place
Home
Work

Current feature: passenger
Alone
Friend(s)
Partner
Kid(s)

Current feature: weather
Sunny
Snowy
Rainy

Current feature: time
6PM
7AM
10AM
2PM
10PM

Current feature: coupon
Coffee House
Restaurant(<20)
Carry out & Take away
Bar
Restaurant(20-50)

Current feature: expiration
1d
2h

Current feature: gender
Female
Male

Current feature: age
21
26
31
50plus
36
41
46
below21

Current feature: maritalStatus
Married partner
Single
Unmarried partner
Divorced
Widowed

Current feature: has_children
0
1

Current feature: education
Some college - no degree
Bachelors degree
Graduate degree (Masters or Doctorate)
Associates degree
High School Graduate
Some High School

Current feature: occupation
Unemployed
Student
Computer & Mathematical
Sales & Related
Education&Training&Library
Management
Office & Administrative Support
Arts Design Entertainment Sports & Media
Business & Financial
Retired
Food Preparation & Serving Related
Healthcare Practiti

In [12]:
# Expand every selected feature into indicator columns named
# '<feature>_IS_<value>'. With dummy_na=False no extra NaN indicator column is
# created for a feature.
dfOHE = pd.get_dummies(
    dfOHE,
    columns=featuresToBeOHE,
    prefix_sep='_IS_',
    dummy_na=False,
)

# Move the target column 'Y' to the last position of the frame.
buffer = dfOHE.pop('Y')
dfOHE['Y'] = buffer

dfOHE.head()


Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,destination_IS_Home,destination_IS_No Urgent Place,destination_IS_Work,...,RestaurantLessThan20_IS_4~8,RestaurantLessThan20_IS_gt8,RestaurantLessThan20_IS_less1,RestaurantLessThan20_IS_never,Restaurant20To50_IS_1~3,Restaurant20To50_IS_4~8,Restaurant20To50_IS_gt8,Restaurant20To50_IS_less1,Restaurant20To50_IS_never,Y
0,55,1,1,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
1,80,1,1,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0
2,80,1,1,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
3,80,1,1,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0
4,80,1,1,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0


## Train-Test-Split

In [13]:
# Split the encoded frame into train and test partitions with a fixed seed.
df_train, df_test = train_test_split(dfOHE, random_state=257)

# NOTE(review): rows with NaN are dropped from the test partition only; the
# training partition is used as-is. Confirm this asymmetry is intended.
df_test = df_test.dropna()

# Separate the features from the target 'Y' for both partitions.
X_train, y_train = df_train.drop(columns=['Y']), df_train['Y']
X_test, y_test = df_test.drop(columns=['Y']), df_test['Y']

## Run autoML pipeline

In [14]:
# Run the TPOT search on the training split.
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.736886138605655

Generation 2 - Current best internal CV score: 0.7447699768973998

Generation 3 - Current best internal CV score: 0.7447699768973998

Generation 4 - Current best internal CV score: 0.7463465456335754

Generation 5 - Current best internal CV score: 0.7463465456335754

Best pipeline: ExtraTreesClassifier(RandomForestClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.1, min_samples_leaf=2, min_samples_split=10, n_estimators=100), bootstrap=False, criterion=gini, max_features=0.7500000000000001, min_samples_leaf=14, min_samples_split=4, n_estimators=100)


TPOTClassifier(generations=5, population_size=20, random_state=257, verbosity=2)

In [15]:
# Score the fitted optimizer on the held-out test split and print the result.
print(pipeline_optimizer.score(X_test, y_test))

0.7584358246609902


In [16]:
# Export the optimizer's result to 'tpot_exported_pipeline.py'
# (per the TPOT docs this is Python code for the best pipeline found).
pipeline_optimizer.export('tpot_exported_pipeline.py')