## Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

## Lade Datensatz

In [2]:
df = pd.read_csv('in-vehicle-coupon-recommendation.csv')
display(list(df.columns.values))
df.rename(columns={"passanger": "passenger"}, inplace=True)
df

['destination',
 'passanger',
 'weather',
 'temperature',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'has_children',
 'education',
 'occupation',
 'income',
 'car',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp',
 'Y']

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,1
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,0,1,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,1,1,0,1,0


## Fill missing values in data
- Count NAs per column
- For each column containing NAs pick the mode
- Replace NAs with mode if mode is not NaN (insufficient data)

(only works because all affected features are categorical)

In [3]:
df.isna().sum()

destination                 0
passenger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

In [4]:
for index, value in df.isna().sum().items():
    if value>0:
        print('Current Column: \'' + index + '\'')
        print(df[index].value_counts(dropna=False))
        mode = df[index].value_counts(dropna=False).index[0]
        print()
        if str(mode) == 'nan':
            print('Not filling in Column \'' + index + '\' due to lack of data.')
        else:
            print('Filling Column \'' + index + '\' with mode: ' + str(mode))
            df[index] = df[index].fillna(mode)
            print(df[index].value_counts(dropna=False))
        print('\n')

df.isna().sum()

Current Column: 'car'
NaN                                         12576
Mazda5                                         22
do not drive                                   22
Scooter and motorcycle                         22
Car that is too old to install Onstar :D       21
crossover                                      21
Name: car, dtype: int64

Not filling in Column 'car' due to lack of data.


Current Column: 'Bar'
never    5197
less1    3482
1~3      2473
4~8      1076
gt8       349
NaN       107
Name: Bar, dtype: int64

Filling Column 'Bar' with mode: never
never    5304
less1    3482
1~3      2473
4~8      1076
gt8       349
Name: Bar, dtype: int64


Current Column: 'CoffeeHouse'
less1    3385
1~3      3225
never    2962
4~8      1784
gt8      1111
NaN       217
Name: CoffeeHouse, dtype: int64

Filling Column 'CoffeeHouse' with mode: less1
less1    3602
1~3      3225
never    2962
4~8      1784
gt8      1111
Name: CoffeeHouse, dtype: int64


Current Column: 'CarryAway'
1~3      467

destination                 0
passenger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                         0
CoffeeHouse                 0
CarryAway                   0
RestaurantLessThan20        0
Restaurant20To50            0
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

## One-hot-encoding
- make a list of all categorical features
- get an overview of occurring values
- drop all binary labeled features (already OHE)
- for each feature create new columns, one for each distinct value<br>
FEATURE_IS_VALUE (e.g. destination_IS_Home)
- drop old features

In [5]:
dfOHE = df
featuresToBeOHE = df.columns.drop('temperature')

for feature in featuresToBeOHE:
    print('Current feature: ' + feature)
    valueArray = df[feature].value_counts(dropna=False).index
    for value in valueArray:
        print(value)
    print()

featuresToBeOHE = featuresToBeOHE.drop(labels=['has_children', 'toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same', 'direction_opp', 'Y'])
print(featuresToBeOHE)

Current feature: destination
No Urgent Place
Home
Work

Current feature: passenger
Alone
Friend(s)
Partner
Kid(s)

Current feature: weather
Sunny
Snowy
Rainy

Current feature: time
6PM
7AM
10AM
2PM
10PM

Current feature: coupon
Coffee House
Restaurant(<20)
Carry out & Take away
Bar
Restaurant(20-50)

Current feature: expiration
1d
2h

Current feature: gender
Female
Male

Current feature: age
21
26
31
50plus
36
41
46
below21

Current feature: maritalStatus
Married partner
Single
Unmarried partner
Divorced
Widowed

Current feature: has_children
0
1

Current feature: education
Some college - no degree
Bachelors degree
Graduate degree (Masters or Doctorate)
Associates degree
High School Graduate
Some High School

Current feature: occupation
Unemployed
Student
Computer & Mathematical
Sales & Related
Education&Training&Library
Management
Office & Administrative Support
Arts Design Entertainment Sports & Media
Business & Financial
Retired
Food Preparation & Serving Related
Healthcare Practiti

In [6]:
df.head()

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,1~3,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,1~3,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,1~3,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,1~3,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,1~3,4~8,1~3,1,1,0,0,1,0


In [7]:
for feature in featuresToBeOHE:
    print('Current feature: ' + feature)
    valueArray = df[feature].value_counts(dropna=False).index
    print(dfOHE[feature].value_counts())
    for value in valueArray:
        dfOHE[str(feature) + '_IS_' + str(value)] = df[feature].map(lambda x: 1 if x==value else 0)
        print('  Current value: ' + str(value))
        print(dfOHE[str(feature) + '_IS_' + str(value)].value_counts())
    print()

dfOHE = dfOHE.drop(columns=featuresToBeOHE)

Current feature: destination
No Urgent Place    6283
Home               3237
Work               3164
Name: destination, dtype: int64
  Current value: No Urgent Place
0    6401
1    6283
Name: destination_IS_No Urgent Place, dtype: int64
  Current value: Home
0    9447
1    3237
Name: destination_IS_Home, dtype: int64
  Current value: Work
0    9520
1    3164
Name: destination_IS_Work, dtype: int64

Current feature: passenger
Alone        7305
Friend(s)    3298
Partner      1075
Kid(s)       1006
Name: passenger, dtype: int64
  Current value: Alone
1    7305
0    5379
Name: passenger_IS_Alone, dtype: int64
  Current value: Friend(s)
0    9386
1    3298
Name: passenger_IS_Friend(s), dtype: int64
  Current value: Partner
0    11609
1     1075
Name: passenger_IS_Partner, dtype: int64
  Current value: Kid(s)
0    11678
1     1006
Name: passenger_IS_Kid(s), dtype: int64

Current feature: weather
Sunny    10069
Snowy     1405
Rainy     1210
Name: weather, dtype: int64
  Current value: Sunny
1

In [8]:
dfOHE

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_IS_No Urgent Place,destination_IS_Home,...,RestaurantLessThan20_IS_1~3,RestaurantLessThan20_IS_4~8,RestaurantLessThan20_IS_less1,RestaurantLessThan20_IS_gt8,RestaurantLessThan20_IS_never,Restaurant20To50_IS_less1,Restaurant20To50_IS_1~3,Restaurant20To50_IS_never,Restaurant20To50_IS_4~8,Restaurant20To50_IS_gt8
0,55,1,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,1,0,0,0
1,80,1,1,0,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,80,1,1,1,0,0,1,1,1,0,...,0,1,0,0,0,0,1,0,0,0
3,80,1,1,1,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
4,80,1,1,1,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,55,0,1,0,0,1,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
12680,55,0,1,0,0,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0
12681,30,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
12682,30,0,1,1,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0


## Train-Test-Split

In [9]:
df_train, df_test = train_test_split(dfOHE)
X_train = df_train.drop(columns=['Y'])
y_train = df_train['Y']

df_test = df_test.dropna()
X_test = df_test.drop(columns=['Y'])
y_test = df_test['Y']

## Scaling

In [10]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

## Training
#### Logistic regression: C :[0.1,1,4,5,6,10,30,100] und penalty: ["l1", "l2"]

In [11]:
parameter_candidates = [{'C': [0.1,1,4,5,6,10,30,100], 'penalty': ["l1", "l2"]}]
log = LogisticRegression(max_iter=1000,random_state=0)
clf = GridSearchCV(estimator=log, param_grid=parameter_candidates, n_jobs=-1)
clf = clf.fit(X_train_scaled, y_train)
print('Best penalty:', clf.best_estimator_.penalty)
print('Best C:', clf.best_estimator_.C)

Best penalty: l2
Best C: 0.1


In [12]:
logisticRegr = LogisticRegression(max_iter=1000,C=clf.best_estimator_.C,penalty =clf.best_estimator_.penalty)
logisticRegr.fit(X_train_scaled, y_train)

LogisticRegression(C=0.1, max_iter=1000)

#### Random Forest: n_estimators: [60,80,100,120,140] und max_depth: [2, 3, 4, 5]

In [13]:
parameter_candidates = [{'n_estimators': [60,80,100,120,140], 'max_depth': [2, 3, 4, 5]}]
RFC = RandomForestClassifier(random_state=0)
clf = GridSearchCV(estimator=RFC, param_grid=parameter_candidates, n_jobs=-1)
clf = clf.fit(X_train_scaled, y_train)
print('Best n:', clf.best_estimator_.n_estimators)
print('Best depth:', clf.best_estimator_.max_depth)

Best n: 80
Best depth: 5


In [14]:
model = RandomForestClassifier(random_state=0,max_depth=clf.best_estimator_.max_depth,n_estimators=clf.best_estimator_.n_estimators)
model.fit(X_train_scaled,y_train)

RandomForestClassifier(max_depth=5, n_estimators=80, random_state=0)

#### Gradient Boosting Tree: hyper params like in Random Forest.

In [15]:

parameter_candidates = [{'n_estimators': [60,80,100,120,140], 'max_depth': [2, 3, 4, 5]}]
gbt = GradientBoostingClassifier(random_state=0)
clf = GridSearchCV(estimator=gbt, param_grid=parameter_candidates, n_jobs=-1)
clf = clf.fit(X_train_scaled, y_train)
print('Best n:', clf.best_estimator_.n_estimators)
print('Best depth:', clf.best_estimator_.max_depth)

Best n: 140
Best depth: 5


In [16]:
gbt = GradientBoostingClassifier(random_state=0,max_depth=clf.best_estimator_.max_depth,n_estimators=clf.best_estimator_.n_estimators)
gbt.fit(X_train_scaled,y_train)

GradientBoostingClassifier(max_depth=5, n_estimators=140, random_state=0)

## Scale test data

In [17]:
X_test_scaled = scaler.transform(X_test)


## Predictions
#### Logistic regression

In [18]:
predictions_log = logisticRegr.predict(X_test_scaled)

#### Random Forest

In [19]:
predictions_rf = model.predict(X_test_scaled)

#### Gradient Boosting

In [20]:
predictions_gd = gbt.predict(X_test_scaled)

## Accuracy, precision and recall

#### Logistic regression

In [21]:
print("Accuracy score is:")
print(accuracy_score(y_test, predictions_log))
print("Recall is:")
print(recall_score(y_test, predictions_log))
print("Precision is:")
print(precision_score(y_test, predictions_log))

Accuracy score is:
0.6881110059918006
Recall is:
0.7754988913525499
Precision is:
0.7054967221381745


#### Random Forest

In [22]:
print("Accuracy score is:")
print(accuracy_score(y_test, predictions_rf))
print("Recall is:")
print(recall_score(y_test, predictions_rf))
print("Precision is:")
print(precision_score(y_test, predictions_rf))

Accuracy score is:
0.6808577735730054
Recall is:
0.8841463414634146
Precision is:
0.6651376146788991


#### Gradient Boosting

In [23]:
print("Accuracy score is:")
print(accuracy_score(y_test, predictions_gd))
print("Recall is:")
print(recall_score(y_test, predictions_gd))
print("Precision is:")
print(precision_score(y_test, predictions_gd))

Accuracy score is:
0.7590665405234942
Recall is:
0.8242793791574279
Precision is:
0.7688728024819028


In [24]:
df['Y'].value_counts()

1    7210
0    5474
Name: Y, dtype: int64