 ## Imports

In [638]:
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from torch import nn
from torch.utils.data import DataLoader
from torchvision.transforms import Lambda, ToTensor

## Load dataset

In [639]:
# Read the coupon survey and show the column names exactly as they appear in the CSV.
df = pd.read_csv('in-vehicle-coupon-recommendation.csv')
display(df.columns.tolist())

# Fix the misspelled 'passanger' header; rename() returns a new frame, so rebind df.
df = df.rename(columns={"passanger": "passenger"})
df

['destination',
 'passanger',
 'weather',
 'temperature',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'has_children',
 'education',
 'occupation',
 'income',
 'car',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp',
 'Y']

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,1
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,0,1,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,0,0,1,0,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,...,never,1~3,4~8,1~3,1,1,1,0,1,0


## Fill missing values in data
- Count NAs per column
- Make a list of columns containing NAs
- Do not list columns in which 50% or more of the values are NA<br>(insufficient data for replacement; these columns keep their NAs)
- Replace NAs with mode in all listed columns

(Mode imputation is only appropriate for categorical features;<br>numerical ones would instead require a statistic such as the mean.)

In [640]:
# Count the missing values in every column.
df.isna().sum()

destination                 0
passenger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

In [641]:
# Select the columns that contain at least one NA but fewer NAs than half of all rows.
na_counts = df.isna().sum()
columns_withNA = na_counts[(na_counts > 0) & (na_counts < df.shape[0] / 2)].index.tolist()
print(columns_withNA)

for column in columns_withNA:
    # Compute the mode once per column; [0] takes the first entry if several values tie.
    column_mode = df[column].mode()[0]
    # f-strings (instead of '+') also work when the column name or the mode is not a string.
    print(f'Column is: {column}')
    print(f'Mode is: {column_mode}')
    print()
    df[column] = df[column].fillna(column_mode)

# Remaining NA counts after imputation.
df.isna().sum()

['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
Column is: Bar
Mode is: never

Column is: CoffeeHouse
Mode is: less1

Column is: CarryAway
Mode is: 1~3

Column is: RestaurantLessThan20
Mode is: 1~3

Column is: Restaurant20To50
Mode is: less1



destination                 0
passenger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                         0
CoffeeHouse                 0
CarryAway                   0
RestaurantLessThan20        0
Restaurant20To50            0
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

## One-hot-encoding
- make a list of all categorical features
- get an overview of occurring values
- remove all binary-labeled features from that list (they are already 0/1 encoded and stay in the data unchanged)
- for each remaining feature, create one new column per distinct value using pd.get_dummies();<br>
the new columns are named 'FEATURE_IS_VALUE' (e.g. 'destination_IS_Home')

In [642]:
# dfOHE refers to the same frame as df here (no copy is made).
dfOHE = df

# Every column except 'temperature' is listed together with its distinct values.
candidate_features = df.columns.drop('temperature')
for name in candidate_features:
    print('Current feature: ' + name)
    # value_counts(dropna=False) also lists NaN as one of the occurring values.
    for distinct_value in df[name].value_counts(dropna=False).index:
        print(distinct_value)
    print()

# Binary-labelled columns are taken off the encoding list.
binary_features = [
    'has_children',
    'toCoupon_GEQ5min',
    'toCoupon_GEQ15min',
    'toCoupon_GEQ25min',
    'direction_same',
    'direction_opp',
    'Y',
]
featuresToBeOHE = candidate_features.drop(labels=binary_features)
print(featuresToBeOHE)

Current feature: destination
No Urgent Place
Home
Work

Current feature: passenger
Alone
Friend(s)
Partner
Kid(s)

Current feature: weather
Sunny
Snowy
Rainy

Current feature: time
6PM
7AM
10AM
2PM
10PM

Current feature: coupon
Coffee House
Restaurant(<20)
Carry out & Take away
Bar
Restaurant(20-50)

Current feature: expiration
1d
2h

Current feature: gender
Female
Male

Current feature: age
21
26
31
50plus
36
41
46
below21

Current feature: maritalStatus
Married partner
Single
Unmarried partner
Divorced
Widowed

Current feature: has_children
0
1

Current feature: education
Some college - no degree
Bachelors degree
Graduate degree (Masters or Doctorate)
Associates degree
High School Graduate
Some High School

Current feature: occupation
Unemployed
Student
Computer & Mathematical
Sales & Related
Education&Training&Library
Management
Office & Administrative Support
Arts Design Entertainment Sports & Media
Business & Financial
Retired
Food Preparation & Serving Related
Healthcare Practiti

In [643]:
# Encode from df rather than from dfOHE itself so the cell can be re-run: once dfOHE holds
# the encoded frame, the categorical source columns no longer exist in it.
# dtype='uint8' keeps the dummy columns as 0/1 integers independent of the pandas default.
dfOHE = pd.get_dummies(
    df,
    prefix_sep='_IS_',
    columns=featuresToBeOHE,
    dummy_na=False,
    dtype='uint8',
)

# Move the target column 'Y' to the last position.
dfOHE['Y'] = dfOHE.pop('Y')
dfOHE.head()


Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,destination_IS_Home,destination_IS_No Urgent Place,destination_IS_Work,...,RestaurantLessThan20_IS_4~8,RestaurantLessThan20_IS_gt8,RestaurantLessThan20_IS_less1,RestaurantLessThan20_IS_never,Restaurant20To50_IS_1~3,Restaurant20To50_IS_4~8,Restaurant20To50_IS_gt8,Restaurant20To50_IS_less1,Restaurant20To50_IS_never,Y
0,55,1,1,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
1,80,1,1,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0
2,80,1,1,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,1
3,80,1,1,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0
4,80,1,1,1,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0


## Train-Test-Split

In [644]:
# Remove incomplete rows BEFORE splitting so that both splits are filtered identically
# (filtering only the test split would leave NaNs in the training data).
df_complete = dfOHE.dropna()
df_train, df_test = train_test_split(df_complete, random_state=257)

# Separate the features from the target column 'Y' in each split.
X_train = df_train.drop(columns=['Y'])
y_train = df_train['Y']

X_test = df_test.drop(columns=['Y'])
y_test = df_test['Y']

## Scaling

In [645]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Prepare Neural Network
#### Convert data to tensor

In [646]:
# Scaled training features as a float32 tensor; the bare .shape is shown as the cell output.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_train_tensor.shape

torch.Size([9513, 114])

In [647]:
# Build the class labels directly as int64 (the dtype produced by .long()) instead of
# sending them through the legacy float32 torch.Tensor constructor first.
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_train_tensor

tensor([1, 0, 1,  ..., 1, 0, 1])

In [648]:
#y_train_reshaped = [[1-y, 0+y] for y in y_train.values]
#y_train_tensor = torch.Tensor(y_train_reshaped).long()
#y_train_tensor

In [649]:
# Scaled test features as a float32 tensor; the bare .shape is shown as the cell output.
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
X_test_tensor.shape

torch.Size([3171, 114])

In [650]:
# Build the class labels directly as int64 (the dtype produced by .long()) instead of
# sending them through the legacy float32 torch.Tensor constructor first.
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_test_tensor.shape

torch.Size([3171])

In [651]:
#y_test_reshaped = [[1-y, 0+y] for y in y_test.values]
#y_test_tensor = torch.Tensor(y_test_reshaped).long()
#y_test_tensor

In [652]:
# Pair the training feature tensor with the training label tensor in one dataset object.
training_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
training_data

<torch.utils.data.dataset.TensorDataset at 0x2250b213610>

In [653]:
# Pair the test feature tensor with the test label tensor in one dataset object.
test_data = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
test_data

<torch.utils.data.dataset.TensorDataset at 0x2250e071ac0>

#### Wrap data in DataLoader


In [654]:
# Training batches are reshuffled each time the loader is iterated.
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps its batches reproducible.
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)

# Peek at the first training batch to check shapes and contents.
batch = 0
X, y = next(iter(train_dataloader))
print(f'batch: {batch}\n')
print(f'X shape: {X.shape}')
print(f'X: {X}\n')
print(f'y shape: {y.shape}')
print(f'y: {y}\n')

batch: 0

X shape: torch.Size([64, 114])
X: tensor([[ 0.8707, -0.8485,  0.0000,  ..., -0.1435,  1.0081, -0.4495],
        [ 0.8707, -0.8485,  0.0000,  ..., -0.1435,  1.0081, -0.4495],
        [ 0.8707,  1.1786,  0.0000,  ..., -0.1435, -0.9919, -0.4495],
        ...,
        [-1.7443, -0.8485,  0.0000,  ..., -0.1435, -0.9919, -0.4495],
        [-0.4368,  1.1786,  0.0000,  ..., -0.1435,  1.0081, -0.4495],
        [ 0.8707,  1.1786,  0.0000,  ..., -0.1435, -0.9919,  2.2247]])

y shape: torch.Size([64])
y: tensor([0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1])



#### Check if Cuda is supported

In [655]:
# Use the GPU when PyTorch reports CUDA as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

Using cuda device


#### Build the NN

In [656]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: two hidden ReLU layers followed by a linear output layer.

    Args:
        in_features: Number of input features per sample (default 114).
        hidden_features: Width of both hidden layers (default 25).
        num_classes: Number of output logits (default 2).

    With the default arguments the architecture is identical to the previously
    hard-coded 114 -> 25 -> 25 -> 2 network.
    """

    def __init__(self, in_features=114, hidden_features=25, num_classes=2):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the raw class logits for the batch ``x``.

        ``x`` is flattened to two dimensions (batch, features) before it is passed
        through the linear stack; no softmax is applied to the result.
        """
        x = self.flatten(x)
        return self.linear_relu_stack(x)

# Seed PyTorch's global RNG before the weights are initialised so that a fresh
# Restart & Run All starts from the same initial parameters every time.
torch.manual_seed(257)
model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=114, out_features=25, bias=True)
    (1): ReLU()
    (2): Linear(in_features=25, out_features=25, bias=True)
    (3): ReLU()
    (4): Linear(in_features=25, out_features=2, bias=True)
  )
)


## Optimizing Model Parameters
#### Hyperparameters

In [657]:
# Optimisation settings.
# NOTE(review): batch_size is not read by any visible cell; the DataLoaders are
# created with a literal 64 — confirm the two are meant to stay in sync.
epochs = 200
batch_size = 64
learning_rate = 1e-4

# Cross-entropy loss, applied to the logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

# Plain stochastic gradient descent over all parameters of the model.
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

#### Full Implementation

In [658]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch of ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must expose ``.dataset`` with a length.
        model: Module being trained. It is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer that updates the parameters of ``model``.

    Prints the loss of every 100th batch together with the number of samples
    processed so far. Returns nothing.
    """
    size = len(dataloader.dataset)

    # Move each batch to wherever the model's parameters live, so the function does not
    # depend on a module-level `device` variable. A parameter-free model falls back to CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Make sure train-only layer behaviour (e.g. dropout) is active.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(target_device), y.to(target_device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Use separate names so the loss tensor is not rebound to a float.
            loss_value, current = loss.item(), batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [659]:
# Create the loss and the optimizer here so that the optimizer is bound to the
# parameters of the current `model` whenever this cell is run.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

# One training pass followed by one evaluation pass per epoch.
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")


Epoch 1
-------------------------------
loss: 0.697329  [    0/ 9513]
loss: 0.685978  [ 6400/ 9513]
Test Error: 
 Accuracy: 53.7%, Avg loss: 0.010909 

Epoch 2
-------------------------------
loss: 0.702865  [    0/ 9513]
loss: 0.705709  [ 6400/ 9513]
Test Error: 
 Accuracy: 54.0%, Avg loss: 0.010903 

Epoch 3
-------------------------------
loss: 0.681500  [    0/ 9513]
loss: 0.686344  [ 6400/ 9513]
Test Error: 
 Accuracy: 54.1%, Avg loss: 0.010904 

Epoch 4
-------------------------------
loss: 0.704826  [    0/ 9513]
loss: 0.701845  [ 6400/ 9513]
Test Error: 
 Accuracy: 54.2%, Avg loss: 0.010900 

Epoch 5
-------------------------------
loss: 0.706873  [    0/ 9513]
loss: 0.704395  [ 6400/ 9513]
Test Error: 
 Accuracy: 54.2%, Avg loss: 0.010900 

Epoch 6
-------------------------------
loss: 0.698515  [    0/ 9513]
loss: 0.681491  [ 6400/ 9513]
Test Error: 
 Accuracy: 54.4%, Avg loss: 0.010897 

Epoch 7
-------------------------------
loss: 0.682889  [    0/ 9513]
loss: 0.684464  [ 