In [24]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset

In [25]:
train_set = pd.read_csv('./dataset/training_set_VU_DM.csv')
test_set = pd.read_csv('./dataset/test_set_VU_DM.csv')

train_set = train_set.drop(columns=["date_time", 
                                    "visitor_hist_starrating", 
                                    "visitor_hist_adr_usd",
                                    "srch_query_affinity_score",
                                    "orig_destination_distance",
                                    "comp1_rate", "comp1_inv", "comp1_rate_percent_diff",
                                    "comp2_rate", "comp2_inv", "comp2_rate_percent_diff",
                                    "comp3_rate", "comp3_inv", "comp3_rate_percent_diff",
                                    "comp4_rate", "comp4_inv", "comp4_rate_percent_diff",
                                    "comp5_rate", "comp5_inv", "comp5_rate_percent_diff",
                                    "comp6_rate", "comp6_inv", "comp6_rate_percent_diff",
                                    "comp7_rate", "comp7_inv", "comp7_rate_percent_diff",
                                    "comp8_rate", "comp8_inv", "comp8_rate_percent_diff",
                                    "gross_bookings_usd"])

test_set = test_set.drop(columns=["date_time", 
                                    "visitor_hist_starrating", 
                                    "visitor_hist_adr_usd",
                                    "srch_query_affinity_score",
                                    "orig_destination_distance",
                                    "comp1_rate", "comp1_inv", "comp1_rate_percent_diff",
                                    "comp2_rate", "comp2_inv", "comp2_rate_percent_diff",
                                    "comp3_rate", "comp3_inv", "comp3_rate_percent_diff",
                                    "comp4_rate", "comp4_inv", "comp4_rate_percent_diff",
                                    "comp5_rate", "comp5_inv", "comp5_rate_percent_diff",
                                    "comp6_rate", "comp6_inv", "comp6_rate_percent_diff",
                                    "comp7_rate", "comp7_inv", "comp7_rate_percent_diff",
                                    "comp8_rate", "comp8_inv", "comp8_rate_percent_diff"])

In [32]:
# Training set
train_set_Y = train_set[["click_bool", "booking_bool"]]
train_set_X = train_set.drop(columns=["click_bool", "booking_bool"])

X_array = np.array(train_set_X, dtype=np.float32)
Y_array = np.array(train_set_Y, dtype=np.float32)

X_tensor = torch.tensor(X_array)
Y_tensor = torch.tensor(Y_array)

X_tensor = torch.where(torch.isnan(X_tensor), torch.zeros_like(X_tensor), X_tensor)
Y_tensor = torch.where(torch.isnan(Y_tensor), torch.zeros_like(Y_tensor), Y_tensor)

dataset = TensorDataset(X_tensor, Y_tensor)

train_size = int(0.75*len(X_tensor))
test_size = len(X_tensor) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=500, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=500, shuffle=False)

print(X_tensor.shape)
print(Y_tensor.shape)

in_size = X_tensor.size(dim=1)
out_size = Y_tensor.size(dim=1)


torch.Size([4958347, 22])
torch.Size([4958347, 2])


In [33]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_size, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, out_size),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
model = NeuralNetwork()
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=22, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=2, bias=True)
  )
)


In [34]:
def training_loop(n_epochs, nn, optimiser, loss_fn, train_loader, test_loader):
    
    for epoch in range(n_epochs):
        print("Epoch: "+str(epoch))
        outputs_return = []
        labels_return = []
        for inputs, label in train_loader:
            # Training mode
            nn.train() 
            # Reset gradients
            optimiser.zero_grad()
            # Forward propagation
            outputs = nn(inputs)
            # Training loss
            loss = loss_fn(outputs, label) 
            # Backpropagate
            loss.backward()
            # Update weights
            torch.nn.utils.clip_grad_norm_(nn.parameters(), max_norm=2)
            optimiser.step() 

        # Evaluation mode
        nn.eval() 
        # Disable gradient calc
        with torch.no_grad():
            test_loss = 0.0
            # Compute classes and losses
            for inputs, labels in test_loader:
                outputs = nn(inputs)
                outputs_return.append(outputs)
                labels_return.append(labels)
                loss = loss_fn(outputs, labels)
                test_loss += loss.item() * inputs.size(0)
            
        test_loss /= len(test_loader.dataset)
        print(f"Epoch: {epoch}, train loss: {loss.item():.5f}, test loss: {test_loss:.5f}")
            #print("Result std: "+str(np.std(np.array((10-1)*y_test+1))))
            #print("Prediction std: "+str(np.std(np.array((10-1)*test_preds+1))))

    return outputs_return, labels_return

In [35]:
loss = nn.MSELoss()
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
n_epochs = 100

prediction, result = training_loop(
    n_epochs=n_epochs,
    nn=model, 
    optimiser=optimizer, 
    loss_fn=loss, 
    train_loader=train_loader, 
    test_loader=test_loader
)

Epoch: 0
Epoch: 0, train loss: 0.01231, test loss: 0.03512
Epoch: 1
Epoch: 1, train loss: 0.01191, test loss: 0.03511
Epoch: 2
Epoch: 2, train loss: 0.01218, test loss: 0.03511
Epoch: 3
Epoch: 3, train loss: 0.01197, test loss: 0.03511
Epoch: 4
Epoch: 4, train loss: 0.01230, test loss: 0.03512
Epoch: 5
Epoch: 5, train loss: 0.01213, test loss: 0.03511
Epoch: 6
Epoch: 6, train loss: 0.01167, test loss: 0.03519
Epoch: 7
Epoch: 7, train loss: 0.01177, test loss: 0.03514
Epoch: 8
Epoch: 8, train loss: 0.01203, test loss: 0.03510
Epoch: 9
Epoch: 9, train loss: 0.01209, test loss: 0.03510
Epoch: 10
Epoch: 10, train loss: 0.01196, test loss: 0.03511
Epoch: 11
Epoch: 11, train loss: 0.01240, test loss: 0.03514
Epoch: 12
Epoch: 12, train loss: 0.01192, test loss: 0.03511
Epoch: 13
Epoch: 13, train loss: 0.01211, test loss: 0.03510
Epoch: 14
Epoch: 14, train loss: 0.01216, test loss: 0.03511
Epoch: 15
Epoch: 15, train loss: 0.01207, test loss: 0.03510
Epoch: 16
Epoch: 16, train loss: 0.01187, te