### Booking Challenge

In [13]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch

from typing import List
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm
from torchsummary import summary
from torch.nn.utils.rnn import pad_sequence

In [14]:
torch.cuda.get_device_name(0)

'Tesla T4'

In [15]:
def preprocess_label_df(
        df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    '''
    adds label to the dataset 
    removes the last row for every trip
    removes sequences shorter than 3
    order the df starting from longer sequences
    '''
    df_chunks = []
    df['label'] = df['city_id'].shift(-1)
    idx_labels = np.unique(df.index)

    for idx in tqdm(idx_labels):
        temp_dataset = df.loc[idx].head(-1)
        if type(temp_dataset) == pd.DataFrame and len(temp_dataset) >= 3:
            df_chunks.append(temp_dataset)

    df_chunks = sorted(df_chunks, key=lambda x: len(x), reverse=True)
    new_df = pd.concat(df_chunks)

    return new_df


def front_padding_sequence(tensors: List, num_features: int) -> torch.Tensor:
    """
    Applies front padding to a list of tensors
    """
    sizes = [len(tensor) for tensor in tensors]
    max_size = max(sizes)
    pad_tensors = []
    for q in tensors:
        new_tensor = torch.zeros(max_size, num_features)
        new_tensor[max_size - len(q):] = q
        pad_tensors.append(new_tensor)

    return torch.stack(pad_tensors, dim=0).long()


def custom_padding(batch):
    '''
    pad each batch according to the sequence 
    with the highest length
    '''
    features = [sample[0] for sample in batch]
    features = front_padding_sequence(features, 2)
    labels = [sample[1].unsqueeze(dim=1) for sample in batch]
    labels = front_padding_sequence(labels, 1)

    return features, labels

In [29]:
df_train = pd.read_csv('/content/sample_data/train_set.csv').set_index('utrip_id')
df_train.index = df_train.index.astype(int)
df_test = pd.read_csv('/content/sample_data/test_set.csv').set_index('utrip_id')
df_ground_truth = pd.read_csv('/content/sample_data/ground_truth.csv').set_index('utrip_id')
df_test = df_test.merge(df_ground_truth, left_on='utrip_id', right_on='utrip_id')
df_test.index = df_test.index.astype(int)
df_test['city_id'] = np.where(df_test['city_id_x'] == 0, df_test['city_id_y'], df_test['city_id_x'])
df_test = df_test[['user_id','checkin','checkout','device_class','affiliate_id','booker_country', 'city_id']]

In [30]:
df_train = preprocess_label_df(df_train)

HBox(children=(FloatProgress(value=0.0, max=217684.0), HTML(value='')))




In [31]:
df_test = preprocess_label_df(df_test)

HBox(children=(FloatProgress(value=0.0, max=70661.0), HTML(value='')))




### Change utrip_id label

In [32]:
df_train['label'] = df_train['label'].astype(int)
df_test['label'] = df_test['label'].astype(int)

In [33]:
utrip_distinct_train = df_train.index.unique()
utrip_train_map = {utrip:i for i, utrip in enumerate(utrip_distinct_train)}
df_train = df_train.rename(index=utrip_train_map)

utrip_distinct_test = df_test.index.unique()
utrip_test_map = {utrip:i for i, utrip in enumerate(utrip_distinct_test)}
df_test = df_test.rename(index=utrip_test_map)

In [34]:
df_train.head()

Unnamed: 0_level_0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,label
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3635431,2016-04-01,2016-04-02,47319,mobile,9924,Gondal,Gondal,36063
0,3635431,2016-04-02,2016-04-03,36063,mobile,9924,Gondal,Gondal,36063
0,3635431,2016-04-03,2016-04-04,36063,mobile,384,Gondal,Gondal,36063
0,3635431,2016-04-04,2016-04-05,36063,mobile,9924,Gondal,Gondal,3109
0,3635431,2016-04-05,2016-04-06,3109,mobile,9924,Gondal,Gondal,3109


#### Create map for data

In [36]:
distinct_affiliate_id = np.unique(df_train.affiliate_id)
affiliate_id_map = {aff_id:i+1 for i, aff_id in enumerate(distinct_affiliate_id)}

distinct_checkin_id = np.unique(df_train.checkin)
checkin_map = {checkin:i+1 for i, checkin in enumerate(distinct_checkin_id)}

distinct_device_class = np.unique(df_train.device_class)
device_map = {device:i+1 for i, device in enumerate(distinct_device_class)}

distinct_city_id = np.unique([df_train.city_id,df_train.label])
city_id_map = {city_id:i+1 for i, city_id in enumerate(distinct_city_id)}

distinct_booker_country = np.unique(df_train.booker_country)
booker_country_map = {booker_country:i+1 for i, booker_country in enumerate(distinct_booker_country)}

distinct_hotel_country = np.unique(df_train.hotel_country.astype(str))
hotel_country_map = {hotel_country:i+1 for i, hotel_country in enumerate(distinct_hotel_country)}

In [37]:
df_train['checkin'] = df_train['checkin'].map(lambda x: checkin_map.get(x))
df_train['affiliate_id'] = df_train['affiliate_id'].map(lambda x: affiliate_id_map.get(x))
df_train['city_id'] = df_train['city_id'].map(lambda x: city_id_map.get(x))
df_train['label'] = df_train['label'].map(lambda x: city_id_map.get(x))
df_train['booker_country'] = df_train['booker_country'].map(lambda x: booker_country_map.get(x))

In [38]:
df_test['checkin'] = df_test['checkin'].map(lambda x: checkin_map.get(x, 0))
df_test['affiliate_id'] = df_test['affiliate_id'].map(lambda x: affiliate_id_map.get(x, 0))
df_test['city_id'] = df_test['city_id'].map(lambda x: city_id_map.get(x, 0))
df_test['label'] = df_test['label'].map(lambda x: city_id_map.get(x, 0))
df_test['booker_country'] = df_test['booker_country'].map(lambda x: booker_country_map.get(x, 0))

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

#### Create a dataset

In [40]:
class BookingDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return max(self.df.index)
    
    def __getitem__(self, idx):
        row = self.df.loc[idx]
        city_id = row['city_id'].values
        label_id = row['label'].values
        affiliate_id = row['affiliate_id']
        
        city_id = torch.tensor(row['city_id'].values).view(-1, 1)
        affiliate_id = torch.tensor(row['affiliate_id'].values).view(-1, 1)
        features = torch.cat((city_id, affiliate_id), 1)
        features = features.view(-1, 2)
        
        label_id = torch.tensor(row['label'].values).float()
        
        return features, label_id

In [41]:
train_ds = BookingDataset(df_train)
train_dl = DataLoader(train_ds, batch_size = 4, shuffle = False, collate_fn=custom_padding)
test_ds = BookingDataset(df_test)
test_dl = DataLoader(test_ds, batch_size = 4, shuffle = False, collate_fn=custom_padding)

#### Vanilla RNN

In [None]:
# Vanilla RNN using nn.RNN
class Vanilla_RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size):
        super(Vanilla_RNN, self).__init__()
        
        self.city_emb = nn.Embedding(len(city_id_map)+1, emb_size, padding_idx=0)
        self.affiliate_emb = nn.Embedding(len(affiliate_id_map)+1, emb_size, padding_idx=0)
        
        self.rnn = nn.RNN(emb_size*2, hidden_size)
        
        # here is our g function from the lecture slides
        # linear layer turning the i-th hidden state into the i-th output
        self.g = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        city_emb = self.city_emb(x[:,:,0])
        affiliate_emb = self.affiliate_emb(x[:,:,1])
        x = torch.cat((city_emb, affiliate_emb), dim=2)
        out, hidden = self.rnn(x)
        out = self.g(out)

        return out, hidden

In [None]:
v_rnn = Vanilla_RNN(24, 100, 50)

#### LSTM

In [42]:
class LSTM(nn.Module):
    def __init__(self, emb_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.city_emb = nn.Embedding(len(city_id_map)+1, emb_size, padding_idx=0)
        self.affiliate_emb = nn.Embedding(len(affiliate_id_map)+1, emb_size, padding_idx=0)
        self.lstm = nn.LSTM(emb_size*2, hidden_size, batch_first=True)
        self.g = nn.Linear(hidden_size, len(city_id_map)+1)
        
    def forward(self, x):
        city_emb = self.city_emb(x[:,:,0])
        affiliate_emb = self.affiliate_emb(x[:,:,1])
        x = torch.cat((city_emb, affiliate_emb), dim=2)
        out, hidden = self.lstm(x)
        out = self.g(out) 
        return out, hidden

In [43]:
lstm = LSTM(50, 100)
lstm.to(device)

LSTM(
  (city_emb): Embedding(39879, 50, padding_idx=0)
  (affiliate_emb): Embedding(3089, 50, padding_idx=0)
  (lstm): LSTM(100, 100, batch_first=True)
  (g): Linear(in_features=100, out_features=39879, bias=True)
)

In [44]:
loss_fun = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(lstm.parameters(), lr = 0.01)

In [45]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):

    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        output, hidden = model(x)
        if output.shape[1] >= 3:
          output = output[:, 2:, :]
          y_ = y[:, 2:, :]
          output = output.transpose(1,2)
          loss = lossFun(output, y_.squeeze(2))
          total_loss += loss.item()
        else:
          continue
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss

In [46]:
def one_pass_acc4(model, dataloader, num_points, k=4, is_train=True):
    model.eval()
    total_correct = 0
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)
        output, hidden = model(x)
        # print(y.shape)
        # print(output.shape)
        if is_train:
          output = output[:,2:,:]
          y_ = y[:, 2:,: ]
        else:
          output = output[:,-1:,:]
          y_ = y[:, -1:, :]
        # print(output.shape)
        result = torch.topk(output, 4)
        idx = result.indices
        matches = (torch.eq(idx, y_).sum(2)>0).sum(1)>0
        correct = matches.sum().item()
        # print("output shape ", output.shape, "y shape", y.shape)
        # print("Printing output", output)
        # print("Printing y shape", y.shape)
        # print(f"matches = ", matches)
        # print(f"Matches shape = {matches.shape}")
        total_correct += correct
        # print("Correct = ", correct)
        # print(len(y))
        # break
    return (total_correct/num_points)

In [47]:
num_epochs = 10
for epoch in range(num_epochs):
  print(f"Epoch {epoch+1}")
  train_loss = one_pass(lstm, train_dl, optimizer, loss_fun)
  print(f"Train Loss: {train_loss:.4f}")
  test_loss = one_pass(lstm, test_dl, optimizer, loss_fun, backwards=False)
  print(f"Test Loss: {test_loss:.4f}")
  train_acc = one_pass_acc4(lstm, train_dl, len(train_ds), k=4)
  print(f"Train Accuracy: {train_acc:.4f}")
  test_acc = one_pass_acc4(lstm, test_dl, len(test_ds), k=4, is_train=False)
  print(f"Test Accuracy: {test_acc}")

Epoch 1


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 6.6431


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 7.4491


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4535


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.35032550240588733
Epoch 2


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 6.9166


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 7.6601


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4633


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.3549957543164449
Epoch 3


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.0587


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 7.7850


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4790


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.367378998018681
Epoch 4


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.1746


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 8.0056


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4852


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.3720209453722049
Epoch 5


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.2669


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 8.2457


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4825


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.36967166713840927
Epoch 6


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.4102


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 8.4684


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4792


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.36763373903198415
Epoch 7


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.4949


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 8.6594


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4881


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.3774412680441551
Epoch 8


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.6276


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 8.8727


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4817


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.3705774129634871
Epoch 9


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.7332


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 9.1775


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4771


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.3696292103028588
Epoch 10


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Loss: 7.8601


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Loss: 9.3561


HBox(children=(FloatProgress(value=0.0, max=54201.0), HTML(value='')))


Train Accuracy: 0.4816


HBox(children=(FloatProgress(value=0.0, max=17665.0), HTML(value='')))


Test Accuracy: 0.370676478913105
