### Booking Challenge

In [214]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch

from typing import List
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm
from torchsummary import summary
from torch.nn.utils.rnn import pad_sequence

In [237]:
def preprocess_label_df(df: pd.DataFrame):
    '''
    Build the next-city training frame from a bookings frame indexed by trip id.

    Steps:
      * adds a ``label`` column holding the ``city_id`` of the following
        booking of the same trip
      * removes the last row of every trip (it has no following booking)
      * removes trips with fewer than 3 remaining rows
      * orders trips from longest to shortest (ties keep ascending trip-id order)

    Parameters
    ----------
    df : pd.DataFrame
        Bookings with a ``city_id`` column; the index identifies the trip and
        rows of a trip are expected in chronological order.

    Returns
    -------
    pd.DataFrame
        A new frame with the extra ``label`` column. The input frame is left
        untouched. If no trip is long enough, an empty frame with the same
        columns (including ``label``) is returned.
    '''
    # Shift inside each trip rather than across the whole frame, so a label can
    # never come from a different trip even when trips are interleaved.
    next_city = df.groupby(level=0, sort=False)['city_id'].shift(-1)
    # to_numpy() sidesteps index alignment on the duplicated trip-id index.
    labelled = df.assign(label=next_city.to_numpy())

    # One grouped pass (ascending trip id) instead of one .loc lookup per trip.
    df_chunks = [
        trip.iloc[:-1]
        for _, trip in labelled.groupby(level=0, sort=True)
        if len(trip) - 1 >= 3
    ]
    if not df_chunks:
        # pd.concat refuses an empty list; hand back an empty, well-formed frame.
        return labelled.iloc[0:0]

    # list.sort is stable, so equally long trips stay in ascending trip-id order.
    df_chunks.sort(key=len, reverse=True)
    return pd.concat(df_chunks)


def front_padding_sequence(tensors: List, num_features: int) -> torch.Tensor:
    """
    Front-pad a list of sequences with zeros and stack them into one tensor.

    Parameters
    ----------
    tensors : list of tensors, each of shape (length_i, num_features)
    num_features : width of every sequence step

    Returns
    -------
    torch.Tensor of dtype long and shape (len(tensors), max_length, num_features).
    Every sequence is right-aligned; the leading positions are 0. An empty
    list yields an empty tensor of shape (0, 0, num_features).
    """
    if len(tensors) == 0:
        return torch.zeros(0, 0, num_features, dtype=torch.long)

    max_size = max(len(seq) for seq in tensors)
    # Allocate the result as long up front: staging integer ids in a float32
    # buffer silently rounds values above 2**24.
    padded = torch.zeros(len(tensors), max_size, num_features, dtype=torch.long)
    for row, seq in enumerate(tensors):
        if len(seq):
            # Float inputs are truncated toward zero on assignment, which is
            # what the former trailing .long() did.
            padded[row, max_size - len(seq):] = seq

    return padded


def custom_padding(batch):
    '''
    Collate function: front-pads every sample of a batch to the length of
    the longest sequence in that batch.

    Each sample is a (features, labels) pair. Features are padded with a
    width of 2; labels get a trailing axis and are padded with a width of 1.
    Returns the pair (padded_features, padded_labels).
    '''
    feature_seqs = []
    label_seqs = []
    for sample in batch:
        feature_seqs.append(sample[0])
        # Labels arrive as 1-D sequences; give them a feature axis of size 1.
        label_seqs.append(sample[1].unsqueeze(dim=1))

    padded_features = front_padding_sequence(feature_seqs, 2)
    padded_labels = front_padding_sequence(label_seqs, 1)
    return padded_features, padded_labels

In [197]:
def _read_trips(csv_path):
    """Read a bookings CSV and index it by utrip_id cast to int."""
    trips = pd.read_csv(csv_path)
    trips = trips.set_index('utrip_id')
    # NOTE(review): assumes every utrip_id converts cleanly to a distinct integer — confirm.
    trips.index = trips.index.astype(int)
    return trips


df_train = _read_trips('train_set.csv')
df_test = _read_trips('test_set.csv')

In [198]:
# Attach next-city labels and drop trips that are too short.
# This rebinds df_train, so re-running the cell without reloading the CSV
# would preprocess the already-processed frame a second time.
df_train = preprocess_label_df(df_train)

  0%|          | 0/217684 [00:00<?, ?it/s]

In [199]:
# Same labelling and filtering for the test split.
# This rebinds df_test, so re-running the cell without reloading the CSV
# would preprocess the already-processed frame a second time.
df_test = preprocess_label_df(df_test)

  0%|          | 0/70661 [00:00<?, ?it/s]

### Cast labels to int and renumber utrip_id

In [200]:
# Store the label column of both splits as integers.
for _frame in (df_train, df_test):
    _frame['label'] = _frame['label'].astype(int)

In [201]:
# Renumber the trips of each split as 0, 1, 2, ... in order of first appearance.
utrip_distinct_train = df_train.index.unique()
utrip_train_map = dict(zip(utrip_distinct_train, range(len(utrip_distinct_train))))
df_train = df_train.rename(index=utrip_train_map)

utrip_distinct_test = df_test.index.unique()
utrip_test_map = dict(zip(utrip_distinct_test, range(len(utrip_distinct_test))))
df_test = df_test.rename(index=utrip_test_map)

In [8]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0_level_0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,label
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3635431,2016-04-01,2016-04-02,47319,mobile,9924,Gondal,Gondal,36063
0,3635431,2016-04-02,2016-04-03,36063,mobile,9924,Gondal,Gondal,36063
0,3635431,2016-04-03,2016-04-04,36063,mobile,384,Gondal,Gondal,36063
0,3635431,2016-04-04,2016-04-05,36063,mobile,9924,Gondal,Gondal,3109
0,3635431,2016-04-05,2016-04-06,3109,mobile,9924,Gondal,Gondal,3109


#### Create integer id maps for the categorical columns

In [202]:
def _one_based_codes(values):
    """Assign 1, 2, 3, ... to the entries of `values` in iteration order (0 is left unused)."""
    return {value: code for code, value in enumerate(values, start=1)}


# Every vocabulary is built from the training split only.
distinct_affiliate_id = np.unique(df_train.affiliate_id)
affiliate_id_map = _one_based_codes(distinct_affiliate_id)

distinct_checkin_id = np.unique(df_train.checkin)
checkin_map = _one_based_codes(distinct_checkin_id)

distinct_device_class = np.unique(df_train.device_class)
device_map = _one_based_codes(distinct_device_class)

# Cities seen as a visited city or as a label share one vocabulary.
distinct_city_id = np.unique([df_train.city_id,df_train.label])
city_id_map = _one_based_codes(distinct_city_id)

distinct_booker_country = np.unique(df_train.booker_country)
booker_country_map = _one_based_codes(distinct_booker_country)

distinct_hotel_country = np.unique(df_train.hotel_country)
hotel_country_map = _one_based_codes(distinct_hotel_country)

In [203]:
# Replace the raw training values with their integer codes, column by column.
_train_encoders = (
    ('checkin', checkin_map),
    ('affiliate_id', affiliate_id_map),
    ('city_id', city_id_map),
    ('label', city_id_map),
    ('booker_country', booker_country_map),
)
for _column, _codes in _train_encoders:
    df_train[_column] = df_train[_column].map(_codes.get)

In [204]:
def _encode_with_unknown(column, codes, unknown=0):
    """Map a column through `codes`; values missing from `codes` become `unknown`."""
    return column.map(lambda raw: codes.get(raw, unknown))


# Test values never seen in training fall back to code 0.
df_test['checkin'] = _encode_with_unknown(df_test['checkin'], checkin_map)
df_test['affiliate_id'] = _encode_with_unknown(df_test['affiliate_id'], affiliate_id_map)
df_test['city_id'] = _encode_with_unknown(df_test['city_id'], city_id_map)
df_test['label'] = _encode_with_unknown(df_test['label'], city_id_map)
df_test['booker_country'] = _encode_with_unknown(df_test['booker_country'], booker_country_map)

#### Create a dataset

In [238]:
class BookingDataset(Dataset):
    """
    Map-style dataset serving one trip per item.

    A trip is the set of rows of ``df`` that share an index label. Item ``i``
    is the i-th distinct index label in order of first appearance, so for a
    frame whose trips are numbered 0..n-1 in order, ``ds[i]`` is trip ``i``.

    The trip ids are captured at construction time; build a new dataset if
    the frame's index changes afterwards.
    """

    def __init__(self, df):
        self.df = df
        # Computed once: __len__ is called repeatedly by DataLoader and must
        # not rescan every row.
        self._trip_ids = df.index.unique()

    def __len__(self):
        # Number of trips. max(index) was one short for 0-based trip ids and
        # raised on an empty frame.
        return len(self._trip_ids)

    def __getitem__(self, idx):
        """
        Return ``(features, labels)`` for the trip at position ``idx``.

        features : tensor of shape (trip_length, 2) holding the city_id and
                   affiliate_id columns, in that order
        labels   : float tensor of shape (trip_length,) holding the label column
        """
        trip_id = self._trip_ids[idx]
        trip = self.df.loc[trip_id]
        if trip.ndim == 1:
            # A trip with a single row comes back as a Series; re-select it
            # as a one-row frame so the column access below works uniformly.
            trip = self.df.loc[[trip_id]]

        city_id = torch.tensor(trip['city_id'].values).view(-1, 1)
        affiliate_id = torch.tensor(trip['affiliate_id'].values).view(-1, 1)
        features = torch.cat((city_id, affiliate_id), 1)

        label_id = torch.tensor(trip['label'].values).float()

        return features, label_id

In [239]:
# Wrap the training frame as a Dataset of (features, labels) pairs.
train_ds = BookingDataset(df_train)
# shuffle=False keeps the loader in dataset order; custom_padding front-pads
# the sequences of each batch of 4 to the longest one in that batch.
train_dl = DataLoader(train_ds, batch_size = 4, shuffle = False, collate_fn=custom_padding)

#### Vanilla RNN

In [189]:
# Vanilla RNN using nn.RNN
class Vanilla_RNN(nn.Module):
    """
    Embeds the city and affiliate id of every step, runs a single-layer
    nn.RNN over the sequence and projects every hidden state to the output.

    Parameters
    ----------
    emb_size : size of each of the two embeddings (the RNN input is 2 * emb_size)
    hidden_size : size of the RNN hidden state
    output_size : size of the per-step output
    num_cities, num_affiliates : number of rows of the embedding tables.
        Default to ``len(city_id_map) + 1`` and ``len(affiliate_id_map) + 1``
        taken from the notebook globals (the +1 makes room for index 0,
        which is the padding index).
    """

    def __init__(self, emb_size, hidden_size, output_size,
                 num_cities=None, num_affiliates=None):
        super().__init__()

        if num_cities is None:
            num_cities = len(city_id_map) + 1
        if num_affiliates is None:
            num_affiliates = len(affiliate_id_map) + 1

        self.city_emb = nn.Embedding(num_cities, emb_size, padding_idx=0)
        self.affiliate_emb = nn.Embedding(num_affiliates, emb_size, padding_idx=0)

        # Inputs are laid out (batch, seq, feature). Without batch_first=True
        # nn.RNN would treat the batch axis as time and carry hidden state
        # from one sample into the next.
        self.rnn = nn.RNN(emb_size*2, hidden_size, batch_first=True)

        # the "g" function: linear layer turning the i-th hidden state into
        # the i-th output
        self.g = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        x : long tensor of shape (batch, seq, 2); [..., 0] is the city id and
            [..., 1] the affiliate id.

        Returns ``(out, hidden)`` with ``out`` of shape (batch, seq, output_size)
        and ``hidden`` of shape (1, batch, hidden_size), the state after the
        last step of every sequence.
        """
        city_emb = self.city_emb(x[:, :, 0])
        affiliate_emb = self.affiliate_emb(x[:, :, 1])
        step_inputs = torch.cat((city_emb, affiliate_emb), dim=2)
        out, hidden = self.rnn(step_inputs)
        out = self.g(out)

        return out, hidden

In [190]:
# Positional arguments: emb_size=24, hidden_size=100, output_size=50.
# NOTE(review): if the output layer is meant to score next-city ids, output_size
# presumably has to cover len(city_id_map)+1 classes — confirm.
v_rnn = Vanilla_RNN(24, 100, 50)