## Load data

In [24]:
import pandas as pd

In [25]:
new_data = pd.read_csv("feature_eng_cat.csv")
new_data.drop(["Unnamed: 0"],axis=1,inplace=True)

In [26]:
new_data.columns

Index(['LEN', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B',
       'CALL_TYPE_C', 'MORNING', 'AFTERNOON', 'EVENING', 'NIGHT', 'S1', 'S2',
       'S3', 'S4', 'YR_DUMMY', 'CALL_A_MORN', 'CALL_A_AFTER', 'CALL_A_EVE',
       'CALL_A_NIT', 'CALL_B_MORN', 'CALL_B_AFTER', 'CALL_B_EVE', 'CALL_B_NIT',
       'CALL_C_MORN', 'CALL_C_AFTER', 'CALL_C_EVE', 'CALL_C_NIT', 'CALL_A_S1',
       'CALL_A_S2', 'CALL_A_S3', 'CALL_A_S4', 'CALL_B_S1', 'CALL_B_S2',
       'CALL_B_S3', 'CALL_B_S4', 'CALL_C_S1', 'CALL_C_S2', 'CALL_C_S3',
       'CALL_C_S4', 'TAXI_ID_CAT', 'ORIGIN_CALL_CAT', 'ORIGIN_STAND_CAT',
       'DISTANCE'],
      dtype='object')

## Split train and test set

In [27]:
X = new_data.drop(['LEN'],axis=1)
y = new_data['LEN']

In [28]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()
one_hot.fit(X[[ 'MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT']].values)

one_hot_output = one_hot.transform(X[['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT']].values)


columns=[f"MON_{i}" for i in range(1,13)] + [f"DAT_{i}" for i in range(1,32)] + [f"HR_{i}" for i in range(24)] + [f"WK_{i}" for i in range(7)] + [f"TAXI_{i}" for i in range(442)] + [f"STAND_{i}" for i in range(64)] + [f"CALL_{i}" for i in range(254)]

one_hot_frame = pd.DataFrame(one_hot_output.toarray(),columns=columns)
df_processed = X.join(one_hot_frame)

df_processed.drop(['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT'],axis=1,inplace=True)

In [29]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df_processed,y,test_size=0.2)

## Apply Deep Learning Model

In [30]:
import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch.optim as optim

* Load Dataset

In [31]:
category_names = ['CALL_TYPE_A', 'CALL_TYPE_B',
       'CALL_TYPE_C', 'DAY_TYPE_A', 'DAY_TYPE_B', 'DAY_TYPE_C', 'MISSING',
       'CALL_A_DAY_A', 'CALL_A_DAY_B', 'CALL_A_DAY_C', 'CALL_B_DAY_A',
       'CALL_B_DAY_B', 'CALL_B_DAY_C', 'MORNING', 'AFTERNOON', 'EVENING',
       'NIGHT', 'S1', 'S2', 'S3', 'S4', 'YR_DUMMY',
       'CALL_A_MORN', 'CALL_A_AFTER', 'CALL_A_EVE', 'CALL_A_NIT',
       'CALL_B_MORN', 'CALL_B_AFTER', 'CALL_B_EVE', 'CALL_B_NIT',
       'CALL_C_MORN', 'CALL_C_AFTER', 'CALL_C_EVE', 'CALL_C_NIT', 'CALL_A_S1',
       'CALL_A_S2', 'CALL_A_S3', 'CALL_A_S4', 'CALL_B_S1', 'CALL_B_S2',
       'CALL_B_S3', 'CALL_B_S4', 'CALL_C_S1', 'CALL_C_S2', 'CALL_C_S3',
       'CALL_C_S4', 'DAY_A_MORN', 'DAY_A_AFTER', 'DAY_A_EVE', 'DAY_A_NIT',
       'DAY_B_MORN', 'DAY_B_AFTER', 'DAY_B_EVE', 'DAY_B_NIT', 'DAY_C_MORN',
       'DAY_C_AFTER', 'DAY_C_EVE', 'DAY_C_NIT', 'DAY_A_S1', 'DAY_A_S2',
       'DAY_A_S3', 'DAY_A_S4', 'DAY_B_S1', 'DAY_B_S2', 'DAY_B_S3', 'DAY_B_S4',
       'DAY_C_S1', 'DAY_C_S2', 'DAY_C_S3', 'DAY_C_S4', 'CALL_A_MISS',
       'CALL_B_MISS', 'CALL_C_MISS', 'DAY_A_MISS', 'DAY_B_MISS', 'DAY_C_MISS','TAXI_ID_CAT','ORIGIN_CALL_CAT', 'ORIGIN_STAND_CAT']
continous_names = ['HR_SIN', 'HR_COS', 'DAY_SIN','DAY_COS', 'WK_SIN', 'WK_COS', 'MON_SIN', 'MON_COS']

In [32]:
# train_dataset, test_dataset = TensorDataset(torch.from_numpy(x_train[category_names].values.astype(np.compat.long)),
#                                             torch.from_numpy(x_train[continous_names].values.astype(np.float32)),
#                                             torch.from_numpy(y_train.values)), \
#                               TensorDataset(torch.from_numpy(x_test[category_names].values.astype(np.compat.long)),
#                                             torch.from_numpy(x_test[continous_names].values.astype(np.float32)),
#                                             torch.from_numpy(y_test.values))


train_dataset, test_dataset = TensorDataset(torch.from_numpy(x_train.values.astype(np.float32)),
                                            torch.from_numpy(y_train.values)), \
                              TensorDataset(torch.from_numpy(x_test.values.astype(np.float32)),
                                            torch.from_numpy(y_test.values))              


In [33]:
train_dataloader, test_dataloader = DataLoader(train_dataset, batch_size=2048, shuffle=True), \
                                                      DataLoader(test_dataset, batch_size=2048, shuffle=False)

In [39]:
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self,categories=0,num_continuous=0,dims=64,num_special_tokens=2):
        super(MLP,self).__init__()

        # self.num_unique_categories = sum(categories)
        # total_tokens = self.num_unique_categories + num_special_tokens
        # # for automatically offsetting unique category ids to the correct position in the categories embedding table

        # if self.num_unique_categories > 0:
        #     categories_offset = F.pad(torch.tensor(list(categories)), (1, 0), value = num_special_tokens)
        #     categories_offset = categories_offset.cumsum(dim = -1)[:-1]
        #     self.register_buffer('categories_offset', categories_offset)

        #     # categorical embedding

        #     self.categorical_embeds = nn.Embedding(total_tokens, dims)

        # self.fc1 = nn.Linear(dims*len(categories)+num_continuous,2048)
        self.fc1 = nn.Linear(871,2048)
        self.fc2 = nn.Linear(2048,1024)
        self.fc3 = nn.Linear(1024,512)
        self.fc4 = nn.Linear(512,128)
        self.fc5 = nn.Linear(128,32)
        self.fc6 = nn.Linear(32,1)
        self.relu = nn.LeakyReLU()


    def forward(self,x):

        # x = self.categorical_embeds(x).reshape(x.shape[0],-1)
        # if len(x.shape) == 1:
        #     x = torch.concat([x,x1.reshape(1,-1)],dim=1)
        # else:
        #     x = torch.concat([x,x1],dim=1)
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.fc4(x)
        x = self.relu(x)
        x = self.fc5(x)
        x = self.relu(x)
        x = self.fc6(x)
        return x

In [35]:
def train(model,optimizer,lr_scheduler,train_loader,val_loader,epochs,output_path,device):
    model = model.to(device)

    loss_fn = nn.MSELoss()
    best_loss = np.inf
    with tqdm(total=epochs, desc=f'Training', postfix=dict, mininterval=0.3) as pbar:
        total_avg_train_loss = []
        total_avg_val_loss = []
        for epoch in range(epochs):
            train_total_loss = []
            model.train()
            for i,(x,label) in enumerate(train_loader):
                x = x.to(device)
                # x1 = x1.long().to(device)
                label = label.to(device)
                model_output = model(x).squeeze(1)

                loss = loss_fn(model_output,label.float())

                optimizer.zero_grad()
                loss.backward()
                # torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=5, norm_type=2)
                optimizer.step()
                train_total_loss.append(loss.item())

            avg_train_loss = np.mean(train_total_loss)


            val_total_loss = []
            model.eval()
            for i,(x,label) in enumerate(val_loader):
                x = x.to(device)
                # x1 = x1.long().to(device)
                label = label.to(device)
                model_output = model(x).squeeze(1)
                loss = loss_fn(model_output,label.float())


                val_total_loss.append(loss.item())


            avg_val_loss = np.mean(val_total_loss)


            pbar.set_postfix(**{'train_rmse_loss': round(np.sqrt(avg_train_loss),5),
                                'val_rmse_loss': round(np.sqrt(avg_val_loss),5)})
            pbar.update(1)

            if best_loss > avg_val_loss:
                best_loss = avg_val_loss
                save_name = os.path.join(output_path,f"Taxi_MLP_Epoch{epoch}_rmse{round(np.sqrt(avg_val_loss),5)}_withid.pth")
                torch.save(model.state_dict(),save_name)

            lr_scheduler.step()

            total_avg_train_loss.append(avg_train_loss)
            total_avg_val_loss.append(avg_val_loss)
    return total_avg_train_loss,total_avg_val_loss

In [40]:
from fvcore.nn import FlopCountAnalysis, parameter_count_table

# categories =  [2] * (len(category_names)-3) + [448] + [57106] + [64]
# num_continuous = len(continous_names)

# model = MLP(categories=categories,num_continuous=num_continuous)
model = MLP()
for x,y in train_dataloader:
    print(x.shape)
    dummy_input = (x)
    break

# FLOPs
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs: ", flops.total()/1e9)

# parameters
print(parameter_count_table(model))

Unsupported operator aten::leaky_relu encountered 5 time(s)


torch.Size([2048, 871])
FLOPs:  9.164619776
| name         | #elements or shape   |
|:-------------|:---------------------|
| model        | 4.5M                 |
|  fc1         |  1.8M                |
|   fc1.weight |   (2048, 871)        |
|   fc1.bias   |   (2048,)            |
|  fc2         |  2.1M                |
|   fc2.weight |   (1024, 2048)       |
|   fc2.bias   |   (1024,)            |
|  fc3         |  0.5M                |
|   fc3.weight |   (512, 1024)        |
|   fc3.bias   |   (512,)             |
|  fc4         |  65.7K               |
|   fc4.weight |   (128, 512)         |
|   fc4.bias   |   (128,)             |
|  fc5         |  4.1K                |
|   fc5.weight |   (32, 128)          |
|   fc5.bias   |   (32,)              |
|  fc6         |  33                  |
|   fc6.weight |   (1, 32)            |
|   fc6.bias   |   (1,)               |


In [41]:

# categoryies.insert(0,448)

device = torch.device("cuda:0")

optimizer = optim.Adam(model.parameters(), lr=0.001,weight_decay=0.003)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.92)

epochs = 50
output_path = "./model/"

resume = False
if resume:
    ckpt = torch.load("/data/aaron/Homework/Taxi/model/Taxi_MLP_Epoch9_rmse646.63632_withid.pth")
    model.load_state_dict(ckpt)

train_loss,val_loss = train(model,optimizer,lr_scheduler,train_dataloader,test_dataloader,epochs,output_path,device)

Training:  12%|█▏        | 6/50 [03:15<23:51, 32.54s/it, train_rmse_loss=630, val_rmse_loss=632]


KeyboardInterrupt: 

## Test data

In [42]:
new_data = pd.read_csv("test_public_features_cat.csv")
new_data.drop(["Unnamed: 0"],axis=1,inplace=True)

In [43]:
one_hot_output = one_hot.transform(X[['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT']].values)
one_hot_frame = pd.DataFrame(one_hot_output.toarray(),columns=columns)

test_data = new_data.join(one_hot_frame)
test_data.drop(['MON','DAY', 'HR', 'WK','TAXI_ID_CAT','ORIGIN_STAND_CAT','ORIGIN_CALL_CAT'],axis=1,inplace=True)

In [49]:
# predict the results
ckpt = torch.load("/data/aaron/Homework/Taxi/model/Taxi_MLP_Epoch4_rmse630.45091_withid.pth")
# model = MLP()
# model.load_state_dict(ckpt)
model.to(device)
model.eval()
outputs = []
with torch.no_grad():
    for i in range(320):
        data = test_data.iloc[i,:]
        x = torch.from_numpy(data.values.astype(np.float32)).reshape(1,-1)
        x = x.to(device).reshape(1,-1)
        print(x.shape)
        output = model(x).squeeze(1)
        outputs.append(output.detach().cpu().item())

torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1, 871])
torch.Size([1

In [None]:
# load sample

sample = pd.read_csv("/data/aaron/Homework/Taxi/sample_xgboost.csv",index_col="TRIP_ID")
# sample.drop(["Unnamed: 0"],axis=1,inplace=True)

sample['TRAVEL_TIME'] = outputs

sample.to_csv("mlp_public_new.csv")