In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
from torch import nn
import seaborn as sns
from sklearn import preprocessing
from scipy.special import boxcox,inv_boxcox

In [2]:
# Read the training table and the public test table from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test_public.csv')
)
# Report (rows, columns) of each table as a quick sanity check.
for frame in (train_data, test_data):
    print(frame.shape)

(1710670, 9)
(320, 8)


In [3]:
# Discard every training row that is flagged as having missing data.
flagged = train_data['MISSING_DATA'] == True
train_data = train_data.drop(index=train_data[flagged].index)
print(train_data.shape)

# Stack train and test features into one frame so they are encoded identically.
# The first column of both tables is skipped, as is the last column of the
# training table (presumably the trip id and the POLYLINE target -- confirm).
train_part = train_data.iloc[:, 1:-1]
test_part = test_data.iloc[:, 1:]
all_features = pd.concat([train_part, test_part])
print(all_features.shape)

(1710660, 9)
(1710980, 7)


In [4]:
# Preview the first rows of the combined train + test feature frame.
all_features.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA
0,C,,,20000589,1372636858,A,False
1,B,,7.0,20000596,1372637303,A,False
2,C,,,20000320,1372636951,A,False
3,C,,,20000520,1372636854,A,False
4,C,,,20000337,1372637091,A,False


In [5]:
# Cast TAXI_ID and ORIGIN_STAND to strings so they are treated as categorical,
# then z-score every column whose dtype is not 'object'.
for categorical in ('TAXI_ID', 'ORIGIN_STAND'):
    all_features[categorical] = all_features[categorical].astype(str)

non_object = all_features.dtypes != 'object'
numeric_features = non_object[non_object].index


def _standardize(column):
    """Centre a column on its mean and divide by its standard deviation."""
    centred = column - column.mean()
    return centred / (column.std())


all_features[numeric_features] = all_features[numeric_features].apply(_standardize)
all_features.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA
0,C,,,20000589,-1.740697,A,
1,B,,7.0,20000596,-1.740649,A,
2,C,,,20000320,-1.740687,A,
3,C,,,20000520,-1.740697,A,
4,C,,,20000337,-1.740672,A,


In [6]:
# Create one-hot columns for the discrete (string-valued) features.
# The dummy dtype is pinned to uint8: pandas >= 2.0 produces bool columns by
# default, and a frame mixing bool with the float TIMESTAMP column yields an
# object array from `.values`, which torch.tensor cannot convert later on.
all_features = pd.get_dummies(all_features, dummy_na=False, dtype=np.uint8)
all_features.head()

Unnamed: 0,ORIGIN_CALL,TIMESTAMP,MISSING_DATA,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,ORIGIN_STAND_1.0,ORIGIN_STAND_10.0,ORIGIN_STAND_11.0,ORIGIN_STAND_12.0,...,TAXI_ID_20000904,TAXI_ID_20000911,TAXI_ID_20000931,TAXI_ID_20000940,TAXI_ID_20000941,TAXI_ID_20000969,TAXI_ID_20000970,TAXI_ID_20000980,TAXI_ID_20000981,DAY_TYPE_A
0,,-1.740697,,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,,-1.740649,,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,,-1.740687,,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,,-1.740697,,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,,-1.740672,,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# DAY_TYPE only ever takes the value 'A', so its single dummy column carries no
# information; MISSING_DATA and ORIGIN_CALL are discarded along with it.
# Any NaN left in the remaining columns is replaced by 0.
unused_columns = ['MISSING_DATA', 'DAY_TYPE_A', 'ORIGIN_CALL']
all_features = all_features.drop(columns=unused_columns).fillna(0)
all_features.head()
# Optional correlation heatmap of the feature frame (disabled):
#def draw_corr_picture(X):
#    corrmat = X.corr()
#    plt.subplots(figsize=(12,12))
#    sns.heatmap(corrmat,vmax=0.9,square=True,cmap='Blues')
#    plt.show()

#draw_corr_picture(all_features)

Unnamed: 0,TIMESTAMP,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,ORIGIN_STAND_1.0,ORIGIN_STAND_10.0,ORIGIN_STAND_11.0,ORIGIN_STAND_12.0,ORIGIN_STAND_13.0,ORIGIN_STAND_14.0,...,TAXI_ID_20000903,TAXI_ID_20000904,TAXI_ID_20000911,TAXI_ID_20000931,TAXI_ID_20000940,TAXI_ID_20000941,TAXI_ID_20000969,TAXI_ID_20000970,TAXI_ID_20000980,TAXI_ID_20000981
0,-1.740697,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.740649,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.740687,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.740697,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.740672,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# The first n_train rows of all_features are the training rows (they were
# concatenated first); everything after them belongs to the test set.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features.iloc[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].values, dtype=torch.float32)

# Number of coordinate points per trip: POLYLINE is a string, so the count of
# "[" minus the outer bracket gives the number of points; it is floored at 0.
n_points = train_data["POLYLINE"].str.count(r"\[").sub(1).clip(lower=0)
# Hand torch the underlying NumPy array rather than the Series itself: the
# Series index has gaps after the MISSING_DATA rows were dropped, so positional
# access through the Series is not reliable (and is element-by-element slow).
train_label = torch.tensor(n_points.values, dtype=torch.int32)

In [9]:
# Convert the point count into a duration: 15 units per gap between consecutive
# points (presumably a 15-second sampling interval -- confirm against the
# dataset description). The gap count is clamped at 0 so that trips with no
# points get a duration of 0 instead of a negative value.
# NOTE: this cell rewrites train_label in place; run it exactly once per kernel.
train_label = 15 * torch.clamp(train_label - 1, min=0)
#sns.histplot(train_label)
#plt.show()

In [10]:
#Tasks and something to think about

#I ignored the caller ID (ORIGIN_CALL) and made TAXI_ID a categorical feature (which creates 400 columns and seems inefficient); there may be better ways to do that

#Do we need to remove the rows with 0, 1, ... coordinate points in POLYLINE? (A trip lasting under 1 minute seems weird)

#building model

#Use sklearn for something like gradient boosting or random forest, manually build a neural network

#Model Selection: Gradient Boosting,Random Forest,Extra Randomized Trees, SVM, Linear Regression, Logistic Regression, Neural Networks

#Try Ensemble Learning, maybe stacking different models

In [13]:
#I implement an MLP first
import torch.nn as nn
import torch.nn.functional as F


class MLP(nn.Module):
    """Fully connected regressor that maps each input row to a single value.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.

    Args:
        in_features: Width of each input row. Defaults to 516, the value that
            used to be hard-coded; pass the real feature count if it differs.
        hidden_features: Output width of the first linear layer (default 80).
        bottleneck_features: Output width of the second linear layer (default 10).
        dropout: Dropout probability applied after each ReLU (default 0.5).
    """

    def __init__(self, in_features=516, hidden_features=80,
                 bottleneck_features=10, dropout=0.5):
        super().__init__()
        # The attribute keeps its original name `layer1` so that parameter keys
        # in state_dict() are unchanged.
        self.layer1 = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=hidden_features),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=hidden_features, out_features=bottleneck_features),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=bottleneck_features, out_features=1)
        )

    def forward(self, x):
        """Run the input through the layer stack; the last dimension of the result is 1."""
        return self.layer1(x)

In [14]:
import torch.optim as optim

# Training hyper-parameters.
batch_size = 32
num_epochs = 10
learning_rate = 0.01

# Network, mean-squared-error objective, and an Adam optimiser over all of the
# network's parameters.
model = MLP()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
def log_rmse(model, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        model: Callable torch module; it is evaluated on ``features`` in eval
            mode and without gradient tracking, then restored to its previous
            training/eval mode.
        features: Input tensor passed to ``model``.
        labels: Target tensor with one value per prediction (any numeric dtype).

    Returns:
        The metric as a Python float.

    Predictions and labels are both clamped to a minimum of 1 before the
    logarithm so that non-positive values cannot produce inf/NaN.
    """
    was_training = model.training
    model.eval()  # disable dropout while measuring
    try:
        with torch.no_grad():
            raw_preds = model(features)
    finally:
        model.train(was_training)

    # Flatten both sides: an (N, 1) prediction against an (N,) label would
    # otherwise broadcast to an (N, N) difference.
    clipped_preds = torch.clamp(raw_preds, min=1).reshape(-1)
    clipped_labels = torch.clamp(labels.to(clipped_preds.dtype), min=1).reshape(-1)

    squared_error = (torch.log(clipped_preds) - torch.log(clipped_labels)) ** 2
    rmse = torch.sqrt(squared_error.mean())
    return rmse.item()