In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from geopy import distance
import pickle
from utils_dump_load import load_from_pickle, dump_to_pickle

<h1> EDA

<h2> Loading data

In [2]:
pickle_filename = "data/data.pkl"
df_train, df_test = load_from_pickle(pickle_filename)

<h2> Feature Selection

In [3]:
features_not_in_test = []
for feature_name in df_train.columns:
    if feature_name not in df_test.columns:
        features_not_in_test.append(feature_name)
features_not_in_test

['Arrival at Destination - Day of Month',
 'Arrival at Destination - Weekday (Mo = 1)',
 'Arrival at Destination - Time',
 'Time from Pickup to Arrival']

In [4]:
steps = ["Placement", "Confirmation", "Arrival at Pickup", "Pickup"]

In [5]:
features_to_drop = ["Vehicle Type", "geospeed"]
#option1
#features_to_drop +=  ["User Id", "Rider Id", "Pickup - Time"]
#option2
#features_to_drop += ["Pickup Lat", "Pickup Long", "Destination Lat", "Destination Long"]
#option3
#features_to_drop += ['Platform Type', 'Personal or Business', "Precipitation in millimeters"]

In [6]:
for step in steps:
    #Let's keep Pickup data (last step prior run)
    if step not in ['Pickup', "Arrival at Destination"]:
        #features_to_drop.append(step + " - " + "Time")
        #features_to_drop.append(step + " - " + "Delta")
        #features_to_drop.append(step + " - " + "Weekday (Mo = 1)")
        #features_to_drop.append(step + " - Weekend")
        features_to_drop.append(step + " - " + "Day of Month")
        pass

In [7]:
features_to_drop

['Vehicle Type',
 'geospeed',
 'Placement - Day of Month',
 'Confirmation - Day of Month',
 'Arrival at Pickup - Day of Month']

In [8]:
to_drop = features_to_drop + features_not_in_test

<h2> Scalling data

In [9]:
features_to_scale = []
features_not_scaled = []
for feature_name in df_train.columns:
    if df_train[feature_name].dtype == np.float64 and feature_name not in to_drop:
        features_to_scale.append(feature_name)
    elif feature_name not in to_drop:
        features_not_scaled.append(feature_name)

In [10]:
features_to_scale

['Placement - Time',
 'Confirmation - Time',
 'Arrival at Pickup - Time',
 'Pickup - Time',
 'Distance (KM)',
 'Temperature',
 'Precipitation in millimeters',
 'Pickup Lat',
 'Pickup Long',
 'Destination Lat',
 'Destination Long',
 'No_Of_Orders',
 'Age',
 'Average_Rating',
 'No_of_Ratings',
 'speed',
 'Placement - Delta',
 'Confirmation - Delta',
 'Arrival at Pickup - Delta',
 'geopy_distance']

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train_scaled = scaler.fit_transform(df_train[features_to_scale])
df_test_scaled = scaler.transform(df_test[features_to_scale])

In [12]:
df_train_scaled = pd.DataFrame(df_train_scaled, index=df_train.index, columns=features_to_scale)
df_test_scaled = pd.DataFrame(df_test_scaled, index=df_test.index, columns=features_to_scale)

In [13]:
df_train = pd.concat([df_train_scaled, df_train[features_not_scaled + to_drop]], axis=1)
df_test = pd.concat([df_test_scaled, df_test[features_not_scaled + features_to_drop]], axis=1)

<h2> Defining our features X and target y

In [14]:
X = df_train.drop(to_drop, axis=1)
y = df_train["Time from Pickup to Arrival"]
X.shape, y.shape

((21201, 34), (21201,))

<h1> Apply ML model

<h2> Defining X_train, X_test

In [15]:
from sklearn.model_selection import train_test_split, KFold,cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

<h2> Benchmark model

In [16]:
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import HuberRegressor,SGDRegressor,ElasticNet, Ridge,LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
from catboost import CatBoostRegressor

In [17]:
def rmsle_cv(model,X_train,y_train,n_folds=5):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(np.mean(rmse))

In [18]:
models = [RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, LGBMRegressor, ElasticNet, 
          LinearRegression, HuberRegressor, Ridge, KNeighborsRegressor, ExtraTreesRegressor]

In [19]:
models_sel = [RandomForestRegressor, GradientBoostingRegressor, LGBMRegressor, LinearRegression, Ridge]

In [20]:
def benchmark_models():
    for Model in models_sel:
        #Model fit on full data
        model = Model()
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        err1 = rmsle_cv(model, X_train, y_train)
        err2 = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f"{model_name} fit on full data :  cross val loss rmse => {err1}")
        print(f"{model_name} fit on full data :  single rmse => {err2}")

In [21]:
benchmark_models()



RandomForestRegressor fit on full data :  cross val loss rmse => 787.4894169995938
RandomForestRegressor fit on full data :  single rmse => 787.5271812527986
GradientBoostingRegressor fit on full data :  cross val loss rmse => 735.2401302662322
GradientBoostingRegressor fit on full data :  single rmse => 732.9680836797106
LGBMRegressor fit on full data :  cross val loss rmse => 726.0650091813766
LGBMRegressor fit on full data :  single rmse => 742.503061786367
LinearRegression fit on full data :  cross val loss rmse => 773.7922139705654
LinearRegression fit on full data :  single rmse => 770.2347624505123
Ridge fit on full data :  cross val loss rmse => 773.721491578433
Ridge fit on full data :  single rmse => 770.2337761789245


<h2> CatBoost model

In [22]:
cat_b =  CatBoostRegressor(loss_function='RMSE', silent=True)

In [23]:
cat_b.fit(X_train, y_train)
y_pred = cat_b.predict(X_test)

In [24]:
cat_b.best_score_

{'learn': {'RMSE': 683.3207267275733}}

In [25]:
order = cat_b.feature_importances_.argsort()[::-1]
list(zip(X_train.columns[order], cat_b.feature_importances_[order]))

[('Distance (KM)', 27.43961195606389),
 ('geopy_distance', 22.87356676073456),
 ('speed', 13.41253018457266),
 ('Arrival at Pickup - Delta', 7.405979665947537),
 ('Destination Lat', 4.3322965494038215),
 ('Pickup Lat', 3.641563726130375),
 ('Confirmation - Delta', 3.3795864068206094),
 ('Average_Rating', 2.575744968682698),
 ('Placement - Delta', 2.1192180181058093),
 ('Age', 1.797611020868705),
 ('Destination Long', 1.7388508203494175),
 ('No_of_Ratings', 1.5777616452517915),
 ('No_Of_Orders', 1.527054548093214),
 ('Pickup Long', 1.442022810242505),
 ('User Id', 0.9220051868597816),
 ('Rider Id', 0.5120371367425458),
 ('Temperature', 0.479155701560765),
 ('Arrival at Pickup - Time', 0.4557658202670592),
 ('Placement - Time', 0.2912988544275589),
 ('Pickup - Weekday (Mo = 1)', 0.2732076208529346),
 ('Arrival at Pickup - Weekday (Mo = 1)', 0.24891980244632175),
 ('Placement - Weekday (Mo = 1)', 0.24759806814167032),
 ('Pickup - Time', 0.24262704433386517),
 ('Confirmation - Weekday (Mo 

<h1> Dump to submission file

In [26]:
X__test = df_test.drop(features_to_drop, axis=1)
X__test.shape, X_train.shape

((7068, 34), (16960, 34))

In [27]:
X__test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Placement - Time,7068.0,-0.01236,1.001022,-2.479172,-0.814842,-0.059424,0.777265,3.72028
Confirmation - Time,7068.0,-0.012455,1.000139,-2.424949,-0.82526,-0.0582,0.779518,3.691178
Arrival at Pickup - Time,7068.0,-0.012613,1.000647,-2.314103,-0.814995,-0.05187,0.779206,3.830805
Pickup - Time,7068.0,-0.013608,0.999891,-2.392565,-0.819158,-0.051974,0.784092,3.933809
Distance (KM),7068.0,-0.00839,0.98357,-1.50058,-0.794967,-0.265757,0.616259,6.61397
Temperature,7068.0,-0.009555,0.996161,-2.995054,-0.708433,0.015195,0.641119,2.637986
Precipitation in millimeters,7068.0,-0.002248,0.901526,-1.814123,-0.490136,-0.366933,0.403844,28.242898
Pickup Lat,7068.0,-0.018782,0.984852,-4.983951,-0.646787,0.005523,0.793129,4.363618
Pickup Long,7068.0,-0.003092,1.01308,-4.201462,-0.711423,-0.107936,0.492424,4.797827
Destination Lat,7068.0,0.009357,0.983148,-4.022419,-0.526672,-0.018319,0.612372,7.246856


In [28]:
model_sel1 = GradientBoostingRegressor()
model_sel2 = LGBMRegressor()
model_sel3 = CatBoostRegressor(silent=True)
#model_sel = LinearRegression()
#model_sel = RandomForestRegressor()

In [29]:
model_sel1.fit(X, y)
model_sel2.fit(X, y)
model_sel3.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7fa5c8a31510>

In [30]:
model_sel3.best_score_

{'learn': {'RMSE': 684.3320214726081}}

In [31]:
order = model_sel3.feature_importances_.argsort()[::-1]
list(zip(X_train.columns[order], model_sel3.feature_importances_[order]))

[('Distance (KM)', 27.983112915100037),
 ('geopy_distance', 22.747713964172675),
 ('speed', 12.940341874226945),
 ('Arrival at Pickup - Delta', 6.823206037841023),
 ('Destination Lat', 5.080550920291668),
 ('Pickup Lat', 3.6033165429441176),
 ('Confirmation - Delta', 3.435238099175831),
 ('Average_Rating', 2.68942023213255),
 ('Destination Long', 2.129027251837533),
 ('Age', 1.7867929566505225),
 ('Placement - Delta', 1.7178973084184863),
 ('No_Of_Orders', 1.5455645649187506),
 ('No_of_Ratings', 1.4574286670426124),
 ('Pickup Long', 1.4470836130730478),
 ('User Id', 1.1918816740815337),
 ('Rider Id', 0.4771075883883025),
 ('Temperature', 0.43332453377430485),
 ('Confirmation - Weekday (Mo = 1)', 0.3401249291196521),
 ('Confirmation - Time', 0.30025561618468843),
 ('Arrival at Pickup - Time', 0.2940873370006031),
 ('Placement - Weekday (Mo = 1)', 0.27497254596044957),
 ('Pickup - Day of Month', 0.23405812116187502),
 ('Pickup - Weekday (Mo = 1)', 0.20148869052216392),
 ('Placement - Tim

In [32]:
y__pred1 = model_sel1.predict(X__test)
y__pred2 = model_sel2.predict(X__test)
y__pred3 = model_sel3.predict(X__test)

y__pred = np.mean([y__pred2, y__pred3], axis=0)

In [33]:
submit_file = "data/SampleSubmission.csv"
df_submit = pd.read_csv(submit_file, index_col="Order_No")
df_submit.index = df_test.index
df_submit["Time from Pickup to Arrival"] = y__pred
df_submit.head()

Unnamed: 0_level_0,Time from Pickup to Arrival
Order No,Unnamed: 1_level_1
Order_No_19248,1729.328993
Order_No_2699,3188.853713
Order_No_21486,1907.525266
Order_No_19336,2549.006641
Order_No_20374,2931.753184


In [34]:
df_submit.to_csv("to_submit/SampleSubmission_mean.csv")

<h1> to be improved

- Improve CatBoost Hyperparameters