<h1 style="text-align:center">Machine Learning</h1>

In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import DMatrix
from xgboost import cv

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from time import time



<h3 style="text-align:center">Combine dataset</h3>

In [16]:
# Pickled inputs: feature tables plus the regression target.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train_df, test_df, target = (
    pd.read_pickle(path) for path in ('train_df.p', 'test_df.p', 'target.p')
)

# OSRM tables; a later cell merges them onto the feature tables by 'id'.
train_OSRM, test_OSRM = (
    pd.read_pickle(path) for path in ('train_OSRM.p', 'test_OSRM.p')
)

In [17]:
# Inspect the (rows, columns) of the training feature table
train_df.shape

(1458621, 16)

In [18]:
# Inspect the (rows, columns) of the OSRM training table, for comparison with train_df
train_OSRM.shape

(1458643, 4)

In [5]:
# Attach the OSRM columns to the feature tables.
# how='right' keeps every row of train_df / test_df; ids without an OSRM match
# get NaN in the OSRM columns.
n_train_rows, n_test_rows = len(train_df), len(test_df)

train_df = train_OSRM.merge(train_df, how='right', on='id')
test_df = test_OSRM.merge(test_df, how='right', on='id')

# Later cells pair train_df rows with `target` by position, so the merge must
# not add or drop rows (duplicate ids in the OSRM tables would add rows).
assert len(train_df) == n_train_rows, "merge changed the number of training rows"
assert len(test_df) == n_test_rows, "merge changed the number of test rows"

In [6]:
# Report the table shapes; the train and target row counts are expected to agree
for caption, table in (
    ("Train shape:", train_df),
    ("Target shape:", target),
    ("Test shape:", test_df),
):
    print(caption, table.shape)

Train shape: (1458621, 19)
Target shape: (1458621, 1)
Test shape: (625134, 18)


In [7]:
# Preview the first rows of the training table
train_df.head()

Unnamed: 0,id,total_distance,total_travel_time,number_of_steps,vendor_id,passenger_count,store_and_fwd_flag,trip_duration,weekend,pickup_pca0,pickup_pca1,pickup_cluster,sqrt_distance,Pickup_NumDayWeek,Month,Pickup_Day,Pickup_Hour,Pickup_Minute,Pickup_NumWeekYear
0,id2875421,2009.1,164.9,5.0,2,1,0,455,0,0.0083,0.017177,67,1.224313,0,3,14,17,24,11
1,id2377394,2513.2,332.0,6.0,1,1,0,663,1,0.007128,-0.012224,49,1.343881,6,6,12,0,43,23
2,id3504673,1779.4,235.8,4.0,2,1,0,429,0,0.037106,-0.030242,94,1.218982,2,4,6,19,32,14
3,id2181028,1614.9,140.1,5.0,2,1,0,435,1,-0.001287,0.042269,84,1.090378,5,3,26,13,30,12
4,id0801584,1393.5,189.4,5.0,2,3,0,443,1,0.009499,-0.008546,64,1.048452,5,1,30,22,1,4


<h3 style="text-align:center">Split-out dataset</h3>

In [8]:
# Evaluation settings shared by the cells below
seed = 46         # passed as random_state to the split and as XGBoost's 'seed'
test_size = 0.20  # fraction of rows held out by train_test_split
num_folds = 10    # not referenced by the cells visible in this section
scoring = 'r2'    # not referenced by the cells visible in this section

In [19]:
# Feature matrix: every column of train_df except the row id and trip_duration
X = (
    train_df
    .drop(columns=['id', 'trip_duration'])
    .values
)
# Flatten the target table into a 1-D label vector
Y = target.values.ravel()

# Hold out `test_size` of the rows for validation; seeded for repeatability
X_train, X_val, y_train, y_val = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)


In [10]:
# Show the feature/label shapes of the training split, then of the validation split
train_shapes = (X_train.shape, y_train.shape)
val_shapes = (X_val.shape, y_val.shape)
print(*train_shapes, "\n", *val_shapes)

(1166896, 17) (1166896,) 
 (291725, 17) (291725,)


<h3 style="text-align:center">Tuning Algorithm</h3>

In [26]:
# Wrap the arrays in XGBoost's DMatrix container:
# xgdmat carries the training features with their labels,
# testdmat carries only the validation features (used for predict).
xgdmat = DMatrix(X_train, label=y_train)
testdmat = DMatrix(X_val)

In [15]:
start = time()

# First attempt: 'eta' is XGBoost's learning rate, here left at 1
our_params = {
    'eta': 1,
    'seed': seed,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 5,
    'min_child_weight': 50,
}

final_gb = xgb.train(our_params, xgdmat, num_boost_round=100)

elapsed = round(time() - start, 3)
print("elapsed time:", elapsed, "s")

# Score the held-out validation split; `accuracy` holds the R^2 value
y_pred = final_gb.predict(testdmat)
accuracy = r2_score(y_val, y_pred)
print(accuracy)

elapsed time: 80.233 s
-0.0122849496736


In [23]:
# Labelled DMatrix objects for both splits, defined here so this cell runs on
# a fresh kernel (dtrain / dvalid are not assigned by any earlier cell).
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_val, label=y_val)

# (DMatrix, name) pairs that xgb.train evaluates after every boosting round
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

-0.0001511116325


In [33]:
start = time()

# Second attempt: smaller learning rate, smaller min_child_weight, more rounds
our_params = {
    'eta': 0.05,
    'seed': seed,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 5,
    'min_child_weight': 10,
}

final_gb = xgb.train(our_params, xgdmat, num_boost_round=300)

elapsed = round(time() - start, 3)
print("elapsed time:", elapsed, "s")

# Score the held-out validation split; `accuracy` holds the R^2 value
y_pred = final_gb.predict(testdmat)
accuracy = r2_score(y_val, y_pred)
print(accuracy)

elapsed time: 229.021 s
0.687916419522


In [29]:
# Labelled DMatrix objects for the training and validation splits
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_val, label=y_val)

# (DMatrix, name) pairs; the names prefix the per-round metrics in the log
watchlist = [
    (dtrain, 'train'),
    (dvalid, 'valid'),
]

In [33]:
start = time()

our_params = {
    'eta': 0.05,
    'seed': seed,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 5,
    'min_child_weight': 10,
}

# num_boost_round and evals are passed by keyword: recent xgboost releases
# accept `evals` as a keyword argument only, and the keyword form also works
# on older releases.
final_gb = xgb.train(
    our_params,
    xgdmat,
    num_boost_round=300,
    evals=watchlist,
    # Stop once the last watchlist entry ('valid') has not improved for 100 rounds
    early_stopping_rounds=100,
    # verbose_eval=10 would log every 10th round instead of every round
)

print("elapsed time:", round(time() - start, 3), "s")

# Score the held-out validation split; `accuracy` holds the R^2 value.
# NOTE(review): depending on the xgboost version, predict() may use every
# boosted round rather than the best early-stopping iteration - confirm.
y_pred = final_gb.predict(testdmat)
accuracy = r2_score(y_val, y_pred)
print(accuracy)

[0]	train-rmse:5.7219	valid-rmse:5.72136
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[1]	train-rmse:5.43799	valid-rmse:5.43749
[2]	train-rmse:5.16843	valid-rmse:5.16795
[3]	train-rmse:4.91242	valid-rmse:4.91198
[4]	train-rmse:4.66933	valid-rmse:4.66894
[5]	train-rmse:4.43855	valid-rmse:4.43818
[6]	train-rmse:4.2194	valid-rmse:4.21907
[7]	train-rmse:4.01348	valid-rmse:4.01321
[8]	train-rmse:3.81581	valid-rmse:3.81558
[9]	train-rmse:3.62817	valid-rmse:3.62797
[10]	train-rmse:3.4501	valid-rmse:3.44995
[11]	train-rmse:3.2811	valid-rmse:3.28102
[12]	train-rmse:3.12071	valid-rmse:3.12067
[13]	train-rmse:2.9685	valid-rmse:2.96849
[14]	train-rmse:2.82581	valid-rmse:2.82585
[15]	train-rmse:2.6887	valid-rmse:2.68878
[16]	train-rmse:2.5587	valid-rmse:2.55883
[17]	train-rmse:2.4354	valid-rmse:2.43558
[18]	train-rmse:2.31848	valid-rmse:2.31871
[19]	train-rmse:2.20769	valid-rmse:2.20797
[20]	train-r

[180]	train-rmse:0.446603	valid-rmse:0.451419
[181]	train-rmse:0.446545	valid-rmse:0.451361
[182]	train-rmse:0.446487	valid-rmse:0.451312
[183]	train-rmse:0.44645	valid-rmse:0.451286
[184]	train-rmse:0.446375	valid-rmse:0.451215
[185]	train-rmse:0.446317	valid-rmse:0.451179
[186]	train-rmse:0.446276	valid-rmse:0.451142
[187]	train-rmse:0.446225	valid-rmse:0.451102
[188]	train-rmse:0.446152	valid-rmse:0.451032
[189]	train-rmse:0.446106	valid-rmse:0.450998
[190]	train-rmse:0.446039	valid-rmse:0.450944
[191]	train-rmse:0.445994	valid-rmse:0.450906
[192]	train-rmse:0.445929	valid-rmse:0.450847
[193]	train-rmse:0.445895	valid-rmse:0.450831
[194]	train-rmse:0.445855	valid-rmse:0.450808
[195]	train-rmse:0.44581	valid-rmse:0.450768
[196]	train-rmse:0.445746	valid-rmse:0.450715
[197]	train-rmse:0.445668	valid-rmse:0.450641
[198]	train-rmse:0.445579	valid-rmse:0.450557
[199]	train-rmse:0.445522	valid-rmse:0.450503
[200]	train-rmse:0.445491	valid-rmse:0.450483
[201]	train-rmse:0.445434	valid-rmse

In [None]:
start = time()

our_params = {
    'eta': 0.05,
    'seed': seed,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'max_depth': 5,
    'min_child_weight': 10,
}

# Same setup as the 300-round run but with an upper bound of 3000 rounds, so
# early stopping decides when to halt.
# num_boost_round and evals are passed by keyword: recent xgboost releases
# accept `evals` as a keyword argument only, and the keyword form also works
# on older releases.
final_gb = xgb.train(
    our_params,
    xgdmat,
    num_boost_round=3000,
    evals=watchlist,
    # Stop once the last watchlist entry ('valid') has not improved for 100 rounds
    early_stopping_rounds=100,
    # verbose_eval=10 would log every 10th round instead of every round
)

print("elapsed time:", round(time() - start, 3), "s")

# Score the held-out validation split; `accuracy` holds the R^2 value.
# NOTE(review): depending on the xgboost version, predict() may use every
# boosted round rather than the best early-stopping iteration - confirm.
y_pred = final_gb.predict(testdmat)
accuracy = r2_score(y_val, y_pred)
print(accuracy)

[0]	train-rmse:5.7219	valid-rmse:5.72136
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[1]	train-rmse:5.43799	valid-rmse:5.43749
[2]	train-rmse:5.16843	valid-rmse:5.16795
[3]	train-rmse:4.91242	valid-rmse:4.91198
[4]	train-rmse:4.66933	valid-rmse:4.66894
[5]	train-rmse:4.43855	valid-rmse:4.43818
[6]	train-rmse:4.2194	valid-rmse:4.21907
[7]	train-rmse:4.01348	valid-rmse:4.01321
[8]	train-rmse:3.81581	valid-rmse:3.81558
[9]	train-rmse:3.62817	valid-rmse:3.62797
[10]	train-rmse:3.4501	valid-rmse:3.44995
[11]	train-rmse:3.2811	valid-rmse:3.28102
[12]	train-rmse:3.12071	valid-rmse:3.12067
[13]	train-rmse:2.9685	valid-rmse:2.96849
[14]	train-rmse:2.82581	valid-rmse:2.82585
[15]	train-rmse:2.6887	valid-rmse:2.68878
[16]	train-rmse:2.5587	valid-rmse:2.55883
[17]	train-rmse:2.4354	valid-rmse:2.43558
[18]	train-rmse:2.31848	valid-rmse:2.31871
[19]	train-rmse:2.20769	valid-rmse:2.20797
[20]	train-r

[180]	train-rmse:0.446603	valid-rmse:0.451419
[181]	train-rmse:0.446545	valid-rmse:0.451361
[182]	train-rmse:0.446487	valid-rmse:0.451312
[183]	train-rmse:0.44645	valid-rmse:0.451286
[184]	train-rmse:0.446375	valid-rmse:0.451215
[185]	train-rmse:0.446317	valid-rmse:0.451179
[186]	train-rmse:0.446276	valid-rmse:0.451142
[187]	train-rmse:0.446225	valid-rmse:0.451102
[188]	train-rmse:0.446152	valid-rmse:0.451032
[189]	train-rmse:0.446106	valid-rmse:0.450998
[190]	train-rmse:0.446039	valid-rmse:0.450944
[191]	train-rmse:0.445994	valid-rmse:0.450906
[192]	train-rmse:0.445929	valid-rmse:0.450847
[193]	train-rmse:0.445895	valid-rmse:0.450831
[194]	train-rmse:0.445855	valid-rmse:0.450808
[195]	train-rmse:0.44581	valid-rmse:0.450768
[196]	train-rmse:0.445746	valid-rmse:0.450715
[197]	train-rmse:0.445668	valid-rmse:0.450641
[198]	train-rmse:0.445579	valid-rmse:0.450557
[199]	train-rmse:0.445522	valid-rmse:0.450503
[200]	train-rmse:0.445491	valid-rmse:0.450483
[201]	train-rmse:0.445434	valid-rmse

[359]	train-rmse:0.440573	valid-rmse:0.447056
[360]	train-rmse:0.440541	valid-rmse:0.447027
[361]	train-rmse:0.440522	valid-rmse:0.447022
[362]	train-rmse:0.440513	valid-rmse:0.447015
[363]	train-rmse:0.440488	valid-rmse:0.447022
[364]	train-rmse:0.440464	valid-rmse:0.447014
[365]	train-rmse:0.440451	valid-rmse:0.447005
[366]	train-rmse:0.440442	valid-rmse:0.447
[367]	train-rmse:0.440435	valid-rmse:0.446997
[368]	train-rmse:0.440413	valid-rmse:0.446991
[369]	train-rmse:0.440402	valid-rmse:0.446983
[370]	train-rmse:0.440385	valid-rmse:0.44697
[371]	train-rmse:0.44038	valid-rmse:0.446969
[372]	train-rmse:0.440372	valid-rmse:0.446968
[373]	train-rmse:0.440335	valid-rmse:0.446938
[374]	train-rmse:0.440294	valid-rmse:0.446902
[375]	train-rmse:0.44029	valid-rmse:0.446905
[376]	train-rmse:0.440275	valid-rmse:0.446891
[377]	train-rmse:0.440242	valid-rmse:0.446867
[378]	train-rmse:0.440222	valid-rmse:0.446878
[379]	train-rmse:0.440194	valid-rmse:0.446858
[380]	train-rmse:0.440184	valid-rmse:0.4