In [1]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
#importing datasets
historical = pd.read_csv("Downloads/historical_data.csv", usecols=['created_at', 'actual_delivery_time', 
                                                                   'estimated_order_place_duration',
                                                                   'estimated_store_to_consumer_driving_duration'])
predict    = pd.read_csv("OneDrive/Desktop/predict_data.csv")

In [3]:
#filling NA with mean if column
historical["estimated_store_to_consumer_driving_duration"].fillna(historical.estimated_store_to_consumer_driving_duration.mean(), inplace=True)

In [4]:
#dropping unneeded columns
predict.drop(['market_id','store_id', 'store_primary_category',
       'order_protocol', 'total_items', 'subtotal', 'num_distinct_items',
       'min_item_price', 'max_item_price', 'total_onshift_dashers',
       'total_busy_dashers', 'total_outstanding_orders',
       'platform','delivery_id'], axis = 1, inplace=True)

#filling NA with mean in column
predict["estimated_store_to_consumer_driving_duration"].fillna(predict.estimated_store_to_consumer_driving_duration.mean(), inplace=True)

In [5]:
#limiting training dataset
historical = historical.sample(20000)

# Handling Dates

In [6]:
import datetime as dt

#converting date to timestamp 
historical['created_at'] = pd.to_datetime(historical['created_at'])
historical['created_at'] = historical['created_at'].map(dt.datetime.timestamp)

historical['actual_delivery_time'] = pd.to_datetime(historical['actual_delivery_time'])
historical['actual_delivery_time'] = historical['actual_delivery_time'].map(dt.datetime.timestamp)

predict['created_at'] = pd.to_datetime(predict['created_at'])
predict['created_at'] = predict['created_at'].map(dt.datetime.timestamp)

# Split Data

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [8]:
#splitting data for fitting 
train = historical.drop(['actual_delivery_time'], axis = 1)
test = historical['actual_delivery_time']

In [9]:
x_train, x_test, y_train, y_test = train_test_split(train, test,shuffle = True, test_size = 0.2)

# Linear Regression

In [10]:
#training model
regr = LinearRegression()
regr.fit(x_train, y_train)

LinearRegression()

In [11]:
#testing model 
x_predict = regr.predict(x_test)

#showing models performance on test data
RMSE = mean_squared_error( y_test, x_predict, squared= False)
MAE  =  mean_absolute_error(y_test, x_predict)
rsquared = r2_score(y_test, x_predict)
MSE = mean_squared_error(y_test, x_predict)
MAE = mean_absolute_error(y_test, x_predict)

In [12]:
#printing metrics
print ("RMSE: ",  RMSE)
print ("MAE:  ",  MAE)
print ("R2:   ",  rsquared)
print ("MSE:  ",  MSE)
print ("MAE:  ",  MAE)

RMSE:  1070.5496599968158
MAE:   773.6635555307269
R2:    0.9999976477482351
MSE:   1146076.574519298
MAE:   773.6635555307269


In [13]:
#predicting on new dataset
predicted_delivery_time_regr = regr.predict(predict)

#placing prediction into dataframe
df = pd.DataFrame(predicted_delivery_time_regr, columns=['predicted_delivery_time'])

#converting back to datetine
df = df['predicted_delivery_time'].map(dt.datetime.fromtimestamp)
df2 = predict['created_at'].map(dt.datetime.fromtimestamp)

#taking the duration
td = df-df2

#conveting into seconds
td.dt.total_seconds()

0        3249.873951
1        2973.324200
2        3034.626931
3        3270.729143
4        3071.616773
            ...     
54773    3171.752884
54774    2747.494287
54775    2550.917838
54776    2884.053860
54777    2976.411707
Length: 54778, dtype: float64

In [14]:
#add to prediction dataset
predict = pd.read_csv("OneDrive/Desktop/predict_data.csv")
predict['predicted_duration'] = td.dt.total_seconds()

In [15]:
#output to csv
predict.to_csv(path_or_buf = "OneDrive/Desktop/predicted_data.csv")

# Decision Tree

In [16]:
from sklearn import tree

In [17]:
#training model
dt_reg = tree.DecisionTreeRegressor(max_depth=5)
dt_reg.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=5)

In [18]:
#testing model 
x_predict = dt_reg.predict(x_test)

#showing models performance on test data
RMSE = mean_squared_error( y_test, x_predict, squared= False)
MAE  =  mean_absolute_error(y_test, x_predict)
rsquared = r2_score(y_test, x_predict)
MSE = mean_squared_error(y_test, x_predict)
MAE = mean_absolute_error(y_test, x_predict)

In [19]:
#printing metrics
print ("RMSE: ",  RMSE)
print ("MAE:  ",  MAE)
print ("R2:   ",  rsquared)
print ("MSE:  ",  MSE)
print ("MAE:  ",  MAE)

RMSE:  11507.167119321259
MAE:   9654.006444534778
R2:    0.9997282265621248
MSE:   132414895.1119883
MAE:   9654.006444534778


# Random Forest Regressor

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import ensemble

In [21]:
#training model
rf_reg = RandomForestRegressor()
rf_reg.fit(x_train, y_train)

RandomForestRegressor()

In [22]:
#testing model 
x_predict = rf_reg.predict(x_test)

#showing models performance on test data
RMSE = mean_squared_error( y_test, x_predict, squared= False)
MAE  =  mean_absolute_error(y_test, x_predict)
rsquared = r2_score(y_test, x_predict)
MSE = mean_squared_error(y_test, x_predict)
MAE = mean_absolute_error(y_test, x_predict)

In [23]:
#printing metrics
print ("RMSE: ",  RMSE)
print ("MAE:  ",  MAE)
print ("R2:   ",  rsquared)
print ("MSE:  ",  MSE)
print ("MAE:  ",  MAE)

RMSE:  1117.4812076870512
MAE:   805.4081376290321
R2:    0.9999974369880903
MSE:   1248764.2495337105
MAE:   805.4081376290321


# Gradient Boosting Regressor

In [24]:
#training model
gb_reg = ensemble.GradientBoostingRegressor()
gb_reg.fit(x_train, y_train)

GradientBoostingRegressor()

In [25]:
#testing model 
x_predict = gb_reg.predict(x_test)

#showing models performance on test data
RMSE = mean_squared_error( y_test, x_predict, squared= False)
MAE  =  mean_absolute_error(y_test, x_predict)
rsquared = r2_score(y_test, x_predict)
MSE = mean_squared_error(y_test, x_predict)
MAE = mean_absolute_error(y_test, x_predict)

In [26]:
#printing metrics
print ("RMSE: ",  RMSE)
print ("MAE:  ",  MAE)
print ("R2:   ",  rsquared)
print ("MSE:  ",  MSE)
print ("MAE:  ",  MAE)

RMSE:  3310.8619441546725
MAE:   2286.7321946053503
R2:    0.9999775015649075
MSE:   10961806.81325166
MAE:   2286.7321946053503
