In [25]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import pickle
import os
import h5py

In [52]:
data = pd.read_csv("data/predict_data.csv")

In [40]:
# convert time features into datetime type
# and order by the order created time
df = data
df['created_at']= pd.to_datetime(df["created_at"]) 
# df['actual_delivery_time']= pd.to_datetime(data["actual_delivery_time"]) 
df = df.sort_values(by=['created_at'],ascending=True)

# convert creat_at to hour, weekday, and month features
df['start_hour'] = df['created_at'].dt.hour
df['start_weekday'] = df['created_at'].dt.weekday
df['start_month'] = df['created_at'].dt.month

df.head()

Unnamed: 0,market_id,created_at,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,delivery_id,platform,start_hour,start_weekday,start_month
27577,1.0,2015-02-18 14:50:28,266,breakfast,1.0,3,2500,3,350,1300,1.0,0.0,0.0,446,425.0,99834,ios,14,2,2
50927,3.0,2015-02-18 14:59:26,6463,dessert,4.0,2,1198,1,599,599,,,,251,877.0,240577,ios,14,2,2
47372,1.0,2015-02-18 15:04:22,6049,french,1.0,3,1575,2,325,625,0.0,0.0,0.0,446,350.0,34717,android,15,2,2
22766,1.0,2015-02-18 15:13:50,4355,fast,4.0,3,900,1,300,300,1.0,0.0,0.0,251,299.0,228294,ios,15,2,2
1385,1.0,2015-02-18 15:22:38,2549,sandwich,1.0,2,1870,2,350,1395,3.0,0.0,0.0,446,1022.0,104625,ios,15,2,2


In [8]:
df['store_id_showup'] = None
# df['store_id'].fillna(value=0, inplace=True)
for i in df['store_id'].value_counts().index:
    cnt = df['store_id'].value_counts()[i]
    idxs = df[df['store_id']==i].index
    df.loc[idxs,['store_id_showup']] = cnt

In [9]:
df['store_id_showup'] = pd.cut(df['store_id_showup'], bins=[0,100,300,500,1000], 
                               labels=["0-100", "100-300", "300-500", "500-1000"])

In [10]:
df.drop(['store_id'],axis = 1,inplace = True)

In [11]:
df.drop(['created_at','platform','delivery_id'],axis = 1,inplace = True)

In [15]:
df['market_id'] = df['market_id'].astype('str')
df['store_primary_category'] = df['store_primary_category'].astype('str')
df['order_protocol'] = df['order_protocol'].astype('str')

In [50]:
with open("model/preprocessor.pkl",'rb') as infile:
    loaded_pre = pickle.load(infile)
preprocessor = loaded_pre['model']

In [23]:
X_prep = preprocessor.transform(df)

feature_names = list(preprocessor.named_transformers_['cat'][1].get_feature_names(cat_ftrs)) + \
                preprocessor.transformers_[1][-1]

df_pre = pd.DataFrame(data=X_prep,columns=feature_names)
print(df_pre.shape)

(54778, 130)


In [26]:
h5f = h5py.File('data/pred_df.H5', "w")
h5f.create_dataset('pred_df', data=df_pre, compression="gzip", compression_opts=9)
h5f.close()

## Read Predictions and Output Result

In [35]:
with open("data/predictions.txt", "rb") as fp:   # Unpickling
    pred = pickle.load(fp)[0]

In [41]:
data['prediction'] = pred

In [47]:
final_result = data[['delivery_id', 'prediction']]
final_result.head()

Unnamed: 0,delivery_id,prediction
0,194096,2710.882812
1,236895,2802.008545
2,190868,2689.011475
3,183076,2594.957764
4,186200,3545.247314


In [53]:
final_result.to_csv(r'result/data_to_predict.csv', index = False)