In [None]:
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

In [None]:
# Path of the raw CSV. Set the HISTORICAL_DATA_CSV environment variable to load
# the file from another location; the original local path remains the default.
DATA_PATH = os.environ.get('HISTORICAL_DATA_CSV', 'C:\\Users\\Jayesh\\Dropbox\\Data Science and Machine Learning\\Datasets\\historical_data.csv')
historical_data = pd.read_csv(DATA_PATH)

In [None]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
historical_data.info()

In [None]:
# Remove every row that has at least one missing value (the frame is modified
# in place), then print the summary again to see what is left.
historical_data.dropna(inplace=True)
historical_data.info()

In [None]:
# Number of rows per market_id value.
historical_data['market_id'].value_counts()

In [None]:
# Preview the first ten rows.
historical_data.head(10)

In [None]:
# Parse both timestamp columns, then store the elapsed time between order
# creation and delivery, expressed in seconds.
for timestamp_col in ("created_at", "actual_delivery_time"):
    historical_data[timestamp_col] = pd.to_datetime(historical_data[timestamp_col])

delivery_delta = historical_data["actual_delivery_time"] - historical_data["created_at"]
historical_data["actual_total_delivery_duration"] = delivery_delta.dt.total_seconds()

In [None]:
# Hour of day (0-23) at which each order was created, and its distribution.
order_hour = historical_data["created_at"].dt.hour
historical_data["hour_of_order"] = order_hour
plt.hist(order_hour, bins=24)

In [None]:
# Histogram of delivery durations in seconds: 100 bins over the 0-10000 range.
plt.hist(historical_data["actual_total_delivery_duration"], bins=100, range=(0,10000))

In [None]:
# Show the durations in ascending order (smallest and largest values end up at
# the two ends of the displayed series).
historical_data["actual_total_delivery_duration"].sort_values()

In [None]:
# Discard, in place, every order whose delivery took more than 8000 seconds.
too_long = historical_data["actual_total_delivery_duration"] > 8000
historical_data.drop(index=historical_data.index[too_long.to_numpy()], inplace=True)

In [None]:
# Number of rows per store_id.
historical_data['store_id'].value_counts()

In [None]:
# Number of distinct stores.
historical_data['store_id'].nunique()

In [None]:
# Row count per store category; value_counts orders the result from the most
# to the least frequent category and uses the category names as its index.
primary_cat = historical_data['store_primary_category'].value_counts()

In [None]:
# Number of distinct store categories.
historical_data['store_primary_category'].nunique()

In [None]:
# Categories ranked below the 40 most frequent ones are merged into a single
# 'other' bucket. `primary_cat` is indexed by category name, so the names to
# collapse are the index labels of its tail slice; the membership test is done
# explicitly against that index and vectorised instead of row by row.
convert = primary_cat[40:]
store_category = historical_data['store_primary_category']
historical_data['store_primary_category'] = store_category.mask(store_category.isin(convert.index), 'other')

In [None]:
# Row count per store category after the rare ones were relabelled.
historical_data['store_primary_category'].value_counts()

In [None]:
# Distribution of total_items, 10 bins over the 1-20 range.
plt.hist(historical_data['total_items'], bins = 10, range = (1,20))

In [None]:
# Distribution of subtotal, 20 bins over the 100-10000 range.
plt.hist(historical_data['subtotal'], bins = 20, range = (100,10000))

In [None]:
# Distribution of num_distinct_items, 10 bins over the 1-20 range.
plt.hist(historical_data['num_distinct_items'], bins = 10, range = (1,20))

In [None]:
# On-shift dashers minus busy dashers.
historical_data['total_available_dashers'] = historical_data['total_onshift_dashers'].sub(
    historical_data['total_busy_dashers']
)

In [None]:
# How many rows report more busy dashers than on-shift dashers.
int((historical_data['total_available_dashers'] < 0).sum())

In [None]:
# Columns that have already been turned into derived features (or are not used
# as model inputs) are removed from the frame.
unneeded_columns = [
    'created_at',
    'actual_delivery_time',
    'store_id',
    'total_onshift_dashers',
    'total_busy_dashers',
]
historical_data = historical_data.drop(columns=unneeded_columns)

In [None]:
# One indicator column per market_id value, named 'market_id_<value>'.
market_id_dummies = pd.get_dummies(historical_data['market_id']).add_prefix('market_id_')

In [None]:
# One indicator column per store category, named 'category_<value>'.
category_dummies = pd.get_dummies(historical_data['store_primary_category']).add_prefix('category_')

In [None]:
# One indicator column per order protocol, named 'order_protocol_<value>'.
order_protocol_dummies = pd.get_dummies(historical_data['order_protocol']).add_prefix('order_protocol_')

In [None]:
# One indicator column per order hour, named 'hour_<value>'.
hour_dummies = pd.get_dummies(historical_data['hour_of_order']).add_prefix('hour_')

In [None]:
# Put the indicator columns next to the remaining columns of the cleaned frame.
frames_to_join = [
    historical_data,
    market_id_dummies,
    category_dummies,
    order_protocol_dummies,
    hour_dummies,
]
final_df = pd.concat(frames_to_join, axis="columns")

In [None]:
# Summarise the combined frame (column dtypes and non-null counts).
final_df.info()

In [None]:
# The source columns of the indicator encodings are no longer needed.
encoded_sources = ['store_primary_category', 'order_protocol', 'market_id', 'hour_of_order']
final_df = final_df.drop(columns=encoded_sources)

In [None]:
# Pairwise correlation matrix of all columns in final_df.
corr = final_df.corr()

In [None]:
# Boolean mask that is True on the upper triangle (diagonal included) of the
# correlation matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))

In [None]:
# Correlation heatmap; the cells selected by `mask` are hidden.
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

In [None]:
# Flatten the matrix into (column, column) pairs, rank them by absolute
# correlation, collapse repeated values (each pair appears twice in the
# symmetric matrix) and show the 20 largest.
corr.abs().unstack().sort_values(ascending = False).drop_duplicates().head(20)

In [None]:
# Share of distinct items among all items of an order.
final_df['ratio_of_items'] = final_df['num_distinct_items'].div(final_df['total_items'])

In [None]:
# Midpoint between the lowest and the highest item price of an order.
final_df['average_price'] = (final_df['min_item_price'] + final_df['max_item_price'])/2

In [None]:
# Order subtotal divided by the number of items in the order.
final_df["avg_price_per_item"] = final_df["subtotal"] / final_df["total_items"]

In [None]:
# Recompute the correlations now that the derived columns exist and list the
# 20 largest absolute values again.
corr = final_df.corr()
corr.abs().unstack().sort_values(ascending = False).drop_duplicates().head(20)

In [None]:
# Remove the listed columns in place.
# NOTE(review): presumably these were chosen because they ranked among the
# strongly correlated pairs listed above - confirm against that output.
final_df.drop(columns=['min_item_price', 'max_item_price', 'avg_price_per_item', 'num_distinct_items'], inplace=True)

In [None]:
# Correlations of the remaining columns, 20 largest absolute values.
corr = final_df.corr()
corr.abs().unstack().sort_values(ascending = False).drop_duplicates().head(20)

In [None]:
# Cast every column to 32-bit floats.
final_df = final_df.astype("float32")

In [None]:
# Every column except the prediction target is a model input.
features = final_df.columns.drop("actual_total_delivery_duration").to_list()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Unscaled feature matrix and target vector taken from final_df.
X = final_df[features]
y = final_df["actual_total_delivery_duration"]

In [None]:
# Fit the min-max scaler on the training rows only, so that the value ranges of
# the held-out rows never influence the feature scaling (no test-set leakage).
# The rows are selected with the same test_size / random_state as the
# train_test_split call applied to scaled_df, which therefore picks exactly
# these rows as its training split (the shuffle depends only on the row count
# and the seed).
target_col = "actual_total_delivery_duration"
train_rows, _ = train_test_split(np.arange(len(final_df)), test_size=0.2, random_state=42)

scaler = MinMaxScaler()
scaler_fit = scaler.fit(final_df.iloc[train_rows])
scaled = scaler_fit.transform(final_df)
scaled_df = pd.DataFrame(scaled, columns = final_df.columns)

# The target keeps its full-range scaling: the evaluation code rebuilds a
# MinMaxScaler from the complete target column to map predictions back to
# seconds, and both transforms have to match for the reported RMSE to be right.
# Rescaling the target is an affine change that is undone before scoring.
scaled_df[target_col] = MinMaxScaler().fit_transform(final_df[[target_col]]).ravel()

In [None]:
# Display the scaled frame.
scaled_df

In [None]:
# Hold out 20% of the scaled rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df[features],
    scaled_df['actual_total_delivery_duration'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a seeded random forest on the training split and chart the importance it
# assigns to every input column, from the smallest to the largest.
# (`feature_names` is not used in the cells visible here; it is kept as is.)
feature_names = ["feature %d" % position for position in range(len(X.columns))]

forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)

# Map each column name to its importance score.
feats = dict(zip(X.columns, forest.feature_importances_))

importances = pd.Series(feats).to_frame('Gini-importance')
ranked_importances = importances.sort_values(by='Gini-importance')
ranked_importances.plot.bar(rot=90, figsize=(15, 12))
plt.show()

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA on the training features without limiting the number of components.
pca = PCA().fit(X_train)

In [None]:
# Cumulative share of variance explained as components are added.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
# xlim takes only the left and right limits; the former third positional
# argument was not a step size (it landed on `emit`, which newer Matplotlib
# releases accept by keyword only and therefore reject positionally).
plt.xlim(0, 79)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Candidate regressors, keyed by the label used when reporting their error.
# Estimators with a random component are seeded (same seed as the random forest
# used for the feature importances) so that re-running the notebook reproduces
# the reported errors.
models = {"Ridge" : Ridge(),
         "Decision Tree" : DecisionTreeRegressor(random_state=42),
         "Random Forest" : RandomForestRegressor(random_state=42),
         "XGBoost" : XGBRegressor(random_state=42),
         "LightGBM" : LGBMRegressor(random_state=42),
         "K Nearest Neighbors" : KNeighborsRegressor(),
         "Multi Layer Perceptron" : MLPRegressor(random_state=42)}

In [None]:
def train_model(X_train, X_test, y_train, y_test, model, model_name, scaler):
    """Fit ``model``, evaluate it on the test split and report the RMSE.

    Parameters
    ----------
    X_train, y_train : training features and (scaled) targets passed to ``model.fit``.
    X_test : test features passed to ``model.predict``.
    y_test : (scaled) test targets; a pandas Series or any array-like.
    model : estimator exposing ``fit`` and ``predict``.
    model_name : label used in the printed report.
    scaler : fitted object whose ``inverse_transform`` maps scaled targets back
        to their original units.

    Returns
    -------
    The root-mean-squared error between the un-scaled test targets and the
    un-scaled predictions. The same value is printed.
    """
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # Undo the target scaling on both sides so the error is expressed in the
    # target's original units; inverse_transform is given column vectors.
    y_predict1 = scaler.inverse_transform(np.asarray(y_predict).reshape(-1, 1))
    y_test1 = scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1))
    # The `squared=False` switch of mean_squared_error is deprecated and absent
    # from recent scikit-learn releases; taking the square root of the MSE
    # gives the same RMSE on every version.
    test_error = np.sqrt(mean_squared_error(y_test1, y_predict1))
    print(f"Test error = {test_error} in {model_name}")
    return test_error

In [None]:
# Fit a scaler on the unscaled target column alone, so predictions can be
# mapped back to original units, then train and score every candidate model.
scaler = MinMaxScaler()
target_column = final_df["actual_total_delivery_duration"].values.reshape(-1,1)
y_scaler = scaler.fit(target_column)

for model_name, estimator in models.items():
    train_model(X_train, X_test, y_train, y_test, estimator, model_name, y_scaler)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.metrics import RootMeanSquaredError
import tensorflow as tf

In [None]:
# Build the network with the `Sequential` class imported from `keras.models`,
# so that the model, its `Dense` layers and the `RootMeanSquaredError` metric
# all come from the same Keras package. Combining a `tf.keras` model with
# `keras` layers fails when the two resolve to different Keras versions.
model = Sequential()
# First hidden layer: 40 ReLU units fed with one input per feature column.
model.add(Dense(40, input_dim = (len(features)), activation = 'relu'))

In [None]:
# Add a 10-unit layer and a single-unit output layer, both with linear activation.
model.add(Dense(10, activation = 'linear'))
model.add(Dense(1, activation = 'linear'))

In [None]:
# Optimise mean squared error with Adam and track RMSE as an extra metric.
model.compile(optimizer='adam', loss='mse', metrics=[RootMeanSquaredError()])

In [None]:
# Train for 50 epochs on the training split and keep the returned history object.
history = model.fit(X_train, y_train, epochs = 50, verbose = 2)

In [None]:
# Network predictions for the held-out split (still on the scaled target range).
y_predict = model.predict(X_test)

In [None]:
# Refit the scaler on the unscaled target column, then use it to map both the
# network's predictions and the held-out targets back to the target's original units.
y_scaler = scaler.fit(final_df["actual_total_delivery_duration"].values.reshape(-1,1))
y_predict1 = y_scaler.inverse_transform(y_predict.reshape(-1, 1))
y_test1 = y_scaler.inverse_transform(y_test.values.reshape(-1, 1))

In [None]:
# RMSE of the network on the held-out split, in the target's original units.
# The square root is taken explicitly because the `squared=False` switch of
# mean_squared_error is deprecated and missing from recent scikit-learn releases.
test_error = np.sqrt(mean_squared_error(y_test1, y_predict1))

In [None]:
# Show the network's test RMSE.
test_error