### Task: Given the demand data for 26th March 2021 (cleaned according to the business team's rules), forecast the demand for the first three 30-minute intervals of 27th March 2021.

Author: [Shaurya Uppal](https://www.linkedin.com/in/shaurya-uppal/)

In [1]:
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt, ceil, floor
from datetime import datetime, timedelta

In [2]:
def round_timestamp_30interval(x):
    """Floor a timestamp to the start of its 30-minute interval.

    Parameters
    ----------
    x : str or datetime
        Either a string formatted as '%Y-%m-%d %H:%M:%S' or an object that
        exposes ``minute``, ``second`` and ``microsecond`` (e.g. ``datetime``).

    Returns
    -------
    datetime
        ``x`` with the minutes floored to 0 or 30 and the seconds and
        microseconds set to zero.

    Raises
    ------
    ValueError
        If ``x`` is a string that does not match '%Y-%m-%d %H:%M:%S'.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # (for example numpy.str_) are parsed too instead of failing on `.minute`.
    if isinstance(x, str):
        x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return x - timedelta(minutes=x.minute % 30, seconds=x.second, microseconds=x.microsecond)

def time_features(data):
    """Add calendar features derived from the ``ts`` column.

    The columns ``mins``, ``hour``, ``day``, ``month``, ``dayofweek`` and
    ``quarter`` are written onto ``data`` itself (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-like ``ts`` column (it is read through ``.dt``).

    Returns
    -------
    pandas.DataFrame
        The input frame, now carrying the six extra columns.
    """
    stamps = data.ts.dt
    # (new column name, datetime accessor attribute), in output column order.
    derived = (
        ('mins', 'minute'),
        ('hour', 'hour'),
        ('day', 'day'),
        ('month', 'month'),
        ('dayofweek', 'dayofweek'),
        ('quarter', 'quarter'),
    )
    for column, attribute in derived:
        data[column] = getattr(stamps, attribute)
    return data

def prediction_without_lag(df):
    """Run the module-level ``predict_without_lag`` model on ``df``.

    Only the cluster and calendar columns are passed to the model, in the
    order listed below.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the six feature columns listed below.

    Returns
    -------
    Whatever ``predict_without_lag.predict`` returns for those columns.
    """
    feature_columns = ['pickup_cluster', 'mins', 'hour', 'month', 'quarter', 'dayofweek']
    features = df[feature_columns]
    return predict_without_lag.predict(features)

def prediction_with_lag(df):
    """Run the module-level ``predict_with_lag`` model on ``df``.

    The model receives the cluster and calendar columns followed by the three
    lag columns and the rolling mean, in the order listed below.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ten feature columns listed below.

    Returns
    -------
    Whatever ``predict_with_lag.predict`` returns for those columns.
    """
    feature_columns = [
        'pickup_cluster', 'mins', 'hour', 'month', 'quarter', 'dayofweek',
        'lag_1', 'lag_2', 'lag_3', 'rolling_mean',
    ]
    features = df[feature_columns]
    return predict_with_lag.predict(features)

def shift_with_lag_and_rollingmean(df):
    """Build per-cluster lag and rolling-mean features from ``request_count``.

    Rows are sorted by ``pickup_cluster`` and ``ts`` and de-duplicated on
    (``ts``, ``pickup_cluster``). For every cluster the function then derives

    * ``lag_1``, ``lag_2``, ``lag_3``: ``request_count`` of the previous one,
      two and three rows of the same cluster;
    * ``rolling_mean``: mean of the previous three ``request_count`` values of
      the same cluster.

    Rows containing any missing value are dropped, which removes the first
    three rows of every cluster on each call.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ts``, ``pickup_cluster``, ``mins``, ``hour``, ``month``,
        ``quarter``, ``dayofweek`` and ``request_count``. Existing lag or
        rolling-mean columns are overwritten. The caller's frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts``, ``pickup_cluster``, ``mins``, ``hour``, ``month``,
        ``quarter``, ``dayofweek``, ``lag_1``, ``lag_2``, ``lag_3``,
        ``rolling_mean``, ``request_count``.
    """
    df = df.sort_values(by=['pickup_cluster', 'ts']).drop_duplicates(subset=['ts', 'pickup_cluster'])
    df = df.set_index(['ts', 'pickup_cluster', 'mins', 'hour', 'month', 'quarter', 'dayofweek'])

    demand_by_cluster = df['request_count'].groupby(level='pickup_cluster')
    for step in (1, 2, 3):
        df['lag_%d' % step] = demand_by_cluster.shift(step)

    # The shift happens inside each cluster, so the first row of a cluster can
    # never pick up the rolling mean of the preceding cluster. transform()
    # always returns a result aligned to the original index, unlike a
    # transform-like groupby.apply whose index layout depends on the pandas
    # version.
    df['rolling_mean'] = demand_by_cluster.transform(
        lambda counts: counts.rolling(window=3).mean().shift(1)
    )

    df = df.reset_index(drop=False).dropna()
    df = df[['ts', 'pickup_cluster', 'mins', 'hour', 'month', 'quarter',
             'dayofweek', 'lag_1', 'lag_2', 'lag_3', 'rolling_mean', 'request_count']]
    return df


In [3]:

# Trained artefacts used further down: the pickup clustering model and the two
# demand regressors (with and without lag features).
# NOTE(review): joblib.load unpickles the file, so only load artefacts from a
# trusted source.
predict_with_lag = load('../Model/prediction_model.joblib')
predict_without_lag = load('../Model/prediction_model_without_lag.joblib')
cluster_model = load('../Model/pickup_cluster_model.joblib')

# Booking data to forecast from; the file is gzip-compressed despite its
# .csv extension.
df = pd.read_csv(
    '../Data/test_dataset/cleaned_test_booking_data.csv',
    compression='gzip',
    low_memory=False,
)


### Use the K-Means clustering model to derive the geospatial feature `pickup_cluster`

In [4]:
# Label every booking with the cluster predicted from its pickup coordinates.
pickup_coords = df[['pick_lat','pick_lng']]
df['pickup_cluster'] = cluster_model.predict(pickup_coords)
df.head(10)

Unnamed: 0,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,pickup_cluster
0,2021-03-26 06:49:38,-1,12.903468,77.63708,12.916259,77.675476,31
1,2021-03-26 15:14:23,0,12.903838,77.591774,12.890039,77.59372,24
2,2021-03-26 15:57:32,6,12.963516,77.67474,12.912828,77.62731,32
3,2021-03-26 23:34:53,7,12.944017,77.56427,12.967625,77.60806,45
4,2021-03-26 23:45:56,9,12.98327,77.75207,12.963221,77.7484,41
5,2021-03-26 18:54:05,11,12.919469,77.6711,12.933288,77.60731,49
6,2021-03-26 18:42:49,15,12.947335,77.68431,12.974627,77.606064,0
7,2021-03-26 23:14:56,15,12.979332,77.64059,12.947475,77.68423,27
8,2021-03-26 10:59:13,17,12.923716,77.60741,12.922842,77.59324,18
9,2021-03-26 16:44:09,53,12.888448,77.57724,12.937987,77.568726,11


### Data preparation and processing

In [5]:
# Floor every booking timestamp to the start of its 30-minute slot. This is
# the vectorised equivalent of calling round_timestamp_30interval row by row.
df['ts'] = pd.to_datetime(df['ts'], format='%Y-%m-%d %H:%M:%S').dt.floor('30min')

# Demand = number of bookings per (slot, pickup cluster).
df = df[['ts', 'number', 'pickup_cluster']]
df = df.groupby(by=['ts', 'pickup_cluster']).count().reset_index()
df.columns = ['ts', 'pickup_cluster', 'request_count']

## Adding Dummy pickup cluster -1
# The dummy cluster has a zero-demand row for every slot, so after
# unstack/fillna each real cluster also has a row for every slot, including
# the slots on 27 March that are to be forecast.

## Change this Data based on your data
grid_start = datetime(2021, 3, 26, 0, 0, 0)
n_slots = 51  # 26 March 00:00 through 27 March 01:00 in 30-minute steps
temp = pd.DataFrame(
    [[grid_start + timedelta(minutes=30 * i), -1, 0] for i in range(n_slots)],
    columns=['ts', 'pickup_cluster', 'request_count'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames.
df = pd.concat([df, temp], ignore_index=True)

# Pivot to a (slot x cluster) grid, fill missing combinations with zero
# demand, then go back to one row per (slot, cluster) ordered by cluster.
data = (
    df.set_index(['ts', 'pickup_cluster'])
    .unstack()
    .fillna(value=0)
    .asfreq(freq='30min')
    .stack()
    .sort_index(level=1)
    .reset_index()
)

# Removing Dummy Cluster
# .copy() so that time_features adds its columns to an independent frame
# rather than to a filtered slice (avoids SettingWithCopyWarning).
data = data[data.pickup_cluster >= 0].copy()

df = time_features(data)

### Model without Lag (past data) requirement

In [6]:
# Keep only the slots from 27 March 2021 00:00 onwards, as an independent
# copy, and fill their demand with the lag-free model's predictions.
forecast_start = datetime(2021, 3, 27, 0, 0, 0)
is_forecast_slot = df['ts'] >= forecast_start
data_without_lag = df[is_forecast_slot].copy()
data_without_lag['request_count'] = prediction_without_lag(data_without_lag)
data_without_lag

Unnamed: 0,ts,pickup_cluster,request_count,mins,hour,day,month,dayofweek,quarter
99,2021-03-27 00:00:00,0,6.871498,0,0,27,3,5,1
100,2021-03-27 00:30:00,0,3.866220,30,0,27,3,5,1
101,2021-03-27 01:00:00,0,1.812880,0,1,27,3,5,1
150,2021-03-27 00:00:00,1,1.014596,0,0,27,3,5,1
151,2021-03-27 00:30:00,1,0.651106,30,0,27,3,5,1
...,...,...,...,...,...,...,...,...,...
2395,2021-03-27 00:30:00,48,3.350919,30,0,27,3,5,1
2396,2021-03-27 01:00:00,48,1.508004,0,1,27,3,5,1
2445,2021-03-27 00:00:00,49,4.490408,0,0,27,3,5,1
2446,2021-03-27 00:30:00,49,3.047003,30,0,27,3,5,1


In [7]:
# Persist the lag-free forecast; the output is gzip-compressed even though the
# file name ends in .csv.
data_without_lag.to_csv(
    '../Data/test_dataset_prediction_output/prediction_without_lag_model.csv',
    index=False,
    compression='gzip',
)

### Using Iteration 3 - Best Model with Lag Features and Rolling Means (Recursive Multi-Step Forecast used)

In [8]:
# Recursive multi-step forecast: predict one 30-minute slot at a time and
# write the prediction back into `df`, so that the next step's lag and
# rolling-mean features are built from it.
start_date = datetime(2021, 3, 27, 0, 0, 0)
half_hour = timedelta(minutes=30)
for step in range(3):
    # Rebuilds the lag features from the current request_count values; the
    # helper's dropna also discards the first three rows of every cluster.
    df = shift_with_lag_and_rollingmean(df)
    slot_rows = df[df['ts'] == start_date + step * half_hour]
    df.loc[slot_rows.index, 'request_count'] = prediction_with_lag(slot_rows)

In [9]:
# Extract the forecast slots (27 March 2021 00:00 onwards) as an independent copy.
forecast_start = datetime(2021, 3, 27, 0, 0, 0)
data_with_lag = df.loc[df['ts'] >= forecast_start].copy()
data_with_lag

Unnamed: 0,ts,pickup_cluster,mins,hour,month,quarter,dayofweek,lag_1,lag_2,lag_3,rolling_mean,request_count
42,2021-03-27 00:00:00,0,0,0,3,1,5,8.000000,14.000000,19.0,13.666667,4.744022
43,2021-03-27 00:30:00,0,30,0,3,1,5,4.744022,8.000000,14.0,8.914674,3.215082
44,2021-03-27 01:00:00,0,0,1,3,1,5,3.215082,4.744022,8.0,5.319701,2.031213
87,2021-03-27 00:00:00,1,0,0,3,1,5,1.000000,3.000000,4.0,2.666667,0.789398
88,2021-03-27 00:30:00,1,30,0,3,1,5,0.789398,1.000000,3.0,1.596466,0.478711
...,...,...,...,...,...,...,...,...,...,...,...,...
2068,2021-03-27 00:30:00,48,30,0,3,1,5,4.353994,5.000000,11.0,6.784665,2.741235
2069,2021-03-27 01:00:00,48,0,1,3,1,5,2.741235,4.353994,5.0,4.031743,1.616475
2112,2021-03-27 00:00:00,49,0,0,3,1,5,7.000000,10.000000,13.0,10.000000,4.073674
2113,2021-03-27 00:30:00,49,30,0,3,1,5,4.073674,7.000000,10.0,7.024558,2.679101


In [10]:
# Persist the lag-model forecast; the output is gzip-compressed even though
# the file name ends in .csv.
data_with_lag.to_csv(
    '../Data/test_dataset_prediction_output/prediction_with_lag_model.csv',
    index=False,
    compression='gzip',
)

### Thank you for choosing this project. We hope you liked it. See the Scripts folder for modularized Python scripts.