In [1]:
## Import libraries

import warnings
import pandas as pd
import numpy as np

from pyhive import presto
from h3 import h3
from IPython.core.interactiveshell import InteractiveShell
from datetime import date,datetime,timedelta

InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [3]:
## Connection

connection = presto.connect(
        host='presto-gateway.serving.data.production.internal',
        port=80,
        protocol='http',
        catalog='hive',
        username='manoj.ravirajan@rapido.bike',
    )

In [4]:
## Filter

yyyymmdd_from = '20230508'
yyyymmdd_to = '20230512'

In [5]:
## datasets.supplycursory_history

def get_captian_duration(yyyymmdd_from):
    
    hex_duration_qery = f"""
        SELECT
            *
        FROM
        (
            SELECT
                *,
                row_number() OVER (PARTITION BY yyyymmdd,captain_id,quarter_hour ORDER BY idle_duration DESC) AS max_idle_duration
            FROM
            (
                SELECT
                    yyyymmdd,
                    captain_id,
                    hex_id,
                    quarter_hour,
                    SUM(CASE WHEN status = 'idle' THEN duration ELSE 0 END) AS idle_duration,
                    SUM(CASE WHEN status = 'diff_status' THEN duration ELSE 0 END) AS others_duration,
                    SUM(CASE WHEN status = 'offline' THEN duration ELSE 0 END) AS offline_duration,
                    MAX(CASE WHEN rnk=1 THEN status END) AS last_status
                    --COUNT(hex_id) OVER(PARTITION BY yyyymmdd,captain_id,quarter_hour) AS total_hex
                FROM
                (
                    SELECT 
                        *, 
                        row_number() OVER (PARTITION BY yyyymmdd,captain_id,hex_id,quarter_hour ORDER BY hhmm DESC) AS rnk
                    FROM
                    (
                    SELECT
                        yyyymmdd,
                        captain_id,
                        location AS hex_id,
                        quarter_hour,
                        CASE 
                            WHEN status IN (4,5,9) THEN 'offline' 
                            WHEN status IN (2) THEN 'idle' 
                            ELSE 'in_order' 
                        END AS status,
                        hhmm,
                        SUM(duration) AS duration
                    FROM
                        hive.datasets.supplycursory_history
                    WHERE 
                        yyyymmdd >= '{yyyymmdd_from}'
                        AND yyyymmdd <= '{yyyymmdd_to}'
                        AND servicedetailid = '5ef2bc5b85846b775f97d170'
                        AND substr(quarter_hour,1,2)  not in ('22','23','00','01','02','03','04','05')
                    GROUP BY 1,2,3,4,5,6
                    )
                    ORDER BY 1,2,3,4,5,6
                )
                GROUP BY 1,2,3,4
                ORDER BY 1,2,3,4
            )
        )
        WHERE 
            idle_duration > 0
            AND max_idle_duration = 1
    """

    hex_duration_df = pd.read_sql(hex_duration_qery, connection)
    
    return hex_duration_df

In [6]:
df_supplycursory_history_backup = get_captian_duration(yyyymmdd_from)
df_supplycursory_history = df_supplycursory_history_backup

In [7]:
df_supplycursory_history.head()

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,idle_duration,others_duration,offline_duration,last_status,max_idle_duration
0,20230508,5f3d05ed26c6c02d3740700c,8860a24b1bfffff,1545,896,0,0,idle,1
1,20230508,5f8a9adf402ed558da665418,8860a2586dfffff,1430,19,0,60,offline,1
2,20230508,61bc281d7d4982f686bcabdb,8860a24b6dfffff,2000,181,0,0,idle,1
3,20230509,5ccbfe853d65ca5e25692a89,8860a259bbfffff,2100,27,0,873,offline,1
4,20230509,5f521056fc5f3728f8928e57,8860b525b1fffff,1215,45,0,0,in_order,1


In [8]:
### Outliers

df_supplycursory_history['total_duration'] = df_supplycursory_history['idle_duration'] + df_supplycursory_history['others_duration'] +df_supplycursory_history['offline_duration'] 
df_supplycursory_history = df_supplycursory_history[df_supplycursory_history['total_duration'] <= 900]

In [9]:
df_1 = df_supplycursory_history

In [10]:
## AOR data

def get_order_logs_immutable(yyyymmdd_from):
    
    order_logs_immutable = f"""
        WITH order_stats AS (

            SELECT
                yyyymmdd,
                captain_id,
                pickup_location_hex_8 AS hex_id,
                quarter_hour,
                COUNT(DISTINCT(order_id)) AS order_requested,
                COUNT(DISTINCT(CASE WHEN event_type = 'accepted' THEN order_id END)) AS accepted_orders
            FROM
                (    
                SELECT 
                    date_format(date_parse(yyyymmdd, '%Y%m%d'),'%W') AS week_period,
                    yyyymmdd,
                    captain_id,
                    quarter_hour,
                    order_id,
                    city_name,
                    order_status,
                    event_type,
                    pickup_location_hex_8,
                    captain_location_hex_8
                FROM 
                    orders.order_logs_immutable
                WHERE 
                    yyyymmdd >= '{yyyymmdd_from}'
                    AND yyyymmdd <= '{yyyymmdd_to}'
                    AND service_detail_id = '5ef2bc5b85846b775f97d170'
                    AND city_name = 'Hyderabad'
                    AND service_obj_service_name = 'Auto'
                )
            GROUP BY 1,2,3,4
        )

        SELECT * FROM order_stats
    """

    order_logs_immutable_df = pd.read_sql(order_logs_immutable, connection)
    
    return order_logs_immutable_df

In [11]:
df_order_logs_immutable_backup = get_order_logs_immutable(yyyymmdd_from)
df_order_logs_immutable = df_order_logs_immutable_backup

In [12]:
df_order_logs_immutable.head()

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,order_requested,accepted_orders
0,20230511,61737c044c6ba166dbdc5dcb,8860b196dbfffff,830,10,0
1,20230511,641d78ea3c3388e5eac47651,8860a24a6dfffff,2145,2,0
2,20230509,5e47eedd6f0fbf269061e1a4,8860a25a05fffff,1745,2,0
3,20230508,,8860a25b5bfffff,2230,43,0
4,20230511,5f7ac7807d99a741665ad206,8860a25913fffff,1145,1,0


In [13]:
## Join Idle time & AOR

df_merge = pd.merge(df_1, df_order_logs_immutable,  how='left', 
                    left_on=['yyyymmdd','captain_id','hex_id','quarter_hour'], 
                    right_on = ['yyyymmdd','captain_id','hex_id','quarter_hour'])
df_merge = df_merge.fillna(0)
df_merge

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,idle_duration,others_duration,offline_duration,last_status,max_idle_duration,total_duration,order_requested,accepted_orders
0,20230508,5f3d05ed26c6c02d3740700c,8860a24b1bfffff,1545,896,0,0,idle,1,896,0.0,0.0
1,20230508,5f8a9adf402ed558da665418,8860a2586dfffff,1430,19,0,60,offline,1,79,3.0,0.0
2,20230508,61bc281d7d4982f686bcabdb,8860a24b6dfffff,2000,181,0,0,idle,1,181,6.0,0.0
3,20230509,5ccbfe853d65ca5e25692a89,8860a259bbfffff,2100,27,0,873,offline,1,900,0.0,0.0
4,20230509,5f521056fc5f3728f8928e57,8860b525b1fffff,1215,45,0,0,in_order,1,45,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1596906,20230508,641eb3d2480d2d193898da2b,8860a25861fffff,2000,12,0,709,offline,1,721,2.0,0.0
1596907,20230509,625284d7fc8d848475af2f6b,8860b52431fffff,0845,304,0,596,idle,1,900,0.0,0.0
1596908,20230511,625053da5b17eb43f7ca73d0,8860b52ce1fffff,1545,57,0,843,offline,1,900,0.0,0.0
1596909,20230511,63e5b80cf5b29b6b9a03614a,8860a25993fffff,1515,884,0,0,idle,1,884,0.0,0.0


In [14]:
# df_merge['time_bucket'] = 	np.where(df_merge['quarter_hour'].str[:2].isin(['00','01','02','03','04','05','06','07']) , 'rest_morning', 	
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['08','09','10','11']) , 'morning_peak', 	
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['12','13','14','15','16']) , 'afternoon', 
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['17','18','19','20','21']) , 'evening_peak', 'rest_evening'))))
#

In [15]:
df_2 = df_merge.groupby(['yyyymmdd','quarter_hour']).agg({'captain_id':'count', 'order_requested':'sum', 'accepted_orders':'sum' }).reset_index()
df_2

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,order_requested,accepted_orders
0,20230508,0600,2612,679.0,112.0
1,20230508,0615,2631,798.0,123.0
2,20230508,0630,2848,956.0,126.0
3,20230508,0645,3124,633.0,132.0
4,20230508,0700,3574,610.0,143.0
...,...,...,...,...,...
315,20230512,2045,3561,6335.0,397.0
316,20230512,2100,3401,5824.0,374.0
317,20230512,2115,3284,5422.0,306.0
318,20230512,2130,3227,5292.0,329.0


In [16]:
df_quarter_hour_total_captains = df_1.groupby(['yyyymmdd','quarter_hour']).captain_id.nunique().reset_index(name='all_captains')
df_quarter_hour_total_captains.head()

Unnamed: 0,yyyymmdd,quarter_hour,all_captains
0,20230508,600,2612
1,20230508,615,2631
2,20230508,630,2848
3,20230508,645,3124
4,20230508,700,3574


In [17]:
df_refined_data = pd.merge(df_2, df_quarter_hour_total_captains,  how='left', 
                    left_on=['yyyymmdd', 'quarter_hour'], 
                    right_on = ['yyyymmdd', 'quarter_hour'])
df_refined_data

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,order_requested,accepted_orders,all_captains
0,20230508,0600,2612,679.0,112.0,2612
1,20230508,0615,2631,798.0,123.0,2631
2,20230508,0630,2848,956.0,126.0,2848
3,20230508,0645,3124,633.0,132.0,3124
4,20230508,0700,3574,610.0,143.0,3574
...,...,...,...,...,...,...
315,20230512,2045,3561,6335.0,397.0,3561
316,20230512,2100,3401,5824.0,374.0,3401
317,20230512,2115,3284,5422.0,306.0,3284
318,20230512,2130,3227,5292.0,329.0,3227


In [18]:
df_refined_data['distribution'] = 100*df_refined_data['captain_id']/df_refined_data['all_captains']
df_refined_data['aor'] = 100*df_refined_data['accepted_orders']/df_refined_data['order_requested']
df_refined_data.fillna(0)

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,order_requested,accepted_orders,all_captains,distribution,aor
0,20230508,0600,2612,679.0,112.0,2612,100.0,16.494845
1,20230508,0615,2631,798.0,123.0,2631,100.0,15.413534
2,20230508,0630,2848,956.0,126.0,2848,100.0,13.179916
3,20230508,0645,3124,633.0,132.0,3124,100.0,20.853081
4,20230508,0700,3574,610.0,143.0,3574,100.0,23.442623
...,...,...,...,...,...,...,...,...
315,20230512,2045,3561,6335.0,397.0,3561,100.0,6.266772
316,20230512,2100,3401,5824.0,374.0,3401,100.0,6.421703
317,20230512,2115,3284,5422.0,306.0,3284,100.0,5.643674
318,20230512,2130,3227,5292.0,329.0,3227,100.0,6.216931


In [19]:
df_refined_data['time_bucket'] = 	np.where(df_refined_data['quarter_hour'].str[:2].isin(['00','01','02','03','04','05','06','07']) , 'rest_morning', 	
                            np.where(df_refined_data['quarter_hour'].str[:2].isin(['08','09','10','11']) , 'morning_peak', 	
                            np.where(df_refined_data['quarter_hour'].str[:2].isin(['12','13','14','15','16']) , 'afternoon', 
                            np.where(df_refined_data['quarter_hour'].str[:2].isin(['17','18','19','20','21']) , 'evening_peak', 'rest_evening'))))
df_refined_data

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,order_requested,accepted_orders,all_captains,distribution,aor,time_bucket
0,20230508,0600,2612,679.0,112.0,2612,100.0,16.494845,rest_morning
1,20230508,0615,2631,798.0,123.0,2631,100.0,15.413534,rest_morning
2,20230508,0630,2848,956.0,126.0,2848,100.0,13.179916,rest_morning
3,20230508,0645,3124,633.0,132.0,3124,100.0,20.853081,rest_morning
4,20230508,0700,3574,610.0,143.0,3574,100.0,23.442623,rest_morning
...,...,...,...,...,...,...,...,...,...
315,20230512,2045,3561,6335.0,397.0,3561,100.0,6.266772,evening_peak
316,20230512,2100,3401,5824.0,374.0,3401,100.0,6.421703,evening_peak
317,20230512,2115,3284,5422.0,306.0,3284,100.0,5.643674,evening_peak
318,20230512,2130,3227,5292.0,329.0,3227,100.0,6.216931,evening_peak


In [20]:
df_v1 = df_refined_data.groupby(['yyyymmdd']).agg({'distribution':'mean', 'aor':'mean'}).reset_index()
df_v1

Unnamed: 0,yyyymmdd,distribution,aor
0,20230508,100.0,10.987928
1,20230509,100.0,12.349907
2,20230510,100.0,11.35755
3,20230511,100.0,10.046641
4,20230512,100.0,10.090399


In [21]:
df_v2 = df_refined_data.groupby(['yyyymmdd','time_bucket']).agg({'distribution':'mean', 'aor':'mean'}).reset_index()
df_v2.head()

Unnamed: 0,yyyymmdd,time_bucket,distribution,aor
0,20230508,afternoon,100.0,10.163429
1,20230508,evening_peak,100.0,8.163379
2,20230508,morning_peak,100.0,12.072711
3,20230508,rest_morning,100.0,17.940984
4,20230509,afternoon,100.0,12.399784


In [22]:
df_v3 = pd.pivot_table(df_refined_data, values= ['aor'] , index=['time_bucket'],
                       columns=['yyyymmdd']).reset_index()
df_v3

Unnamed: 0_level_0,time_bucket,aor,aor,aor,aor,aor
yyyymmdd,Unnamed: 1_level_1,20230508,20230509,20230510,20230511,20230512
0,afternoon,10.163429,12.399784,9.96463,9.54538,9.188647
1,evening_peak,8.163379,8.220286,7.416157,7.51647,7.075309
2,morning_peak,12.072711,12.388021,11.632955,10.220278,11.531601
3,rest_morning,17.940984,22.473035,24.142525,17.277945,17.000097


In [23]:
##df_refined_data[df_refined_data['quarter_hour'] == '0900']

In [24]:
# df_refined_data.to_csv("quarter_hour_20230514.csv", index= False)