In [3]:
## Import libraries

import warnings
import pandas as pd
import numpy as np

from pyhive import presto
from h3 import h3
from IPython.core.interactiveshell import InteractiveShell
from datetime import date,datetime,timedelta

InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [4]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [5]:
## Connection

connection = presto.connect(
        host='presto-gateway.serving.data.production.internal',
        port=80,
        protocol='http',
        catalog='hive',
        username='manoj.ravirajan@rapido.bike',
    )

In [6]:
## Filter

yyyymmdd_from = '20230508'
yyyymmdd_to = '20230512'

In [7]:
## datasets.supplycursory_history

def get_captian_duration(yyyymmdd_from):
    
    hex_duration_qery = f"""
        SELECT
            *
        FROM
        (
            SELECT
                *,
                row_number() OVER (PARTITION BY yyyymmdd,captain_id,quarter_hour ORDER BY idle_duration DESC) AS max_idle_duration
            FROM
            (
                SELECT
                    yyyymmdd,
                    captain_id,
                    hex_id,
                    quarter_hour,
                    SUM(CASE WHEN status = 'idle' THEN duration ELSE 0 END) AS idle_duration,
                    SUM(CASE WHEN status = 'diff_status' THEN duration ELSE 0 END) AS others_duration,
                    SUM(CASE WHEN status = 'offline' THEN duration ELSE 0 END) AS offline_duration,
                    MAX(CASE WHEN rnk=1 THEN status END) AS last_status
                    --COUNT(hex_id) OVER(PARTITION BY yyyymmdd,captain_id,quarter_hour) AS total_hex
                FROM
                (
                    SELECT 
                        *, 
                        row_number() OVER (PARTITION BY yyyymmdd,captain_id,hex_id,quarter_hour ORDER BY hhmm DESC) AS rnk
                    FROM
                    (
                    SELECT
                        yyyymmdd,
                        captain_id,
                        location AS hex_id,
                        quarter_hour,
                        CASE 
                            WHEN status IN (4,5,9) THEN 'offline' 
                            WHEN status IN (2) THEN 'idle' 
                            ELSE 'in_order' 
                        END AS status,
                        hhmm,
                        SUM(duration) AS duration
                    FROM
                        hive.datasets.supplycursory_history
                    WHERE 
                        yyyymmdd >= '{yyyymmdd_from}'
                        AND yyyymmdd <= '{yyyymmdd_to}'
                        AND servicedetailid = '5ef2bc5b85846b775f97d170'
                        AND substr(quarter_hour,1,2)  not in ('22','23','00','01','02','03','04','05')
                    GROUP BY 1,2,3,4,5,6
                    )
                    ORDER BY 1,2,3,4,5,6
                )
                GROUP BY 1,2,3,4
                ORDER BY 1,2,3,4
            )
        )
        WHERE 
            idle_duration > 0
            AND max_idle_duration = 1
    """

    hex_duration_df = pd.read_sql(hex_duration_qery, connection)
    
    return hex_duration_df

In [8]:
df_supplycursory_history_backup = get_captian_duration(yyyymmdd_from)
df_supplycursory_history = df_supplycursory_history_backup

In [9]:
df_supplycursory_history.head()

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,idle_duration,others_duration,offline_duration,last_status,max_idle_duration
0,20230508,5f22886001c5e913ce194bd9,8860a24a1dfffff,845,876,0,0,idle,1
1,20230508,62774c275488a3b9106f1af7,8860a25b2bfffff,1500,33,0,158,in_order,1
2,20230509,5cb5f59e54bc7263ff3d0889,8860b52f4dfffff,1015,133,0,647,idle,1
3,20230509,5cc53a5554bc7263ff53acca,8860a25831fffff,1515,247,0,6,idle,1
4,20230510,5f6312f4baac866b52b4234c,8860b52733fffff,1700,900,0,0,idle,1


In [10]:
### Outliers

df_supplycursory_history['total_duration'] = df_supplycursory_history['idle_duration'] + df_supplycursory_history['others_duration'] +df_supplycursory_history['offline_duration'] 
df_supplycursory_history = df_supplycursory_history[df_supplycursory_history['total_duration'] <= 900]

In [11]:
### Sec to Min function

def get_duration_bucket(duration):
    if duration >0 and duration <= 60:
        return 1
    elif duration > 60 and duration < 120:
        return 2
    else:
        return np.int(duration/60)

In [12]:
df_1 = df_supplycursory_history

In [13]:
df_1['idle_duration'] = df_1.idle_duration.apply(lambda x :get_duration_bucket(x))
df_1['offline_duration'] = df_1.offline_duration.apply(lambda x :get_duration_bucket(x))
df_1.head()

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,idle_duration,others_duration,offline_duration,last_status,max_idle_duration,total_duration
0,20230508,5f22886001c5e913ce194bd9,8860a24a1dfffff,845,14,0,0,idle,1,876
1,20230508,62774c275488a3b9106f1af7,8860a25b2bfffff,1500,1,0,2,in_order,1,191
2,20230509,5cb5f59e54bc7263ff3d0889,8860b52f4dfffff,1015,2,0,10,idle,1,780
3,20230509,5cc53a5554bc7263ff53acca,8860a25831fffff,1515,4,0,1,idle,1,253
4,20230510,5f6312f4baac866b52b4234c,8860b52733fffff,1700,15,0,0,idle,1,900


In [14]:
## AOR data

def get_order_logs_immutable(yyyymmdd_from):
    
    order_logs_immutable = f"""
        WITH order_stats AS (

            SELECT
                yyyymmdd,
                captain_id,
                pickup_location_hex_8 AS hex_id,
                quarter_hour,
                COUNT(DISTINCT(order_id)) AS order_requested,
                COUNT(DISTINCT(CASE WHEN event_type = 'accepted' THEN order_id END)) AS accepted_orders
            FROM
                (    
                SELECT 
                    date_format(date_parse(yyyymmdd, '%Y%m%d'),'%W') AS week_period,
                    yyyymmdd,
                    captain_id,
                    quarter_hour,
                    order_id,
                    city_name,
                    order_status,
                    event_type,
                    pickup_location_hex_8,
                    captain_location_hex_8
                FROM 
                    orders.order_logs_immutable
                WHERE 
                    yyyymmdd >= '{yyyymmdd_from}'
                    AND yyyymmdd <= '{yyyymmdd_to}'
                    AND service_detail_id = '5ef2bc5b85846b775f97d170'
                    AND city_name = 'Hyderabad'
                    AND service_obj_service_name = 'Auto'
                )
            GROUP BY 1,2,3,4
        )

        SELECT * FROM order_stats
    """

    order_logs_immutable_df = pd.read_sql(order_logs_immutable, connection)
    
    return order_logs_immutable_df

In [15]:
df_order_logs_immutable_backup = get_order_logs_immutable(yyyymmdd_from)
df_order_logs_immutable = df_order_logs_immutable_backup

In [16]:
df_order_logs_immutable.head()

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,order_requested,accepted_orders
0,20230509,5f2965704b3a1541e7c5b255,8860a25a07fffff,2330,8,0
1,20230512,605017141630b45d553b75b1,8860a2591dfffff,1330,2,0
2,20230511,6132f63a0a1f1b86f5f77299,8860a25a2bfffff,800,1,0
3,20230511,5c49361b4a267149c7794575,8860a25b55fffff,1800,1,0
4,20230512,5f8166584002048a5b5739d3,8860b52d43fffff,1315,1,0


In [17]:
## Join Idle time & AOR

df_merge = pd.merge(df_1, df_order_logs_immutable,  how='left', 
                    left_on=['yyyymmdd','captain_id','hex_id','quarter_hour'], 
                    right_on = ['yyyymmdd','captain_id','hex_id','quarter_hour'])
df_merge = df_merge.fillna(0)
df_merge

Unnamed: 0,yyyymmdd,captain_id,hex_id,quarter_hour,idle_duration,others_duration,offline_duration,last_status,max_idle_duration,total_duration,order_requested,accepted_orders
0,20230508,5f22886001c5e913ce194bd9,8860a24a1dfffff,0845,14,0,0,idle,1,876,3.0,0.0
1,20230508,62774c275488a3b9106f1af7,8860a25b2bfffff,1500,1,0,2,in_order,1,191,14.0,0.0
2,20230509,5cb5f59e54bc7263ff3d0889,8860b52f4dfffff,1015,2,0,10,idle,1,780,0.0,0.0
3,20230509,5cc53a5554bc7263ff53acca,8860a25831fffff,1515,4,0,1,idle,1,253,1.0,0.0
4,20230510,5f6312f4baac866b52b4234c,8860b52733fffff,1700,15,0,0,idle,1,900,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1596906,20230511,62b6afee6a3d0d661b33af7c,8860a25865fffff,1400,1,0,11,offline,1,708,0.0,0.0
1596907,20230511,62c660d417af9a2d856c06bc,8860a258abfffff,1830,6,0,1,in_order,1,406,0.0,0.0
1596908,20230512,625c3f586bcdc3ebe4554c8d,8860a2595bfffff,1715,1,0,1,in_order,1,60,7.0,0.0
1596909,20230512,6337fe1fe90ec6025b7ba695,8860b52c07fffff,0800,15,0,0,idle,1,900,0.0,0.0


In [18]:
# df_merge['time_bucket'] = 	np.where(df_merge['quarter_hour'].str[:2].isin(['00','01','02','03','04','05','06','07']) , 'rest_morning', 	
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['08','09','10','11']) , 'morning_peak', 	
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['12','13','14','15','16']) , 'afternoon', 
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['17','18','19','20','21']) , 'evening_peak', 'rest_evening'))))
#

In [19]:
df_2 = df_merge.groupby(['yyyymmdd','quarter_hour','idle_duration', 'offline_duration']).agg({'captain_id':'count', 'order_requested':'sum', 'accepted_orders':'sum' }).reset_index()
df_2

Unnamed: 0,yyyymmdd,quarter_hour,idle_duration,offline_duration,captain_id,order_requested,accepted_orders
0,20230508,0600,1,0,70,24.0,17.0
1,20230508,0600,1,1,16,21.0,4.0
2,20230508,0600,1,2,13,6.0,1.0
3,20230508,0600,1,3,3,3.0,2.0
4,20230508,0600,1,4,3,0.0,0.0
...,...,...,...,...,...,...,...
35486,20230512,2145,13,1,6,3.0,1.0
35487,20230512,2145,13,2,6,1.0,0.0
35488,20230512,2145,14,0,34,8.0,0.0
35489,20230512,2145,14,1,15,3.0,0.0


In [20]:
df_quarter_hour_total_captains = df_1.groupby(['yyyymmdd','quarter_hour']).captain_id.nunique().reset_index(name='all_captains')
df_quarter_hour_total_captains.head()

Unnamed: 0,yyyymmdd,quarter_hour,all_captains
0,20230508,600,2612
1,20230508,615,2631
2,20230508,630,2848
3,20230508,645,3124
4,20230508,700,3574


In [21]:
df_refined_data = pd.merge(df_2, df_quarter_hour_total_captains,  how='left', 
                    left_on=['yyyymmdd', 'quarter_hour'], 
                    right_on = ['yyyymmdd', 'quarter_hour'])
df_refined_data

Unnamed: 0,yyyymmdd,quarter_hour,idle_duration,offline_duration,captain_id,order_requested,accepted_orders,all_captains
0,20230508,0600,1,0,70,24.0,17.0,2612
1,20230508,0600,1,1,16,21.0,4.0,2612
2,20230508,0600,1,2,13,6.0,1.0,2612
3,20230508,0600,1,3,3,3.0,2.0,2612
4,20230508,0600,1,4,3,0.0,0.0,2612
...,...,...,...,...,...,...,...,...
35486,20230512,2145,13,1,6,3.0,1.0,2960
35487,20230512,2145,13,2,6,1.0,0.0,2960
35488,20230512,2145,14,0,34,8.0,0.0,2960
35489,20230512,2145,14,1,15,3.0,0.0,2960


In [22]:
df_refined_data['distribution'] = 100*df_refined_data['captain_id']/df_refined_data['all_captains']
df_refined_data['aor'] = 100*df_refined_data['accepted_orders']/df_refined_data['order_requested']
df_refined_data.fillna(0)

Unnamed: 0,yyyymmdd,quarter_hour,idle_duration,offline_duration,captain_id,order_requested,accepted_orders,all_captains,distribution,aor
0,20230508,0600,1,0,70,24.0,17.0,2612,2.679939,70.833333
1,20230508,0600,1,1,16,21.0,4.0,2612,0.612557,19.047619
2,20230508,0600,1,2,13,6.0,1.0,2612,0.497703,16.666667
3,20230508,0600,1,3,3,3.0,2.0,2612,0.114855,66.666667
4,20230508,0600,1,4,3,0.0,0.0,2612,0.114855,0.000000
...,...,...,...,...,...,...,...,...,...,...
35486,20230512,2145,13,1,6,3.0,1.0,2960,0.202703,33.333333
35487,20230512,2145,13,2,6,1.0,0.0,2960,0.202703,0.000000
35488,20230512,2145,14,0,34,8.0,0.0,2960,1.148649,0.000000
35489,20230512,2145,14,1,15,3.0,0.0,2960,0.506757,0.000000


In [23]:
##df_refined_data[df_refined_data['quarter_hour'] == '0900']

In [24]:
# df_refined_data.to_csv("quarter_hour_20230514.csv", index= False)

In [25]:
df_v1 = df_refined_data.groupby(['yyyymmdd','idle_duration', 'offline_duration']).agg({'distribution':'mean', 'aor':'mean'}).reset_index()
df_v2 = df_v1[df_v1['idle_duration'] + df_v1['offline_duration'] == 15]
df_v2.head()

Unnamed: 0,yyyymmdd,idle_duration,offline_duration,distribution,aor
14,20230508,1,14,2.987359,4.161406
28,20230508,2,13,1.280754,3.252076
41,20230508,3,12,0.026277,
53,20230508,4,11,0.027388,
64,20230508,5,10,0.020966,


In [26]:
df_v2.pivot(index = 'idle_duration' , columns ='yyyymmdd', values =['distribution', 'aor']).fillna(0)

Unnamed: 0_level_0,distribution,distribution,distribution,distribution,distribution,aor,aor,aor,aor,aor
yyyymmdd,20230508,20230509,20230510,20230511,20230512,20230508,20230509,20230510,20230511,20230512
idle_duration,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,2.987359,2.855818,2.884388,2.914177,2.960824,4.161406,3.730198,4.19579,3.479216,2.971596
2,1.280754,1.488792,1.180991,1.086863,1.009892,3.252076,2.120401,0.0,2.827381,7.514368
3,0.026277,0.024281,0.026309,0.020473,0.026205,0.0,0.0,0.0,0.0,0.0
4,0.027388,0.018819,0.02703,0.02642,0.024668,0.0,0.0,0.0,0.0,0.0
5,0.020966,0.023541,0.027217,0.031514,0.028485,0.0,0.0,0.0,0.0,0.0
6,0.021484,0.022117,0.032808,0.023866,0.027568,0.0,0.0,0.0,0.0,0.0
7,0.025185,0.017831,0.022346,0.025445,0.020757,0.0,0.0,0.0,0.0,0.0
8,0.023091,0.020495,0.033316,0.025042,0.023649,0.0,0.0,0.0,0.0,0.0
9,0.023011,0.02194,0.021281,0.023468,0.022464,0.0,0.0,0.0,0.0,0.0
10,0.020067,0.020135,0.026231,0.024352,0.023301,0.0,0.0,0.0,0.0,0.0


In [27]:
df_v3 = pd.pivot_table(df_v1, values= ['distribution', 'aor'] , index=['idle_duration', 'offline_duration'],
                       columns=['yyyymmdd']).reset_index()
df_v3

Unnamed: 0_level_0,idle_duration,offline_duration,aor,aor,aor,aor,aor,distribution,distribution,distribution,distribution,distribution
yyyymmdd,Unnamed: 1_level_1,Unnamed: 2_level_1,20230508,20230509,20230510,20230511,20230512,20230508,20230509,20230510,20230511,20230512
0,1,0,29.211972,33.916411,31.650142,28.182791,27.80008,8.702217,7.231098,8.209783,8.205302,8.44271
1,1,1,10.088643,12.6889,13.520154,9.27311,10.029686,7.075856,5.087936,9.336442,11.318868,12.365078
2,1,2,8.564362,11.175651,10.006869,6.863913,7.473819,1.834534,1.456429,2.391685,2.770135,3.174974
3,1,3,12.301921,9.453877,7.649813,11.058948,8.98571,0.690891,0.537861,0.826432,0.923788,0.998284
4,1,4,9.695521,10.452565,8.119973,7.889605,7.485502,0.637621,0.491651,0.783838,0.825245,0.959435
5,1,5,6.542313,9.81135,6.517625,7.306819,9.61033,0.616327,0.501905,0.732762,0.804961,0.89197
6,1,6,6.938497,11.178568,9.881249,5.725383,7.376569,0.613214,0.486183,0.70044,0.793075,0.833757
7,1,7,8.357475,11.500189,5.765322,9.554226,8.582761,0.633896,0.460295,0.692291,0.821834,0.825652
8,1,8,6.382045,6.581391,5.522401,6.287683,6.51791,0.613477,0.502307,0.719832,0.791913,0.853539
9,1,9,5.531337,8.172036,8.037857,4.088548,4.399927,0.615674,0.498048,0.704879,0.797129,0.877483
