### Import libraries

In [1]:
import warnings
import pandas as pd
import numpy as np

from pyhive import presto
from h3 import h3
from IPython.core.interactiveshell import InteractiveShell
from datetime import date,datetime,timedelta

# Render the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): this silences every warning category for the session, deprecation notices included.
warnings.filterwarnings('ignore')

### Connection & parameters

In [2]:
# Presto connection settings, gathered in one mapping and unpacked into the connect call.
# NOTE(review): host and username are hard-coded here — consider moving them to configuration.
presto_settings = dict(
    host='presto-gateway.serving.data.plectrum.dev',
    port=443,
    protocol='https',
    catalog='hive',
    username='manoj.ravirajan@rapido.bike',
)
connection = presto.connect(**presto_settings)

In [3]:
# Date partition (yyyymmdd string) passed to get_captian_duration to select a single day of data.
yyyymmdd_from = '20230508'

### Reading dataset service_mapping

In [None]:
def get_service_mapping():
    """Fetch the 'Auto' service rows for Hyderabad from hive.datasets.service_mapping.

    The query runs over the notebook-level ``connection``; the service and city
    filters are fixed inside the query text.

    Returns:
        pandas.DataFrame with the columns service_detail_id, service_name,
        service_category, service_id, city and city_id.
    """
    sql = """
    SELECT service_detail_id,
            service_level as service_name,
            service_category,
            service_id,
            city_display_name as city,
            city_id
    FROM hive.datasets.service_mapping
    WHERE 
        service_level = 'Auto'
        AND city_display_name = 'Hyderabad'
    """
    return pd.read_sql(sql, connection)

# Load the mapping once, then preview its first five rows.
service_mapping_df = get_service_mapping()
service_mapping_df.head(n=5)

In [5]:
# Count the distinct service ids returned by the mapping query.
service_mapping_df.service_id.nunique()

1

### Reading dataset supplycursory_history

In [23]:
def get_captian_duration(yyyymmdd_from, service_detail_id='5ef2bc5b85846b775f97d170'):
    """Return summed status durations per captain and quarter hour for one day.

    Reads hive.datasets.supplycursory_history over the notebook-level
    ``connection``, de-duplicates the raw rows, maps ``status`` to a bucket
    (2 -> 'idle'; 4, 5, 9 -> 'offline'; anything else -> 'in_an_order') and sums
    ``duration`` per (yyyymmdd, quarter_hour, captain_id, servicedetailid,
    status_bucket).

    Args:
        yyyymmdd_from: Day to read, as a ``yyyymmdd`` string (or an int that
            formats to one), e.g. ``'20230508'``.
        service_detail_id: Value matched against ``servicedetailid``. Defaults to
            the id this notebook was originally hard-coded to.

    Returns:
        pandas.DataFrame with columns yyyymmdd, quarter_hour, captain_id,
        servicedetailid, status_bucket and duration (presumably seconds — the
        sample output tops out at 900 per quarter hour; TODO confirm).

    Raises:
        ValueError: If ``yyyymmdd_from`` is not a valid 8-digit calendar date or
            ``service_detail_id`` is not purely ASCII alphanumeric.
    """
    # Both values are interpolated into the SQL text, so validate them before
    # building the query or touching the connection.
    day = str(yyyymmdd_from)
    if len(day) != 8 or not (day.isascii() and day.isdigit()):
        raise ValueError(f"yyyymmdd_from must be an 8-digit yyyymmdd string, got {yyyymmdd_from!r}")
    try:
        datetime.strptime(day, '%Y%m%d')
    except ValueError:
        raise ValueError(f"yyyymmdd_from is not a valid calendar date: {yyyymmdd_from!r}") from None

    service_id = str(service_detail_id)
    if not (service_id.isascii() and service_id.isalnum()):
        raise ValueError(f"service_detail_id must be ASCII alphanumeric, got {service_detail_id!r}")

    duration_query = f"""
        select 
            yyyymmdd, quarter_hour, captain_id, servicedetailid, 
            status_bucket,
            sum(duration) duration
        from
        (
            select
                distinct
                yyyymmdd,
                quarter_hour,
                hhmm,
                FROM_UNIXTIME(CAST(epoch AS DOUBLE) /1000) epoch,
                captain_id,
                servicedetailid,
                location,
                case
                when status = 2 then 'idle'
                when status in (4,5,9) then 'offline'
                else 'in_an_order' end 
                status_bucket,
                duration
            from
                hive.datasets.supplycursory_history
            where 
                yyyymmdd = '{day}'
                and servicedetailid = '{service_id}'
        )
        group by
            yyyymmdd, quarter_hour, captain_id, servicedetailid, 
            status_bucket
    """

    return pd.read_sql(duration_query, connection)

In [24]:
# Pull the per-captain status durations for the configured day.
hex_duration_pan = get_captian_duration(yyyymmdd_from=yyyymmdd_from)

In [25]:
# Sample data filter 

# Spot-check: one captain, one quarter hour, one service detail id.
hex_duration_pan.loc[
    hex_duration_pan['captain_id'].eq('5fa1da792f1376c623a042f2')
    & hex_duration_pan['quarter_hour'].eq('0900')
    & hex_duration_pan['servicedetailid'].eq('5ef2bc5b85846b775f97d170')
]

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,status_bucket,duration
64106,20230508,900,5fa1da792f1376c623a042f2,5ef2bc5b85846b775f97d170,offline,91
1903395,20230508,900,5fa1da792f1376c623a042f2,5ef2bc5b85846b775f97d170,in_an_order,338
2326584,20230508,900,5fa1da792f1376c623a042f2,5ef2bc5b85846b775f97d170,idle,301


In [26]:
def get_duration_bucket(duration):
    """Convert a duration into a whole-number bucket of 60 units.

    Args:
        duration: Numeric duration (presumably seconds — TODO confirm).

    Returns:
        int: 1 for durations in (0, 60], 2 for durations in (60, 120), and
        ``int(duration / 60)`` (truncated toward zero) otherwise. Values below
        120 therefore round up while larger values round down (338 -> 5), and
        a duration of 0 yields 0.
    """
    if 0 < duration <= 60:
        return 1
    if 60 < duration < 120:
        return 2
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24;
    # calling the builtin directly gives the same result on every NumPy version.
    return int(duration / 60)

In [27]:
# Bucket every row's duration; the function is passed directly, no lambda wrapper needed.
hex_duration_pan['duration_bucket'] = hex_duration_pan['duration'].apply(get_duration_bucket)
hex_duration_pan.head()

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,status_bucket,duration,duration_bucket
0,20230508,1130,607d98bfc11373156dba4db3,5ef2bc5b85846b775f97d170,in_an_order,900,15
1,20230508,1730,5d9883e761bf5a14d0939067,5ef2bc5b85846b775f97d170,offline,900,15
2,20230508,1730,635a170260f2cc3cbd1d22ba,5ef2bc5b85846b775f97d170,offline,900,15
3,20230508,2330,6394260492c589f2ba667545,5ef2bc5b85846b775f97d170,in_an_order,592,9
4,20230508,1400,63134f638aba0162e1cdc1f3,5ef2bc5b85846b775f97d170,offline,900,15


In [28]:
# One row per (quarter_hour, captain_id), with the summed duration_bucket of each
# status_bucket spread into its own column.
# aggfunc is the string 'sum' rather than np.sum: pandas already maps np.sum to its own
# grouped sum and warns that passing the callable is deprecated, so the string keeps the
# current behaviour without relying on that mapping.
duration_bucket = hex_duration_pan.pivot_table(
    values='duration_bucket',
    index=['quarter_hour', 'captain_id'],
    columns='status_bucket',
    aggfunc='sum',
).reset_index()

In [29]:
# A status a captain never had in a quarter hour comes out of the pivot as NaN; treat it as 0.
duration_bucket = duration_bucket.fillna(value=0)
duration_bucket

status_bucket,quarter_hour,captain_id,idle,in_an_order,offline
0,0000,5761079236859ea47773e789,0.0,0.0,15.0
1,0000,582e91bd738d1a4b612cd6c5,0.0,0.0,15.0
2,0000,5852191fcc10f17c57f86d52,0.0,0.0,15.0
3,0000,5857684b417054710ef1db86,1.0,0.0,14.0
4,0000,585a353e014b512b577aabc4,0.0,0.0,15.0
...,...,...,...,...,...
2339446,2345,6458942c26cb9951c5720efe,0.0,0.0,15.0
2339447,2345,6458ac3ebc6df97bab66384b,0.0,0.0,15.0
2339448,2345,6458bbf718815f63ed448ba7,0.0,0.0,15.0
2339449,2345,6458bc92edfb8cc029a3b546,11.0,1.0,3.0


In [34]:
# Keep the rows where the captain has any idle or in-order buckets in the quarter hour.
captain_all = duration_bucket.loc[
    duration_bucket['idle'].gt(0.0) | duration_bucket['in_an_order'].gt(0.0)
]
captain_all

status_bucket,quarter_hour,captain_id,idle,in_an_order,offline
3,0000,5857684b417054710ef1db86,1.0,0.0,14.0
15,0000,59d8758769a56ea91cde6f4b,2.0,2.0,12.0
23,0000,59eb341d0867301dd4efceb3,0.0,15.0,0.0
25,0000,59f3487e133430667879b818,1.0,5.0,8.0
30,0000,5a0b0c2aee2c2d70c3838abe,3.0,1.0,11.0
...,...,...,...,...,...
2339411,2345,64559de45e85bf5e5a43ba74,0.0,15.0,0.0
2339413,2345,6455c004ba906f3895cbda93,15.0,0.0,0.0
2339417,2345,6455f3695e85bf0eaf51527c,5.0,9.0,0.0
2339426,2345,645645af78d8e993635df851,4.0,10.0,0.0


In [21]:
# Distinct captains for every (quarter_hour, idle, offline) combination.
new_df = (
    captain_all
    .groupby(['quarter_hour', 'idle', 'offline'])['captain_id']
    .nunique()
    .reset_index()
)

Unnamed: 0,quarter_hour,idle,offline,captain_id
0,0900,0.0,0.0,1133
1,0900,0.0,1.0,8
2,0900,0.0,2.0,5
3,0900,0.0,3.0,2
4,0900,0.0,4.0,7
...,...,...,...,...
122,0900,13.0,1.0,35
123,0900,13.0,2.0,151
124,0900,14.0,0.0,406
125,0900,14.0,1.0,71


In [14]:
# Rows with some idle buckets, no in-order buckets, and an offline total other than 15.
duration_bucket_refined = duration_bucket.loc[
    duration_bucket['idle'].gt(0)
    & duration_bucket['in_an_order'].eq(0)
    & duration_bucket['offline'].ne(15)
]
duration_bucket_refined

status_bucket,quarter_hour,captain_id,idle,in_an_order,offline
12,0900,599d675b5471495c3aa81a72,2.0,0.0,12.0
17,0900,59e012e49d45a9a5389bda7b,5.0,0.0,9.0
106,0900,5b8d009698cab328cc081bae,15.0,0.0,0.0
176,0900,5bdfa96ad0267d2f1909c861,6.0,0.0,8.0
181,0900,5be415b9e0dc593a84f39663,11.0,0.0,3.0
...,...,...,...,...,...
24214,0900,6449d4a3ce292858bb29dbeb,13.0,0.0,2.0
24226,0900,644a5ac580f9d44ab6b79e20,6.0,0.0,8.0
24260,0900,644f1bc3b4c619019982b0d8,13.0,0.0,2.0
24330,0900,6455c004ba906f3895cbda93,15.0,0.0,0.0


In [22]:
# Row count for each (idle, offline) bucket combination.
(
    duration_bucket_refined
    .groupby(['idle', 'offline'])['captain_id']
    .count()
    .reset_index()
)

Unnamed: 0,idle,offline,captain_id
0,1.0,14.0,136
1,2.0,12.0,40
2,2.0,13.0,62
3,3.0,11.0,35
4,3.0,12.0,1
5,4.0,10.0,44
6,4.0,11.0,1
7,5.0,9.0,39
8,6.0,7.0,1
9,6.0,8.0,27


In [16]:
# Distinct captains per quarter hour and idle bucket.
(
    duration_bucket_refined
    .groupby(['quarter_hour', 'idle'])['captain_id']
    .nunique()
    .reset_index()
)

Unnamed: 0,quarter_hour,idle,captain_id
0,900,1.0,136
1,900,2.0,102
2,900,3.0,36
3,900,4.0,45
4,900,5.0,39
5,900,6.0,28
6,900,7.0,26
7,900,8.0,31
8,900,9.0,18
9,900,10.0,22


In [17]:
# Same aggregation as the cell above, kept under a name for the following steps.
duration_bucket_refined_v1 = (
    duration_bucket_refined
    .groupby(['quarter_hour', 'idle'])['captain_id']
    .nunique()
    .reset_index()
)

In [18]:
# Display the distinct-captain counts per (quarter_hour, idle).
duration_bucket_refined_v1

Unnamed: 0,quarter_hour,idle,captain_id
0,900,1.0,136
1,900,2.0,102
2,900,3.0,36
3,900,4.0,45
4,900,5.0,39
5,900,6.0,28
6,900,7.0,26
7,900,8.0,31
8,900,9.0,18
9,900,10.0,22


In [19]:
# Mean distinct-captain count per idle bucket, averaged over quarter hours.
(
    duration_bucket_refined_v1
    .groupby(['idle'])['captain_id']
    .mean()
    .reset_index()
)

Unnamed: 0,idle,captain_id
0,1.0,136
1,2.0,102
2,3.0,36
3,4.0,45
4,5.0,39
5,6.0,28
6,7.0,26
7,8.0,31
8,9.0,18
9,10.0,22


In [20]:
# Mean captain count per idle bucket, plus the overall distinct-captain total as denominator.
view1 = (
    duration_bucket_refined_v1
    .groupby(['idle'])['captain_id']
    .mean()
    .reset_index()
)
view1['all_captain'] = captain_all['captain_id'].nunique()

# Percentage is computed from the unrounded mean; the count itself is rounded afterwards.
df_view = (
    view1
    .assign(captain_percent=view1['captain_id'] * 100 / view1['all_captain'])
    .rename(columns={'captain_id': 'captains'})
    .assign(captains=lambda frame: frame['captains'].round())
)
df_view.round(2)

Unnamed: 0,idle,captains,all_captain,captain_percent
0,1.0,136,6725,2.02
1,2.0,102,6725,1.52
2,3.0,36,6725,0.54
3,4.0,45,6725,0.67
4,5.0,39,6725,0.58
5,6.0,28,6725,0.42
6,7.0,26,6725,0.39
7,8.0,31,6725,0.46
8,9.0,18,6725,0.27
9,10.0,22,6725,0.33


1. status | duration combination 
2. Percentage calculation based on all captains
3. apr/aor column 