In [1]:
import warnings
import pandas as pd
import numpy as np

from pyhive import presto
from h3 import h3
from IPython.core.interactiveshell import InteractiveShell
from datetime import date,datetime,timedelta

InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings('ignore')

#  Presto Config

In [2]:
# Open the Presto session that every query in this notebook runs through.
# NOTE(review): the username is hard-coded to one person's address; consider
# reading it from the environment before sharing this notebook.
connection = presto.connect(
    catalog="hive",
    host="presto-gateway.serving.data.plectrum.dev",
    port=443,
    protocol="https",
    username="dharmendra.k@rapido.bike",
)


In [3]:
# Analysis window as yyyymmdd partition strings. Only `yyyymmdd_from` is
# passed to the query helpers in the cells visible here; `yyyymmdd_to` is
# defined but not referenced by them.
yyyymmdd_from, yyyymmdd_to = '20230403', '20230409'

# Get raw base data for captains' online duration

In [4]:

# online_duration_qery = f"""
# select 
#     *,
#     0.0 start_epoch,
#     0.0 end_epoch
# from 
# (
# select
#     service service_level, 
#     servicedetailid,
#     captain_id,
#     location,
#     description status,
#     duration,
#     epoch,
#     yyyymmdd,
#     quarter_hour,
#     row_number() over( partition by yyyymmdd,quarter_hour,captain_id,servicedetailid order by duration desc ) n_rank
# from
#     hive.datasets.supplycursory_history
# where 
#     yyyymmdd = '20230403'
#     and description = 'ready_for_ride'
# )
# where n_rank = 1
# """

# online_duration_df = pd.read_sql(online_duration_qery, connection)


In [5]:
def get_service_mapping():
    """Load the service-level mapping table into a DataFrame.

    Runs one SELECT against ``hive.datasets.service_level_mapping`` through
    the notebook-level ``connection``, aliasing ``service_level`` to
    ``service_name`` and ``city_display_name`` to ``city``.

    Returns:
        pandas.DataFrame with the columns service_detail_id, service_name,
        service_category, service_id, city and city_id.
    """
    # Nothing is interpolated into this query, so a plain literal suffices.
    mapping_sql = """
    SELECT service_detail_id,
            service_level as service_name,
            service_category,
            service_id,
            city_display_name as city,
            city_id
    FROM hive.datasets.service_level_mapping

    """
    return pd.read_sql(mapping_sql, connection)

service_mapping_df = get_service_mapping()

In [6]:
service_mapping_df.head()

Unnamed: 0,service_detail_id,service_name,service_category,service_id,city,city_id
0,61a5d4076b307119c988d0ca,swiggy,delivery,58dcea3824565ce21202bca3,bhilai,5c7587c672b3484f01b59f08
1,6156687ffb9a73c501952288,swiggy,delivery,58dcea3824565ce21202bca3,mumbai,5bc5ac7012477c2ece769595
2,60fed3a85d74ca03f9e71ef2,auto,auto,5bd6c6e2e79cc313a94728d0,mumbai,5bc5ac7012477c2ece769595
3,5d46e8d7452374735369f57a,eat fit,delivery,5d075ec4d6e1d6165cb75cc0,hyderabad,5740135d4fdf4798208bba24
4,60c1c498907ff64c49c63182,swiggy,delivery,58dcea3824565ce21202bca3,jodhpur,5c30be96f0c2c508a5be1929


In [7]:
service_mapping_df.query("city == 'bangalore'")


Unnamed: 0,service_detail_id,service_name,service_category,service_id,city,city_id
294,5e8c1663290a9555210a3ae3,kaleyra,delivery,5e8c12a4290a9555210a3ae2,bangalore,572ca7ff116b5db3057bd814
295,60663b9371be8afb3ceeaf30,wndw,delivery,6065cf425c3d1c2ddf8cf3da,bangalore,572ca7ff116b5db3057bd814
296,5faac5a18e21b80007ce9142,paytm mall,delivery,5faac4b78e21b80007ce9055,bangalore,572ca7ff116b5db3057bd814
297,5f5917da73447c3f60d81cec,the garima,delivery,5f59178d69358ca50e40754e,bangalore,572ca7ff116b5db3057bd814
298,5ebd4480f6ada1b0d4b4d736,signcatch,delivery,5ebd4454f6ada1260cb4d735,bangalore,572ca7ff116b5db3057bd814
299,58dceb1f24565ce21202bca4,swiggy,delivery,58dcea3824565ce21202bca3,bangalore,572ca7ff116b5db3057bd814
300,605c320a9f725e223f8293f2,freshmenu,delivery,605ad6d577bf240d4569202a,bangalore,572ca7ff116b5db3057bd814
301,5f11349e3119fa0d308853c3,bfs,delivery,5f0415df46d6a968444d882d,bangalore,572ca7ff116b5db3057bd814
302,5e8a19c23c89412b94731fbc,c2c,c2c,5e8a15fe3c89412b94731fbb,bangalore,572ca7ff116b5db3057bd814
303,60b4938309c59b993ad673d0,dotpe (0-5km),delivery,60ab3da44abdf2bf93239886,bangalore,572ca7ff116b5db3057bd814


In [8]:
service_mapping_df.service_detail_id.count()/service_mapping_df.service_detail_id.nunique()

1.0

In [9]:
def get_captian_duration(yyyymmdd_from):
    """Fetch summed captain durations for one day from supplycursory_history.

    The query keeps rows with ``status = 2`` in the quarter hours '0900',
    '0930', '0945' and '1300' and sums ``duration`` per
    (yyyymmdd, quarter_hour, captain_id, servicedetailid, location).

    Args:
        yyyymmdd_from: Partition date string that is interpolated directly
            into the SQL text (the notebook passes '20230403').

    Returns:
        pandas.DataFrame with one row per group and its summed ``duration``.
    """
    # NOTE(review): '0915' is absent from the quarter_hour list below, while
    # the ready_for_ride queries elsewhere in this notebook include it --
    # confirm whether the omission is intentional.
    duration_sql = f"""
        select
            yyyymmdd,
            quarter_hour,
            captain_id,
            servicedetailid,
            location,
            sum(duration) duration
        from
            hive.datasets.supplycursory_history
        where 
            yyyymmdd = '{yyyymmdd_from}'
            -- and description = 'ready_for_ride'
            and status = 2
            and quarter_hour in ('0900', '0930', '0945', '1300')
        group by
            yyyymmdd, quarter_hour, captain_id, servicedetailid, location
    """

    return pd.read_sql(duration_sql, connection)


In [20]:
yyyymmdd_from

'20230403'

In [10]:
hex_duration_pan = get_captian_duration(yyyymmdd_from)

In [11]:
hex_duration_pan[
    (hex_duration_pan['location']=='883c138c91fffff') & 
    (hex_duration_pan['quarter_hour']=='0930')  & 
    (hex_duration_pan['servicedetailid']=='5f6f21fa59acf500076182b1') 
]

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration
147709,20230403,930,635a1391ce611c8d49fcc51e,5f6f21fa59acf500076182b1,883c138c91fffff,445
213518,20230403,930,6177b20e4c6ba117cfdf5ec6,5f6f21fa59acf500076182b1,883c138c91fffff,900
296235,20230403,930,5f0ad6ed1a58a130bfa26076,5f6f21fa59acf500076182b1,883c138c91fffff,360
574238,20230403,930,5faaa7ecdf5fb96bd76d0b94,5f6f21fa59acf500076182b1,883c138c91fffff,900


In [13]:
# Attach service and city attributes to every duration row. The left join
# keeps rows whose servicedetailid has no match in the mapping table.
hex_duration_df1 = pd.merge(
    hex_duration_pan,
    service_mapping_df[
        ['service_detail_id', 'service_id', 'city', 'service_name', 'service_category']
    ],
    how='left',
    left_on='servicedetailid',
    right_on='service_detail_id',
)


In [14]:
def get_duration_bucket(duration):
    """Map a duration to its reporting bucket.

    Args:
        duration: Numeric duration (presumably seconds, since the whole
            bucket is ``duration / 60`` -- TODO confirm the unit).

    Returns:
        0.5 when 0 < duration <= 30, 0.9 when 30 < duration < 60, and
        ``np.floor(duration / 60)`` for every other value. Zero and
        negative inputs therefore also take the floor branch.
    """
    if 0 < duration <= 30:
        return 0.5
    if 30 < duration < 60:
        return .9
    # Everything else, including exactly 60, is bucketed by whole units of 60.
    return np.floor(duration / 60)

In [82]:
get_duration_bucket(61)

1.0

In [16]:
# Bucket every row's duration; the function is passed directly, no lambda needed.
hex_duration_df1['duration_bucket'] = hex_duration_df1['duration'].apply(get_duration_bucket)

### PAN

In [17]:
# Row count per (day, service category, duration bucket).
hex_duration_summ1 = (
    hex_duration_df1
    .groupby(['yyyymmdd', 'service_category', 'duration_bucket'])['captain_id']
    .count()
    .reset_index(name='records')
)

# Row count per (day, service category): the denominator for the share below.
hex_duration_summ2 = (
    hex_duration_df1
    .groupby(['yyyymmdd', 'service_category'])['captain_id']
    .count()
    .reset_index(name='total')
)

# Percentage share of each duration bucket within its service category.
hex_duration_summary = (
    pd.merge(
        hex_duration_summ1,
        hex_duration_summ2,
        on=['yyyymmdd', 'service_category'],
    )
    .assign(
        **{'#distribution': lambda merged: merged['records'].mul(100).div(merged['total'])}
    )
)

# hex_duration_summary.to_csv('hex_duration_summary_pan.csv',index=False)
hex_duration_summary


Unnamed: 0,yyyymmdd,service_category,duration_bucket,records,total,#distribution
0,20230403,auto,0.5,42623,310100,13.744921
1,20230403,auto,0.9,28571,310100,9.213480
2,20230403,auto,1.0,31493,310100,10.155756
3,20230403,auto,2.0,22345,310100,7.205740
4,20230403,auto,3.0,17494,310100,5.641406
...,...,...,...,...,...,...
63,20230403,link,11.0,3051,187744,1.625085
64,20230403,link,12.0,2908,187744,1.548918
65,20230403,link,13.0,3042,187744,1.620291
66,20230403,link,14.0,7530,187744,4.010781


### Cities

In [18]:
# Focus cities for the per-city breakdown.
cities = ['chennai', 'bangalore', 'hyderabad', 'delhi', 'kolkata', 'jaipur']
service = ['link', 'auto']  # not referenced by any cell visible in this notebook

hex_duration_cities = hex_duration_df1.loc[hex_duration_df1['city'].isin(cities)]

# Row count per (day, city, service category, duration bucket).
hex_duration_summ10 = (
    hex_duration_cities
    .groupby(['yyyymmdd', 'city', 'service_category', 'duration_bucket'])['captain_id']
    .count()
    .reset_index(name='records')
)

# Row count per (day, city, service category): the denominator for the share.
hex_duration_summ20 = (
    hex_duration_cities
    .groupby(['yyyymmdd', 'city', 'service_category'])['captain_id']
    .count()
    .reset_index(name='total')
)

# Percentage share of each duration bucket within its city and category.
hex_duration_summary1 = (
    pd.merge(
        hex_duration_summ10,
        hex_duration_summ20,
        on=['yyyymmdd', 'city', 'service_category'],
    )
    .assign(
        **{'#distribution': lambda merged: merged['records'].mul(100).div(merged['total'])}
    )
)

# Persist the per-city distribution next to the notebook.
hex_duration_summary1.to_csv('hex_duration_summary_cities.csv', index=False)
hex_duration_summary1

Unnamed: 0,yyyymmdd,city,service_category,duration_bucket,records,total,#distribution
0,20230403,bangalore,auto,0.5,20676,69100,29.921852
1,20230403,bangalore,auto,0.9,9392,69100,13.591896
2,20230403,bangalore,auto,1.0,9969,69100,14.426918
3,20230403,bangalore,auto,2.0,6223,69100,9.005789
4,20230403,bangalore,auto,3.0,4240,69100,6.136035
...,...,...,...,...,...,...,...
386,20230403,kolkata,link,11.0,326,14938,2.182354
387,20230403,kolkata,link,12.0,327,14938,2.189048
388,20230403,kolkata,link,13.0,386,14938,2.584014
389,20230403,kolkata,link,14.0,1196,14938,8.006427


# Get raw base data for captains selected for supply

In [19]:
def get_idle_duration(yyyymmdd_from):
    """Fetch ready_for_ride duration in each captain's top-ranked location.

    The ``idle_supply`` CTE sums ``duration`` per (yyyymmdd, quarter_hour,
    captain_id, servicedetailid, location) for 'ready_for_ride' rows in the
    quarter hours '0900', '0915', '0930', '0945' and '1300'. It then keeps
    the row ranked first by ``duration desc`` within each (yyyymmdd,
    quarter_hour, captain_id, servicedetailid). The outer query inner-joins
    the history table back to that CTE on all five keys and sums
    ``duration`` again.

    Args:
        yyyymmdd_from: Partition date string that is interpolated directly
            into the SQL text, in both the CTE and the outer query.

    Returns:
        pandas.DataFrame with yyyymmdd, quarter_hour, captain_id,
        servicedetailid, location and the summed ``duration``.
    """
    top_location_sql = f"""
    with idle_supply as (
        select
            *
        from 
            (
                select 
                    *,
                    row_number() over(partition by yyyymmdd,quarter_hour, captain_id, servicedetailid order by duration desc) n_rank
                from 
                    (
                    select
                        yyyymmdd,
                        quarter_hour,
                        captain_id,
                        servicedetailid,
                        location,
                        sum(duration) duration
                    from
                        hive.datasets.supplycursory_history
                    where 
                        yyyymmdd = '{yyyymmdd_from}'
                        and quarter_hour in ('0900','0915','0930','0945', '1300')
                        and description = 'ready_for_ride'
                        group by 1,2,3,4,5
                    )
                )
                where n_rank = 1
            )

    select
        sch.yyyymmdd,
        sch.quarter_hour,
        sch.captain_id,
        sch.servicedetailid,
        sch.location,
        sum(sch.duration) duration
     from
        hive.datasets.supplycursory_history sch inner join 
        idle_supply 
        on sch.yyyymmdd = idle_supply.yyyymmdd
        and sch.quarter_hour = idle_supply.quarter_hour
        and sch.captain_id = idle_supply.captain_id
        and sch.servicedetailid = idle_supply.servicedetailid
        and sch.location = idle_supply.location
    where 
        sch.yyyymmdd = '{yyyymmdd_from}'
        -- and sch.quarter_hour = '0930'
        and sch.description = 'ready_for_ride'
    group by 
        1,2,3,4,5
    """

    return pd.read_sql(top_location_sql, connection)




### Supply Functions

In [20]:
def get_idle_captains(yyyymmdd_from):
    """Fetch ready_for_ride durations per location, ranked within a captain.

    The inner query sums ``duration`` per (yyyymmdd, quarter_hour,
    captain_id, servicedetailid, location) for 'ready_for_ride' rows in the
    quarter hours '0900', '0915', '0930', '0945' and '1300'. The outer query
    adds ``n_rank``, a ``row_number()`` ordered by ``duration desc`` within
    each (yyyymmdd, quarter_hour, captain_id, servicedetailid). All ranks
    are returned; callers filter on ``n_rank`` themselves.

    Args:
        yyyymmdd_from: Partition date string that is interpolated directly
            into the SQL text.

    Returns:
        pandas.DataFrame with the five grouping columns, the summed
        ``duration`` and ``n_rank``.
    """
    ranked_locations_sql = f"""
        select 
            *,
            row_number() over(partition by yyyymmdd,quarter_hour,captain_id,servicedetailid order by duration desc ) n_rank
        from 
            (
            select
                yyyymmdd,
                quarter_hour,
                captain_id,
                servicedetailid,
                location,
                sum(duration) duration
            from
                hive.datasets.supplycursory_history
            where 
                yyyymmdd = '{yyyymmdd_from}'
                and quarter_hour in ('0900','0915','0930','0945', '1300')
                and description = 'ready_for_ride'
                group by 1,2,3,4,5
            )

    """

    return pd.read_sql(ranked_locations_sql, connection)


In [21]:
idle_captains_df = get_idle_captains(yyyymmdd_from)

In [22]:
idle_captains_df.head()

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank
0,20230403,900,5737d117ddbec222694efcaf,57370b61a6855d70057417d1,8861892543fffff,5,1
1,20230403,900,596513171554bc8337f82ec8,5ef2bc617b247cda76b202b0,886015ca4dfffff,896,1
2,20230403,900,5a2b6de950e9545d89a53d63,57370b61a6855d70057417d1,88618924e9fffff,18,1
3,20230403,900,5ad4461490d86a6ac857fbc8,5e07383b8222e14123103ca8,8860a2591dfffff,825,1
4,20230403,900,5b1e2ad29503a34a43db9f5d,5a6b238b5cecc70e41e35ac9,883c9301d9fffff,75,1


In [23]:
idle_captains_df[
    (idle_captains_df['captain_id'] == '573f290e9b0ffc2836776128') &
    (idle_captains_df['servicedetailid'] == '57370b61a6855d70057417d1')
]


Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank
464151,20230403,1300,573f290e9b0ffc2836776128,57370b61a6855d70057417d1,88618924bdfffff,121,1
787310,20230403,900,573f290e9b0ffc2836776128,57370b61a6855d70057417d1,88618925c1fffff,7,1
787311,20230403,900,573f290e9b0ffc2836776128,57370b61a6855d70057417d1,88618925c3fffff,4,2


In [24]:
idle_captains_df[(idle_captains_df['captain_id']=='621c4fc0c1a4e48d6f5fb512') &
                (idle_captains_df['servicedetailid']=='5f6f201a61f57a0007d2aac4') & 
                (idle_captains_df['quarter_hour']=='0945')
                ]

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank
244104,20230403,945,621c4fc0c1a4e48d6f5fb512,5f6f201a61f57a0007d2aac4,88618c4de1fffff,98,1
244105,20230403,945,621c4fc0c1a4e48d6f5fb512,5f6f201a61f57a0007d2aac4,88618c4de3fffff,57,2
244106,20230403,945,621c4fc0c1a4e48d6f5fb512,5f6f201a61f57a0007d2aac4,88618c4d13fffff,57,3
244107,20230403,945,621c4fc0c1a4e48d6f5fb512,5f6f201a61f57a0007d2aac4,88618c4de9fffff,45,4


In [25]:
idle_captains_df[idle_captains_df['location']=='883c138c91fffff']

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank
504,20230403,915,6177b20e4c6ba117cfdf5ec6,5f6f21fa59acf500076182b1,883c138c91fffff,892,1
20567,20230403,1300,5c8cdda15e042733c9cc218b,5dd3c33e5421741d9022ce3b,883c138c91fffff,722,1
26963,20230403,915,60e396fd09aa996cef409b51,5ab38dd30b74f15b1984ee09,883c138c91fffff,10,2
133325,20230403,900,62cc25ef5e405364ffc9cdba,5ab38dd30b74f15b1984ee09,883c138c91fffff,900,1
172835,20230403,900,6325fff80c010177d70a818f,5ab38dd30b74f15b1984ee09,883c138c91fffff,369,1
174623,20230403,930,635a1391ce611c8d49fcc51e,5f6f21fa59acf500076182b1,883c138c91fffff,445,1
183244,20230403,915,5e2a7fa6f5eca226ad2e4ef3,5ab38dd30b74f15b1984ee09,883c138c91fffff,67,1
230098,20230403,1300,62b3e7d07d796753b8f0d327,5f6341362049716607ba5903,883c138c91fffff,819,1
247499,20230403,900,5faaa7ecdf5fb96bd76d0b94,5f6f21fa59acf500076182b1,883c138c91fffff,900,1
305085,20230403,1300,62b3e7d07d796753b8f0d327,5ab38dd30b74f15b1984ee09,883c138c91fffff,819,1


In [26]:
# Attach service and city attributes to the ranked idle rows. The left join
# keeps rows whose servicedetailid has no match in the mapping table.
idle_duration_city_service_df = pd.merge(
    idle_captains_df,
    service_mapping_df[
        ['service_detail_id', 'service_id', 'city', 'service_name', 'service_category']
    ],
    how='left',
    left_on='servicedetailid',
    right_on='service_detail_id',
)

# Bucket every row's summed duration.
idle_duration_city_service_df['duration_bucket'] = (
    idle_duration_city_service_df['duration'].apply(get_duration_bucket)
)


In [27]:
idle_duration_city_service_df.head()


Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank,service_detail_id,service_id,city,service_name,service_category,duration_bucket
0,20230403,900,5737d117ddbec222694efcaf,57370b61a6855d70057417d1,8861892543fffff,5,1,57370b61a6855d70057417d1,572e29b0116b5db3057bd821,bangalore,link,link,0.5
1,20230403,900,596513171554bc8337f82ec8,5ef2bc617b247cda76b202b0,886015ca4dfffff,896,1,5ef2bc617b247cda76b202b0,5bd6c6e2e79cc313a94728d0,mysore,auto,auto,14.0
2,20230403,900,5a2b6de950e9545d89a53d63,57370b61a6855d70057417d1,88618924e9fffff,18,1,57370b61a6855d70057417d1,572e29b0116b5db3057bd821,bangalore,link,link,0.5
3,20230403,900,5ad4461490d86a6ac857fbc8,5e07383b8222e14123103ca8,8860a2591dfffff,825,1,5e07383b8222e14123103ca8,5dbfee2121800750396f3892,hyderabad,rebel foods,delivery,13.0
4,20230403,900,5b1e2ad29503a34a43db9f5d,5a6b238b5cecc70e41e35ac9,883c9301d9fffff,75,1,5a6b238b5cecc70e41e35ac9,572e29b0116b5db3057bd821,vishakapatnam,link,link,1.0


In [28]:
# Keep only the top-ranked location row of each
# (yyyymmdd, quarter_hour, captain_id, servicedetailid) partition.
idle_duration_city_service_rank_1_df = idle_duration_city_service_df.loc[
    idle_duration_city_service_df['n_rank'].eq(1)
]

# Rows and distinct captains per (day, category, quarter hour, bucket).
idle_duration_service_sumry_df = (
    idle_duration_city_service_rank_1_df
    .groupby(['yyyymmdd', 'service_category', 'quarter_hour', 'duration_bucket'])['captain_id']
    .agg(records='count', captain_count_dura_buck='nunique')
    .reset_index()
)

# Rows and distinct captains per (day, category, quarter hour): denominators.
idle_duration_sumry_df = (
    idle_duration_city_service_rank_1_df
    .groupby(['yyyymmdd', 'service_category', 'quarter_hour'])['captain_id']
    .agg(total='count', captain_count_qh='nunique')
    .reset_index()
)

# Percentage share of each duration bucket within its category and quarter hour.
idle_duration_summ = (
    pd.merge(
        idle_duration_service_sumry_df,
        idle_duration_sumry_df,
        on=['yyyymmdd', 'service_category', 'quarter_hour'],
    )
    .assign(
        **{'#distribution': lambda merged: merged['records'].mul(100).div(merged['total'])}
    )
)
# idle_duration_summ['#distribution_captain'] = idle_duration_summ['captain_count_dura_buck'] * \
#     100/idle_duration_summ['captain_count_qh']

idle_duration_summ.loc[idle_duration_summ['service_category'].eq('link')]


Unnamed: 0,yyyymmdd,service_category,quarter_hour,duration_bucket,records,captain_count_dura_buck,total,captain_count_qh,#distribution,#distribution_captain
255,20230403,link,0900,0.5,7237,7226,34729,34678,20.838492,20.837419
256,20230403,link,0900,0.9,2534,2530,34729,34678,7.296496,7.295692
257,20230403,link,0900,1.0,3286,3278,34729,34678,9.461833,9.452679
258,20230403,link,0900,2.0,2500,2497,34729,34678,7.198595,7.200531
259,20230403,link,0900,3.0,2060,2058,34729,34678,5.931642,5.934598
...,...,...,...,...,...,...,...,...,...,...
335,20230403,link,1300,11.0,883,883,35093,35057,2.516171,2.518755
336,20230403,link,1300,12.0,838,835,35093,35057,2.387941,2.381835
337,20230403,link,1300,13.0,854,853,35093,35057,2.433534,2.433180
338,20230403,link,1300,14.0,2316,2312,35093,35057,6.599607,6.594974


In [40]:
idle_duration_summ.to_clipboard()


In [39]:
# Wide view: one row per (day, duration bucket), one column per
# (service category, quarter hour), copied to the clipboard.
idle_duration_summ.pivot(
    columns=['service_category', 'quarter_hour'],
    index=['yyyymmdd', 'duration_bucket'],
    values='#distribution',
).reset_index().to_clipboard()



In [50]:


# idle_duration_service_sumry_df = (
#     idle_duration_city_service_df
#     .groupby(['yyyymmdd', 'service_category', 'duration_bucket'])
#     .agg(
#         records=('captain_id', 'count'),
#         captain_id_unique=('captain_id', 'count')
#     )
#     .reset_index()
# )

# idle_duration_sumry_df = idle_duration_city_service_df.groupby(['yyyymmdd','service_category']).agg(total = ('captain_id','count'),
#                                                                                   total_unique = ('captain_id','nunique')).reset_index()


# idle_duration_summ =  idle_duration_service_sumry_df.merge(idle_duration_sumry_df,on = ['yyyymmdd','service_category'])

# idle_duration_summ['#distribution'] = idle_duration_summ['records']*100/idle_duration_summ['total']
# idle_duration_summ['#distribution_captain'] = idle_duration_summ['records']*100/idle_duration_summ['total']

# # hex_duration_summary.to_csv('hex_duration_summary_pan.csv',index=False)
# idle_duration_summ[idle_duration_summ['service_category']=='link']


In [88]:
# https://docs.google.com/spreadsheets/d/1V1Tabi7chMonMBc6llh1GXpDZkdHiDDkWT6DobTRfOs/edit#gid=297367350

- If we want to apply a minimum-duration criterion when sourcing captains, is there a reason we pick only the single hex with the highest duration?
- A shortcoming of this approach: if a captain spends less than 1 minute in each of several different hexes within the quarter hour, that captain will be discarded.



In [30]:
idle_duration_city_service_df.query(
    "captain_id =='5e3dc5f46c87f100648eae16'"
)


Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank,service_detail_id,service_id,city,service_name,service_category,duration_bucket
225544,20230403,900,5e3dc5f46c87f100648eae16,5e993fee271f4612f176f2d2,8860b524b1fffff,59,1,5e993fee271f4612f176f2d2,5e8a15fe3c89412b94731fbb,hyderabad,c2c,c2c,0.9
307664,20230403,900,5e3dc5f46c87f100648eae16,5ba4c0076a3e3355b4bc31b0,8860b524b1fffff,59,1,5ba4c0076a3e3355b4bc31b0,58dcea3824565ce21202bca3,hyderabad,swiggy,delivery,0.9
395194,20230403,900,5e3dc5f46c87f100648eae16,574013f14fdf4798208bba26,8860b524b1fffff,59,1,574013f14fdf4798208bba26,572e29b0116b5db3057bd821,hyderabad,link,link,0.9


In [31]:
idle_duration_city_service_rank_1_df.query("duration_bucket <= 3")


Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,n_rank,service_detail_id,service_id,city,service_name,service_category,duration_bucket
0,20230403,0900,5737d117ddbec222694efcaf,57370b61a6855d70057417d1,8861892543fffff,5,1,57370b61a6855d70057417d1,572e29b0116b5db3057bd821,bangalore,link,link,0.5
2,20230403,0900,5a2b6de950e9545d89a53d63,57370b61a6855d70057417d1,88618924e9fffff,18,1,57370b61a6855d70057417d1,572e29b0116b5db3057bd821,bangalore,link,link,0.5
4,20230403,0900,5b1e2ad29503a34a43db9f5d,5a6b238b5cecc70e41e35ac9,883c9301d9fffff,75,1,5a6b238b5cecc70e41e35ac9,572e29b0116b5db3057bd821,vishakapatnam,link,link,1.0
5,20230403,0900,5b84187e841e3005f30f1817,60b4938309c59b993ad673d0,8861892e8dfffff,102,1,60b4938309c59b993ad673d0,60ab3da44abdf2bf93239886,bangalore,dotpe (0-5km),delivery,1.0
7,20230403,0900,5b84dd3b841e3005f30f26a9,5c7cca3b06bf824646cf0c55,883da1184bfffff,110,1,5c7cca3b06bf824646cf0c55,58dcea3824565ce21202bca3,delhi,swiggy,delivery,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
846184,20230403,1300,64174b5820601d26dc318fbc,6295cbb6706d2b398aca8f21,88608b5119fffff,67,1,6295cbb6706d2b398aca8f21,5f02bd38f10826055fe38f55,mumbai,magicpin food,delivery,1.0
846187,20230403,1300,64241ef92a070051c2973b05,574013f14fdf4798208bba26,8860a24869fffff,2,1,574013f14fdf4798208bba26,572e29b0116b5db3057bd821,hyderabad,link,link,0.5
846188,20230403,1300,6424466d1dfee2fbc87bc9ec,57370b61a6855d70057417d1,88618921c3fffff,49,1,57370b61a6855d70057417d1,572e29b0116b5db3057bd821,bangalore,link,link,0.9
846189,20230403,1300,64253c8f39fbd0f387ca5e9c,60b4946309c59b993ad673d2,88618c4195fffff,230,1,60b4946309c59b993ad673d2,60ab3da44abdf2bf93239886,chennai,dotpe (0-5km),delivery,3.0


In [34]:
def get_captain_e2o(start_yyyymmdd, connection):
    """Fetch per-captain e2o category segments for one day (Bike only).

    Reads ``reports.sql_ingestion_captain_online_time_category_view`` for the
    given day with ``service_mode = 'Bike'``. The innermost query computes
    the previous ``e2o_sub_category`` per (yyyymmdd, captain_id) ordered by
    ``hhmm``. The middle query keeps the quarter hours '0900', '0915',
    '0930', '0945' and '1300' and builds ``break_point_cumsum``, a running
    count of sub-category changes over those rows. The outer query
    aggregates per (yyyymmdd, city, captain_id, location, e2o_category,
    e2o_sub_category, quarter_hour, break_point_cumsum).

    Args:
        start_yyyymmdd: Partition date string (e.g. '20230403') that is
            substituted into the query's ``yyyymmdd`` filter.
        connection: DB-API connection handed to ``pd.read_sql_query``.

    Returns:
        pandas.DataFrame with the grouping columns plus start_hhmm,
        end_hhmm, good_ping_count, bad_ping_count and duration.
    """
    query = """
        select
            yyyymmdd,
            city,
            captain_id,
            location,
            e2o_category,
            e2o_sub_category,
            quarter_hour,
            break_point_cumsum,
            min(hhmm) as start_hhmm,
            max(hhmm) as end_hhmm,
            sum(good_ping_count) as good_ping_count,
            sum(bad_ping_count) as bad_ping_count,
            sum(duration) as duration
        from(
            select
                yyyymmdd,
                city,
                captain_id,
                location,
                quarter_hour,
                hhmm,
                service_name,
                e2o_category,
                e2o_sub_category,
                good_ping_count,
                bad_ping_count,
                duration,
                lag_e2o_sub_category,
                sum(case when e2o_sub_category = coalesce(lag_e2o_sub_category, e2o_sub_category) then 0 else 1 end) over(partition by yyyymmdd, captain_id order by hhmm) as break_point_cumsum
            from(
                select
                    yyyymmdd, city, captain_id,
                    location, quarter_hour, hhmm,
                    service_name, e2o_category,
                    e2o_sub_category,
                    good_ping_count, bad_ping_count,
                    duration,
                    lag(e2o_sub_category) over(partition by yyyymmdd, captain_id order by hhmm) as lag_e2o_sub_category
                from reports.sql_ingestion_captain_online_time_category_view
                where
                    yyyymmdd = '{start_yyyymmdd}'
                    and service_mode = 'Bike'
                )
                where quarter_hour in ('0900','0915','0930','0945', '1300')
            )
        group by 1, 2, 3, 4, 5, 6, 7, 8
    """.format(start_yyyymmdd=start_yyyymmdd)
    # The date used to be hard-coded to '20230403' in the .format() call,
    # which silently ignored the caller's argument.

    e2o_df = pd.read_sql_query(query, connection)

    return e2o_df


In [35]:
captain_e2o_df = get_captain_e2o(yyyymmdd_from, connection)

In [36]:
captain_e2o_df.head()

Unnamed: 0,yyyymmdd,city,captain_id,location,e2o_category,e2o_sub_category,quarter_hour,break_point_cumsum,start_hhmm,end_hhmm,good_ping_count,bad_ping_count,duration
0,20230403,Hyderabad,59f87f5706e22a5fe1f81835,8860b52ed3fffff,offline,offline,900,0,900,914,,,15
1,20230403,Hyderabad,59f87f5706e22a5fe1f81835,8860b52ed3fffff,offline,offline,915,0,915,929,,,15
2,20230403,Hyderabad,59f87f5706e22a5fe1f81835,8860b52ed3fffff,offline,offline,930,0,930,944,,,15
3,20230403,Hyderabad,59f87f5706e22a5fe1f81835,8860b52ed3fffff,offline,offline,945,0,945,959,,,15
4,20230403,Hyderabad,59f87f5706e22a5fe1f81835,8860b52ed3fffff,offline,offline,1300,0,1300,1314,,,15


In [47]:
# Top-ranked 'link' rows whose duration bucket is at most 3.
below_3_min_idle_caps_df = idle_duration_city_service_rank_1_df.loc[
    (idle_duration_city_service_rank_1_df['duration_bucket'] <= 3)
    & (idle_duration_city_service_rank_1_df['service_name'] == 'link')
]

# One row per distinct key combination, with its occurrence count.
req_caps_df = below_3_min_idle_caps_df.value_counts(
    subset=[
        'yyyymmdd', 'quarter_hour',
        'captain_id', 'servicedetailid',
        'service_name', 'city', 'duration_bucket',
    ]
).reset_index()

req_caps_df.head()


Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,service_name,city,duration_bucket,0
0,20230403,900,5737d117ddbec222694efcaf,57370b61a6855d70057417d1,link,bangalore,0.5,1
1,20230403,945,5de78c4239cbc02e909863b8,574013f14fdf4798208bba26,link,hyderabad,0.5,1
2,20230403,945,5e4b7ef39df2ac339808602d,574013f14fdf4798208bba26,link,hyderabad,1.0,1
3,20230403,945,5e4b789fe6eea6a9e3956ea0,5da4660028af187d8d52cc3c,link,delhi,0.5,1
4,20230403,945,5e4b58207ea42530096ca870,5da4660028af187d8d52cc3c,link,delhi,0.9,1


In [56]:
req_caps_df.groupby(['captain_id']).agg(count_val=('quarter_hour', 'count')).sum()


count_val    90128
dtype: int64

In [50]:
e2o_cols_list = [
    'yyyymmdd', 'captain_id', 'quarter_hour',
    'location', 'e2o_category', 'e2o_sub_category',
    'start_hhmm', 'end_hhmm', 'duration'
]


In [84]:
# Total offline duration per (day, captain, quarter hour).
captain_qh_offline_dura_df = (
    captain_e2o_df[captain_e2o_df['e2o_category'] == 'offline']
    .groupby(['yyyymmdd', 'captain_id', 'quarter_hour'])['duration']
    .sum()
    .reset_index(name='offline_duration')
)
captain_qh_offline_dura_df.head()


Unnamed: 0,yyyymmdd,captain_id,quarter_hour,offline_duration
0,20230403,5737c73addbec2203f733635,900,15
1,20230403,5737c73addbec2203f733635,915,15
2,20230403,5737c73addbec2203f733635,930,15
3,20230403,5737c73addbec2203f733635,945,15
4,20230403,5737c73addbec2203f733635,1300,15


In [85]:
# For the short-idle 'link' captains, count distinct captains per
# (day, quarter hour, duration bucket, offline duration). The 'link' bucket
# totals are then attached for comparison.
req_caps_e2o_summary_df = (
    captain_qh_offline_dura_df
    .merge(
        req_caps_df[
            ['yyyymmdd', 'quarter_hour', 'captain_id', 'duration_bucket', 'city']
        ],
        on=['yyyymmdd', 'quarter_hour', 'captain_id'],
    )
    .groupby(['yyyymmdd', 'quarter_hour', 'duration_bucket', 'offline_duration'])['captain_id']
    .nunique()
    .reset_index(name='captain_having_offline_duration_count')
    .merge(
        idle_duration_service_sumry_df.query("service_category == 'link'"),
        on=['yyyymmdd', 'quarter_hour', 'duration_bucket'],
    )
)
req_caps_e2o_summary_df.head()


Unnamed: 0,yyyymmdd,quarter_hour,duration_bucket,offline_duration,captain_having_offline_duration_count,service_category,records,captain_count_dura_buck
0,20230403,900,0.5,1,168,link,7237,7226
1,20230403,900,0.5,2,178,link,7237,7226
2,20230403,900,0.5,3,141,link,7237,7226
3,20230403,900,0.5,4,174,link,7237,7226
4,20230403,900,0.5,5,174,link,7237,7226


In [76]:
# req_caps_e2o_summary_df[
#     (   
#         (req_caps_e2o_summary_df['duration_bucket' <= 0.9])
#         & (req_caps_e2o_summary_df['offline_duration' <= 14])
#     )
#     |
#     (
#         (req_caps_e2o_summary_df['duration_bucket' == 1])
#         & (req_caps_e2o_summary_df['offline_duration' <= 13])
#     )
#     |
#     (
#         (req_caps_e2o_summary_df['duration_bucket' == 2])
#         & (req_caps_e2o_summary_df['offline_duration' <= 12])
#     )
# ]


Unnamed: 0,yyyymmdd,service_category,quarter_hour,duration_bucket,records,captain_count_dura_buck
0,20230403,auto,900,0.5,4117,3213
1,20230403,auto,900,0.9,2204,1845
2,20230403,auto,900,1.0,3232,2832
3,20230403,auto,900,2.0,2728,2456
4,20230403,auto,900,3.0,2444,2229


In [80]:
req_caps_e2o_summary_df.to_clipboard()
