### Import libraries 

In [None]:
import os
import warnings
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
from h3 import h3
from IPython.core.interactiveshell import InteractiveShell
from pyhive import presto

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Silence all Python warnings for the whole session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Presto connection shared by every query in this notebook.
# Host and username can be overridden with the PRESTO_HOST / PRESTO_USERNAME
# environment variables so the notebook can be run by another user without
# editing this cell; when they are unset the original values are used.
connection = presto.connect(
        host=os.environ.get('PRESTO_HOST', 'presto-gateway.serving.data.production.internal'),
        port=80,
        protocol='http',
        catalog='hive',
        username=os.environ.get('PRESTO_USERNAME', 'manoj.ravirajan@rapido.bike'),
    )

In [3]:
# Date range parameter
# Dates are 'YYYYMMDD' strings. `yyyymmdd_from` is the value passed to
# get_captian_duration() further down.
# NOTE(review): `yyyymmdd_to` is not referenced by any cell visible in this
# notebook - confirm whether a multi-day range was intended.

yyyymmdd_from = '20230508'
yyyymmdd_to   = '20230508'

## Reading dataset service_mapping

In [4]:
def get_service_mapping():
    """Load the service mapping rows for the 'Auto' service level.

    Runs a fixed query against hive.datasets.service_mapping through the
    notebook-level ``connection`` and returns the result as a DataFrame with
    the columns service_detail_id, service_name, service_category,
    service_id, city and city_id.
    """
    # Only rows with service_level = 'Auto' are selected.
    auto_mapping_sql = """
    SELECT service_detail_id,
            service_level as service_name,
            service_category,
            service_id,
            city_display_name as city,
            city_id
    FROM hive.datasets.service_mapping
    WHERE service_level = 'Auto'
    """
    return pd.read_sql(auto_mapping_sql, connection)

# Load the Auto mapping once; later cells join captain durations against it.
service_mapping_df = get_service_mapping()

DatabaseError: Execution failed on sql: 
    SELECT service_detail_id,
            service_level as service_name,
            service_category,
            service_id,
            city_display_name as city,
            city_id
    FROM hive.datasets.service_mapping
    WHERE service_level = 'Auto'
    
HTTPConnectionPool(host='presto-gateway.serving.data.production.internal', port=80): Max retries exceeded with url: /v1/statement (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8b45af18d0>: Failed to establish a new connection: [Errno 60] Operation timed out'))
unable to rollback

In [5]:
# Preview the first rows of the mapping.
service_mapping_df.head()

Unnamed: 0,service_detail_id,service_name,service_category,service_id,city,city_id
0,5f73564ba90a3e0007c034aa,Auto,auto,5bd6c6e2e79cc313a94728d0,Chandigarh,5bc5adaac07a7c2eaff6897c
1,5ef2bc617b247cda76b202b0,Auto,auto,5bd6c6e2e79cc313a94728d0,Mysore,58bd0560b866a73f64d89dff
2,62974b2659bcb27dda93ebb2,Auto,auto,5bd6c6e2e79cc313a94728d0,Gwalior,5c45a4aae0dfdf79cd6b6f86
3,62c423de91efab02c4b4a727,Auto,auto,5bd6c6e2e79cc313a94728d0,Dehradun,5c32f0b6f0c2c508a5be30cd
4,62beb554204efbe7d5148d61,Auto,auto,5bd6c6e2e79cc313a94728d0,Warangal,5c134f0c7a6187300d0c0f9d


In [6]:
# Look up the Hyderabad row; its service_detail_id is the one used in the sample filter further down.
service_mapping_df.query("city == 'Hyderabad'")

Unnamed: 0,service_detail_id,service_name,service_category,service_id,city,city_id
10,5ef2bc5b85846b775f97d170,Auto,auto,5bd6c6e2e79cc313a94728d0,Hyderabad,5740135d4fdf4798208bba24


In [7]:
# Uniqueness check: non-null count divided by distinct count. A result of 1.0
# means no service_detail_id repeats, so the later merge on this key cannot
# duplicate rows.
service_mapping_df.service_detail_id.count()/service_mapping_df.service_detail_id.nunique()

1.0

In [8]:
# Echo the configured date before it is passed to the duration query.
yyyymmdd_from

'20230508'

In [9]:
def _require_digits(value, width, label):
    """Return ``str(value)`` if it is exactly ``width`` ASCII digits, else raise ValueError."""
    text = str(value)
    if len(text) != width or any(ch not in '0123456789' for ch in text):
        raise ValueError(f"{label} must be exactly {width} digits, got {value!r}")
    return text


def get_captian_duration(
    yyyymmdd_from,
    quarter_hours=('0900', '0915', '0930', '0945', '1300', '1315', '1330', '1345'),
):
    """Fetch summed captain durations per quarter hour and location for one day.

    Queries hive.datasets.supplycursory_history through the notebook-level
    ``connection`` for rows with ``status = 2`` on the given day, restricted to
    the requested quarter-hour slots, and sums ``duration`` per
    (yyyymmdd, quarter_hour, captain_id, servicedetailid, location).

    Parameters
    ----------
    yyyymmdd_from : str or int
        Day to query, as eight digits (e.g. '20230508').
    quarter_hours : iterable of str, optional
        Quarter-hour slots as four-digit strings. Defaults to the morning
        (0900-0945) and afternoon (1300-1345) slots the notebook analyses.
        A single string is treated as one slot.

    Returns
    -------
    pandas.DataFrame
        Columns: yyyymmdd, quarter_hour, captain_id, servicedetailid,
        location, duration.

    Raises
    ------
    ValueError
        If the date is not eight digits, a slot is not four digits, or no
        slot is given. Both values are spliced into the SQL text, so they are
        restricted to digits before the query is built.
    """
    day = _require_digits(yyyymmdd_from, 8, "yyyymmdd_from")

    if isinstance(quarter_hours, str):
        quarter_hours = [quarter_hours]
    slots = [_require_digits(slot, 4, "quarter_hour") for slot in quarter_hours]
    if not slots:
        # An empty IN () list is not valid SQL; fail before hitting the server.
        raise ValueError("quarter_hours must contain at least one slot")
    quarter_hour_list = ", ".join(f"'{slot}'" for slot in slots)

    # NOTE(review): status = 2 is taken as-is from the original query; the
    # commented-out `description` predicate suggests it corresponds to
    # 'ready_for_ride' - confirm against the table definition.
    hex_duration_query = f"""
        select
            yyyymmdd,
            quarter_hour,
            captain_id,
            servicedetailid,
            location,
            sum(duration) duration
        from
            hive.datasets.supplycursory_history
        where 
            yyyymmdd = '{day}'
            -- and description = 'ready_for_ride'
            and status = 2
            and quarter_hour in ({quarter_hour_list})
        group by
            yyyymmdd, quarter_hour, captain_id, servicedetailid, location
    """

    return pd.read_sql(hex_duration_query, connection)

In [10]:
# Pull the per-captain durations for the configured day (all cities; no city filter is applied here).
hex_duration_pan = get_captian_duration(yyyymmdd_from)

In [11]:
# Spot check: rows for one location, one quarter hour and one service_detail_id.
hex_duration_pan.loc[
    hex_duration_pan['location'].eq('8860b19719fffff')
    & hex_duration_pan['quarter_hour'].eq('0930')
    & hex_duration_pan['servicedetailid'].eq('5ef2bc5b85846b775f97d170')
]

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration
73709,20230508,930,607a679541fc435367500139,5ef2bc5b85846b775f97d170,8860b19719fffff,53
508788,20230508,930,5c7c0166875c9853625fd083,5ef2bc5b85846b775f97d170,8860b19719fffff,12
837822,20230508,930,5e16e83e2f1e6d5f59977dbd,5ef2bc5b85846b775f97d170,8860b19719fffff,411


In [12]:
# Attach service and city attributes to every duration row.
# This is a left join, so rows whose servicedetailid is not in the Auto
# mapping are kept with NaN in the mapping columns.
hex_duration_df1 = pd.merge(
    hex_duration_pan,
    service_mapping_df[
        ['service_detail_id', 'service_id', 'city', 'service_name', 'service_category']
    ],
    how='left',
    left_on='servicedetailid',
    right_on='service_detail_id',
)

hex_duration_df1.head()

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,service_detail_id,service_id,city,service_name,service_category
0,20230508,930,61b1ca3772f1dd603491d39a,60fed3a85d74ca03f9e71ef2,88608b0b3bfffff,31,60fed3a85d74ca03f9e71ef2,5bd6c6e2e79cc313a94728d0,Mumbai,Auto,auto
1,20230508,930,61c49d1b17dd6a4d1f1c066d,60fed3a85d74ca03f9e71ef2,88608b0969fffff,45,60fed3a85d74ca03f9e71ef2,5bd6c6e2e79cc313a94728d0,Mumbai,Auto,auto
2,20230508,930,620cd38657d2b22184b0f9f5,60fed3a85d74ca03f9e71ef2,88608b5721fffff,594,60fed3a85d74ca03f9e71ef2,5bd6c6e2e79cc313a94728d0,Mumbai,Auto,auto
3,20230508,930,631bd7eebc0773fec3c4e59a,5da4660028af187d8d52cc3c,883da1062bfffff,41,,,,,
4,20230508,930,5e5e0c91bb61de2e6fb4f2f7,5da4660028af187d8d52cc3c,883da111abfffff,236,,,,,


In [13]:
def get_duration_bucket(duration):
    """Map a duration to its distribution bucket.

    Returns 0.5 for durations in (0, 30], 0.9 for durations in (30, 60), and
    ``floor(duration / 60)`` for everything else (zero, negatives and values
    of 60 or more).
    NOTE(review): the thresholds suggest seconds bucketed into whole minutes - confirm the unit.
    """
    if 0 < duration <= 30:
        return 0.5
    if 30 < duration < 60:
        return .9
    return np.floor(duration / 60)

In [14]:
# Bucket each row's duration; the function is passed directly, no lambda wrapper needed.
hex_duration_df1['duration_bucket'] = hex_duration_df1['duration'].apply(get_duration_bucket)
hex_duration_df1.head()

Unnamed: 0,yyyymmdd,quarter_hour,captain_id,servicedetailid,location,duration,service_detail_id,service_id,city,service_name,service_category,duration_bucket
0,20230508,930,61b1ca3772f1dd603491d39a,60fed3a85d74ca03f9e71ef2,88608b0b3bfffff,31,60fed3a85d74ca03f9e71ef2,5bd6c6e2e79cc313a94728d0,Mumbai,Auto,auto,0.9
1,20230508,930,61c49d1b17dd6a4d1f1c066d,60fed3a85d74ca03f9e71ef2,88608b0969fffff,45,60fed3a85d74ca03f9e71ef2,5bd6c6e2e79cc313a94728d0,Mumbai,Auto,auto,0.9
2,20230508,930,620cd38657d2b22184b0f9f5,60fed3a85d74ca03f9e71ef2,88608b5721fffff,594,60fed3a85d74ca03f9e71ef2,5bd6c6e2e79cc313a94728d0,Mumbai,Auto,auto,9.0
3,20230508,930,631bd7eebc0773fec3c4e59a,5da4660028af187d8d52cc3c,883da1062bfffff,41,,,,,,0.9
4,20230508,930,5e5e0c91bb61de2e6fb4f2f7,5da4660028af187d8d52cc3c,883da111abfffff,236,,,,,,3.0


## PAN India

#### Without Quarter hour

In [15]:
# Rows per (day, service category, duration bucket).
hex_duration_summ1 = (
    hex_duration_df1
    .groupby(['yyyymmdd', 'service_category', 'duration_bucket'])['captain_id']
    .count()
    .reset_index(name='records')
)

# Rows per (day, service category): the denominator for the share below.
hex_duration_summ2 = (
    hex_duration_df1
    .groupby(['yyyymmdd', 'service_category'])['captain_id']
    .count()
    .reset_index(name='total')
)

hex_duration_summary = pd.merge(
    hex_duration_summ1,
    hex_duration_summ2,
    on=['yyyymmdd', 'service_category'],
)

# Percentage of the category's rows that fall in each bucket.
hex_duration_summary['#distribution'] = (
    hex_duration_summary['records'].mul(100).div(hex_duration_summary['total'])
)

# Optional export, left disabled:
# hex_duration_summary.to_csv('hex_duration_summary_pan.csv',index=False)
hex_duration_summary.head()

Unnamed: 0,yyyymmdd,service_category,duration_bucket,records,total,#distribution
0,20230508,auto,0.5,70761,477130,14.830549
1,20230508,auto,0.9,47866,477130,10.032067
2,20230508,auto,1.0,50911,477130,10.670258
3,20230508,auto,2.0,36820,477130,7.716974
4,20230508,auto,3.0,28574,477130,5.988724


In [16]:
# Buckets as rows, service categories as columns.
# pivot needs each (duration_bucket, service_category) pair to be unique,
# which holds while the summary covers a single yyyymmdd.
hex_duration_summary.pivot(
    index='duration_bucket',
    columns='service_category',
    values=['#distribution'],
)

Unnamed: 0_level_0,#distribution
service_category,auto
duration_bucket,Unnamed: 1_level_2
0.5,14.830549
0.9,10.032067
1.0,10.670258
2.0,7.716974
3.0,5.988724
4.0,4.86555
5.0,4.096368
6.0,3.420451
7.0,3.009662
8.0,2.660281


#### With Quarter hour

In [17]:
# Rows per (day, service category, quarter hour, duration bucket).
hex_duration_summ11 = (
    hex_duration_df1
    .groupby(['yyyymmdd', 'service_category', 'quarter_hour', 'duration_bucket'])['captain_id']
    .count()
    .reset_index(name='records')
)

# Rows per (day, service category, quarter hour): the denominator.
hex_duration_summ22 = (
    hex_duration_df1
    .groupby(['yyyymmdd', 'service_category', 'quarter_hour'])['captain_id']
    .count()
    .reset_index(name='total')
)

hex_duration_summaryy = pd.merge(
    hex_duration_summ11,
    hex_duration_summ22,
    on=['yyyymmdd', 'service_category', 'quarter_hour'],
)

# Percentage of each quarter hour's rows that fall in each bucket.
hex_duration_summaryy['#distribution'] = (
    hex_duration_summaryy['records'].mul(100).div(hex_duration_summaryy['total'])
)

hex_duration_summaryy.head()

Unnamed: 0,yyyymmdd,service_category,quarter_hour,duration_bucket,records,total,#distribution
0,20230508,auto,900,0.5,9881,56071,17.6223
1,20230508,auto,900,0.9,6205,56071,11.066327
2,20230508,auto,900,1.0,6355,56071,11.333845
3,20230508,auto,900,2.0,4327,56071,7.717002
4,20230508,auto,900,3.0,3371,56071,6.01202


In [18]:
# Buckets as rows, quarter hours as columns.
# pivot needs each (duration_bucket, quarter_hour) pair to be unique, which
# holds while the summary covers one yyyymmdd and one service_category.
hex_duration_summaryy.pivot(
    index='duration_bucket',
    columns='quarter_hour',
    values=['#distribution'],
)

Unnamed: 0_level_0,#distribution,#distribution,#distribution,#distribution,#distribution,#distribution,#distribution,#distribution
quarter_hour,0900,0915,0930,0945,1300,1315,1330,1345
duration_bucket,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.5,17.6223,18.346643,18.589404,18.020738,11.746664,11.626632,11.673671,11.727198
0.9,11.066327,11.010092,11.524475,11.09747,8.567318,8.961793,9.008653,9.286281
1.0,11.333845,11.540149,11.669598,11.799253,9.438437,9.50669,10.149155,10.136469
2.0,7.717002,7.99649,8.065425,8.258814,7.182382,7.343221,7.36712,7.872674
3.0,6.01202,5.995612,6.091752,6.30112,5.786378,5.900371,5.908529,5.931074
4.0,4.649462,4.889864,4.947841,4.960597,4.731866,5.000806,4.99382,4.738449
5.0,4.094808,3.903466,3.797101,4.056408,4.225953,4.186684,4.187886,4.293112
6.0,3.426013,3.240018,3.349781,3.361261,3.463922,3.451556,3.566543,3.490157
7.0,3.217349,2.750329,2.759045,2.992949,3.043382,3.077543,3.13803,3.088679
8.0,2.473649,2.550241,2.458555,2.453754,2.81414,2.895373,2.829831,2.768172


## Cities - Hyderabad

In [19]:
# City filter parameters for the per-city breakdown.
cities = ['Hyderabad']
# NOTE(review): `service` is not used by any cell visible in this notebook.
service = ['Auto']

# Keep only the rows whose mapped city is in `cities`.
hex_duration_cities = hex_duration_df1.loc[hex_duration_df1['city'].isin(cities)]

#### Without Quarter hour

In [20]:
# Rows per (day, city, service category, duration bucket).
hex_duration_summ10 = (
    hex_duration_cities
    .groupby(['yyyymmdd', 'city', 'service_category', 'duration_bucket'])['captain_id']
    .count()
    .reset_index(name='records')
)

# Rows per (day, city, service category): the denominator.
hex_duration_summ20 = (
    hex_duration_cities
    .groupby(['yyyymmdd', 'city', 'service_category'])['captain_id']
    .count()
    .reset_index(name='total')
)

hex_duration_summary1 = pd.merge(
    hex_duration_summ10,
    hex_duration_summ20,
    on=['yyyymmdd', 'city', 'service_category'],
)

# Percentage of the city's rows that fall in each bucket.
hex_duration_summary1['#distribution'] = (
    hex_duration_summary1['records'].mul(100).div(hex_duration_summary1['total'])
)

hex_duration_summary1.head()

Unnamed: 0,yyyymmdd,city,service_category,duration_bucket,records,total,#distribution
0,20230508,Hyderabad,auto,0.5,19944,68674,29.041559
1,20230508,Hyderabad,auto,0.9,9893,68674,14.405743
2,20230508,Hyderabad,auto,1.0,9860,68674,14.35769
3,20230508,Hyderabad,auto,2.0,6079,68674,8.851967
4,20230508,Hyderabad,auto,3.0,4382,68674,6.380872


In [21]:
# Buckets as rows, service categories as columns, for the selected cities.
# pivot needs each (duration_bucket, service_category) pair to be unique,
# which holds while the summary covers one yyyymmdd and one city.
hex_duration_summary1.pivot(
    index='duration_bucket',
    columns='service_category',
    values=['#distribution'],
)

Unnamed: 0_level_0,#distribution
service_category,auto
duration_bucket,Unnamed: 1_level_2
0.5,29.041559
0.9,14.405743
1.0,14.35769
2.0,8.851967
3.0,6.380872
4.0,4.607275
5.0,3.564668
6.0,2.839503
7.0,2.203163
8.0,1.855142


#### With Quarter hour

In [22]:
# Rows per (day, city, service category, quarter hour, duration bucket).
hex_duration_summ101 = (
    hex_duration_cities
    .groupby(['yyyymmdd', 'city', 'service_category', 'quarter_hour', 'duration_bucket'])['captain_id']
    .count()
    .reset_index(name='records')
)

# Rows per (day, city, service category, quarter hour): the denominator.
hex_duration_summ202 = (
    hex_duration_cities
    .groupby(['yyyymmdd', 'city', 'service_category', 'quarter_hour'])['captain_id']
    .count()
    .reset_index(name='total')
)

hex_duration_summary11 = pd.merge(
    hex_duration_summ101,
    hex_duration_summ202,
    on=['yyyymmdd', 'city', 'service_category', 'quarter_hour'],
)

# Percentage of each quarter hour's rows that fall in each bucket.
hex_duration_summary11['#distribution'] = (
    hex_duration_summary11['records'].mul(100).div(hex_duration_summary11['total'])
)

hex_duration_summary11.head()

Unnamed: 0,yyyymmdd,city,service_category,quarter_hour,duration_bucket,records,total,#distribution
0,20230508,Hyderabad,auto,900,0.5,2338,9395,24.885577
1,20230508,Hyderabad,auto,900,0.9,1354,9395,14.411921
2,20230508,Hyderabad,auto,900,1.0,1370,9395,14.582225
3,20230508,Hyderabad,auto,900,2.0,857,9395,9.121873
4,20230508,Hyderabad,auto,900,3.0,648,9395,6.897286


In [23]:
# Buckets as rows, quarter hours as columns, for the selected cities.
# pivot needs each (duration_bucket, quarter_hour) pair to be unique, which
# holds while the summary covers one yyyymmdd, one city and one service_category.
hex_duration_summary11.pivot(
    index='duration_bucket',
    columns='quarter_hour',
    values=['#distribution'],
)

Unnamed: 0_level_0,#distribution,#distribution,#distribution,#distribution,#distribution,#distribution,#distribution,#distribution
quarter_hour,0900,0915,0930,0945,1300,1315,1330,1345
duration_bucket,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.5,24.885577,26.76072,27.01953,26.885922,33.226994,31.326375,31.664813,32.105594
0.9,14.411921,13.90802,14.414115,14.544665,14.171779,14.498473,14.324558,15.021999
1.0,14.582225,14.663408,14.4585,14.425269,14.04908,13.836558,15.12792,13.601508
2.0,9.121873,8.92024,9.098979,8.943884,8.110429,8.655804,8.602151,9.277184
3.0,6.897286,6.654077,6.757656,6.610225,5.96319,6.300916,5.981955,5.681961
4.0,5.013305,5.254388,4.582779,4.710735,3.631902,4.620672,4.399951,4.500314
5.0,3.874401,3.52144,3.750555,3.820688,3.312883,3.538697,3.299963,3.293526
6.0,3.235764,3.065985,2.896138,2.898079,2.588957,2.647658,2.422445,2.853551
7.0,2.362959,2.28838,2.374612,2.680994,2.03681,1.960285,1.903349,1.885607
8.0,2.075572,1.977338,1.986241,1.758385,1.558282,1.807536,1.792115,1.835324
