In [1]:
## Import libraries

import warnings
import pandas as pd
import numpy as np

from pyhive import presto
from h3 import h3
from IPython.core.interactiveshell import InteractiveShell
from datetime import date,datetime,timedelta

InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [3]:
## Connection

connection = presto.connect(
        host='presto-gateway.serving.data.production.internal',
        port=80,
        protocol='http',
        catalog='hive',
        username='manoj.ravirajan@rapido.bike',
    )

In [4]:
## Filter

yyyymmdd_from = '20230508'
yyyymmdd_to = '20230512'

In [5]:
## datasets.supplycursory_history

def get_captian_duration(yyyymmdd_from):
    
    hex_duration_qery = f"""
        SELECT
            *
        FROM
        (
            SELECT
                *,
                row_number() OVER (PARTITION BY yyyymmdd,captain_id,quarter_hour ORDER BY idle_duration DESC) AS max_idle_duration
            FROM
            (
                SELECT
                    yyyymmdd,
                    captain_id,
                    hex_id,
                    quarter_hour,
                    SUM(CASE WHEN status = 'idle' THEN duration ELSE 0 END) AS idle_duration,
                    SUM(CASE WHEN status = 'diff_status' THEN duration ELSE 0 END) AS others_duration,
                    SUM(CASE WHEN status = 'offline' THEN duration ELSE 0 END) AS offline_duration,
                    MAX(CASE WHEN rnk=1 THEN status END) AS last_status
                    --COUNT(hex_id) OVER(PARTITION BY yyyymmdd,captain_id,quarter_hour) AS total_hex
                FROM
                (
                    SELECT 
                        *, 
                        row_number() OVER (PARTITION BY yyyymmdd,captain_id,hex_id,quarter_hour ORDER BY hhmm DESC) AS rnk
                    FROM
                    (
                    SELECT
                        yyyymmdd,
                        captain_id,
                        location AS hex_id,
                        quarter_hour,
                        CASE 
                            WHEN status IN (4,5,9) THEN 'offline' 
                            WHEN status IN (2) THEN 'idle' 
                            ELSE 'in_order' 
                        END AS status,
                        hhmm,
                        SUM(duration) AS duration
                    FROM
                        hive.datasets.supplycursory_history
                    WHERE 
                        yyyymmdd >= '{yyyymmdd_from}'
                        AND yyyymmdd <= '{yyyymmdd_to}'
                        AND servicedetailid = '5ef2bc5b85846b775f97d170'
                        AND substr(quarter_hour,1,2)  not in ('22','23','00','01','02','03','04','05')
                    GROUP BY 1,2,3,4,5,6
                    )
                    ORDER BY 1,2,3,4,5,6
                )
                GROUP BY 1,2,3,4
                ORDER BY 1,2,3,4
            )
        )
        WHERE 
            idle_duration > 0
            AND max_idle_duration = 1
    """

    hex_duration_df = pd.read_sql(hex_duration_qery, connection)
    
    return hex_duration_df

In [6]:
df_supplycursory_history_backup = get_captian_duration(yyyymmdd_from)
df_supplycursory_history = df_supplycursory_history_backup

DatabaseError: Execution failed on sql: 
        SELECT
            *
        FROM
        (
            SELECT
                *,
                row_number() OVER (PARTITION BY yyyymmdd,captain_id,quarter_hour ORDER BY idle_duration DESC) AS max_idle_duration
            FROM
            (
                SELECT
                    yyyymmdd,
                    captain_id,
                    hex_id,
                    quarter_hour,
                    SUM(CASE WHEN status = 'idle' THEN duration ELSE 0 END) AS idle_duration,
                    SUM(CASE WHEN status = 'diff_status' THEN duration ELSE 0 END) AS others_duration,
                    SUM(CASE WHEN status = 'offline' THEN duration ELSE 0 END) AS offline_duration,
                    MAX(CASE WHEN rnk=1 THEN status END) AS last_status
                    --COUNT(hex_id) OVER(PARTITION BY yyyymmdd,captain_id,quarter_hour) AS total_hex
                FROM
                (
                    SELECT 
                        *, 
                        row_number() OVER (PARTITION BY yyyymmdd,captain_id,hex_id,quarter_hour ORDER BY hhmm DESC) AS rnk
                    FROM
                    (
                    SELECT
                        yyyymmdd,
                        captain_id,
                        location AS hex_id,
                        quarter_hour,
                        CASE 
                            WHEN status IN (4,5,9) THEN 'offline' 
                            WHEN status IN (2) THEN 'idle' 
                            ELSE 'in_order' 
                        END AS status,
                        hhmm,
                        SUM(duration) AS duration
                    FROM
                        hive.datasets.supplycursory_history
                    WHERE 
                        yyyymmdd >= '20230508'
                        AND yyyymmdd <= '20230512'
                        AND servicedetailid = '5ef2bc5b85846b775f97d170'
                        AND substr(quarter_hour,1,2)  not in ('22','23','00','01','02','03','04','05')
                    GROUP BY 1,2,3,4,5,6
                    )
                    ORDER BY 1,2,3,4,5,6
                )
                GROUP BY 1,2,3,4
                ORDER BY 1,2,3,4
            )
        )
        WHERE 
            idle_duration > 0
            AND max_idle_duration = 1
    
HTTPConnectionPool(host='presto-gateway.serving.data.production.internal', port=80): Max retries exceeded with url: /v1/statement (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fdace2cde40>, 'Connection to presto-gateway.serving.data.production.internal timed out. (connect timeout=None)'))
unable to rollback

In [None]:
df_supplycursory_history.head()

In [None]:
### Outliers

df_supplycursory_history['total_duration'] = df_supplycursory_history['idle_duration'] + df_supplycursory_history['others_duration'] +df_supplycursory_history['offline_duration'] 
df_supplycursory_history = df_supplycursory_history[df_supplycursory_history['total_duration'] <= 900]

In [None]:
df_1 = df_supplycursory_history

In [None]:
## AOR data

def get_order_logs_immutable(yyyymmdd_from):
    
    order_logs_immutable = f"""
        WITH order_stats AS (

            SELECT
                yyyymmdd,
                captain_id,
                pickup_location_hex_8 AS hex_id,
                quarter_hour,
                COUNT(DISTINCT(order_id)) AS order_requested,
                COUNT(DISTINCT(CASE WHEN event_type = 'accepted' THEN order_id END)) AS accepted_orders
            FROM
                (    
                SELECT 
                    date_format(date_parse(yyyymmdd, '%Y%m%d'),'%W') AS week_period,
                    yyyymmdd,
                    captain_id,
                    quarter_hour,
                    order_id,
                    city_name,
                    order_status,
                    event_type,
                    pickup_location_hex_8,
                    captain_location_hex_8
                FROM 
                    orders.order_logs_immutable
                WHERE 
                    yyyymmdd >= '{yyyymmdd_from}'
                    AND yyyymmdd <= '{yyyymmdd_to}'
                    AND service_detail_id = '5ef2bc5b85846b775f97d170'
                    AND city_name = 'Hyderabad'
                    AND service_obj_service_name = 'Auto'
                )
            GROUP BY 1,2,3,4
        )

        SELECT * FROM order_stats
    """

    order_logs_immutable_df = pd.read_sql(order_logs_immutable, connection)
    
    return order_logs_immutable_df

In [None]:
df_order_logs_immutable_backup = get_order_logs_immutable(yyyymmdd_from)
df_order_logs_immutable = df_order_logs_immutable_backup

In [None]:
df_order_logs_immutable.head()

In [None]:
## Join Idle time & AOR

df_merge = pd.merge(df_1, df_order_logs_immutable,  how='left', 
                    left_on=['yyyymmdd','captain_id','hex_id','quarter_hour'], 
                    right_on = ['yyyymmdd','captain_id','hex_id','quarter_hour'])
df_merge = df_merge.fillna(0)
df_merge

In [None]:
# df_merge['time_bucket'] = 	np.where(df_merge['quarter_hour'].str[:2].isin(['00','01','02','03','04','05','06','07']) , 'rest_morning', 	
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['08','09','10','11']) , 'morning_peak', 	
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['12','13','14','15','16']) , 'afternoon', 
#                            np.where(df_merge['quarter_hour'].str[:2].isin(['17','18','19','20','21']) , 'evening_peak', 'rest_evening'))))
#

In [None]:
df_2 = df_merge.groupby(['yyyymmdd','quarter_hour']).agg({'captain_id':'count', 'order_requested':'sum', 'accepted_orders':'sum' }).reset_index()
df_2

In [None]:
df_quarter_hour_total_captains = df_1.groupby(['yyyymmdd','quarter_hour']).captain_id.nunique().reset_index(name='all_captains')
df_quarter_hour_total_captains.head()

In [None]:
df_refined_data = pd.merge(df_2, df_quarter_hour_total_captains,  how='left', 
                    left_on=['yyyymmdd', 'quarter_hour'], 
                    right_on = ['yyyymmdd', 'quarter_hour'])
df_refined_data

In [None]:
df_refined_data['distribution'] = 100*df_refined_data['captain_id']/df_refined_data['all_captains']
df_refined_data['aor'] = 100*df_refined_data['accepted_orders']/df_refined_data['order_requested']
df_refined_data.fillna(0)

In [None]:
df_refined_data['time_bucket'] = 	np.where(df_refined_data['quarter_hour'].str[:2].isin(['00','01','02','03','04','05','06','07']) , 'rest_morning', 	
                            np.where(df_refined_data['quarter_hour'].str[:2].isin(['08','09','10','11']) , 'morning_peak', 	
                            np.where(df_refined_data['quarter_hour'].str[:2].isin(['12','13','14','15','16']) , 'afternoon', 
                            np.where(df_refined_data['quarter_hour'].str[:2].isin(['17','18','19','20','21']) , 'evening_peak', 'rest_evening'))))
df_refined_data

In [None]:
df_v1 = df_refined_data.groupby(['yyyymmdd']).agg({'distribution':'mean', 'aor':'mean'}).reset_index()
df_v1

In [None]:
df_v2 = df_refined_data.groupby(['yyyymmdd','time_bucket']).agg({'distribution':'mean', 'aor':'mean'}).reset_index()
df_v2.head()

In [None]:
df_v3 = pd.pivot_table(df_refined_data, values= ['aor'] , index=['time_bucket'],
                       columns=['yyyymmdd']).reset_index()
df_v3

In [None]:
##df_refined_data[df_refined_data['quarter_hour'] == '0900']

In [None]:
# df_refined_data.to_csv("quarter_hour_20230514.csv", index= False)