In [8]:
import os
import h3 as h3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as datetime

from scipy import stats
from pyhive import presto
from keplergl import KeplerGl
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

In [9]:
# Widen pandas' rich display so wide / long result frames are shown in full:
# up to 50 columns and up to 300 rows before truncation.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 300

In [10]:
## Connection
# Presto connection handed to pd.read_sql by the query cells in this notebook.
# Host, port and username can be overridden with the PRESTO_HOST, PRESTO_PORT
# and PRESTO_USERNAME environment variables so the notebook is not tied to one
# person's account; when they are unset, the previously hard-coded values are
# used, so existing runs behave exactly as before.
connection = presto.connect(
        host=os.environ.get('PRESTO_HOST', 'presto-gateway.serving.data.production.internal'),
        port=int(os.environ.get('PRESTO_PORT', 80)),
        protocol='http',
        catalog='hive',
        username=os.environ.get('PRESTO_USERNAME', 'manoj.ravirajan@rapido.bike')
)

## Parameter 

In [11]:
## Parameter
# Inclusive analysis window (YYYY-MM-DD strings); both values are interpolated
# into the SQL text of the query cell.
start_date, end_date = '2023-10-01', '2023-10-31'

In [49]:
## first ride customers
#
# Price-shock summary for customers whose first taxi ride falls inside
# [start_date, end_date]. The query is built from these CTEs:
#
#   all_customers - customers whose taxi_lifetime_first_ride_date lies in the
#                   window, read from the segment snapshot with
#                   run_date = end_date + 1 day.
#   orders_tbl    - dropped, non-fraud-flagged orders of those customers for
#                   the listed services; only each customer's earliest order
#                   in the window is kept (net_ride_number = 1) and
#                   estimate_ids is unnested to one row per fare estimate id.
#   fe_tbl        - fare estimates of the same customers / services / window.
#   merged_tbl    - orders joined to estimates on fare_estimate_id and
#                   service_name; amt_diff = order_amount - fe_amount, bucketed
#                   into amt_diff_bucket (an amt_diff of exactly 0 matches no
#                   bucket and stays NULL).
#   total_orders  - distinct orders and customers per service.
#   price_shock   - per service, orders having at least one joined estimate with
#                   amt_diff != 0, split by whether any joined estimate came
#                   from api_context '/fare/editOrder'; bucket columns are the
#                   percentage of those orders in each amt_diff bucket.
#
# The final join matches total_orders and price_shock on service_name only.
# date_range is a MIN/MAX(orderdate) string computed separately in each CTE, so
# it is not a stable key: when the shocked subset of a service has no order on
# the first or last day of the window the two strings differ, and joining on
# them would silently drop that service from the result.

first_ride_customers = f"""

    with all_customers as (

        select 
            customer_id as customerid,
            date_format(taxi_lifetime_first_ride_date,'%Y%m%d') as day
        from 
            datasets.iallocator_customer_segments
        where 
            date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') >= date_format(DATE('{start_date}'),'%Y-%m-%d')
            and date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') <= date_format(DATE('{end_date}'),'%Y-%m-%d')
            and run_date = date_format(DATE('{end_date}') + interval '1' day,'%Y-%m-%d')
            
        group by 1,2
        ),

        orders_tbl as (
        
        SELECT
            orderdate,
            city,
            service_name,
            estimate_id, 
            fare_recalculated_reason,
            customer_id,
            order_id,
            order_amount,
            fe_re_type,
            fe_re_diff_amount,
            u.id_array as fare_estimate_id
        FROM
            (
            SELECT   
                city_name as city,
                service_obj_service_name as service_name,
                customer_id,
                order_id,
                amount as order_amount,
                date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
                epoch,
                estimate_id,
                estimate_ids,
                fare_recalculated_reason,
                fare_recalculated_type AS fe_re_type,
                fare_recalculated_diff_amount AS fe_re_diff_amount,
                row_number() over(partition by customer_id order by epoch) as net_ride_number,
                cast(json_parse(estimate_ids) as array<varchar>) AS id_array
            FROM
                orders.order_logs_snapshot
            WHERE
                yyyymmdd >= date_format(DATE('{start_date}'),'%Y%m%d')
                AND yyyymmdd <= date_format(DATE('{end_date}'),'%Y%m%d')
                AND order_status = 'dropped'
                AND (spd_fraud_flag is null or spd_fraud_flag=false)
                AND customer_id in (select customerid from all_customers)
                AND service_obj_service_name IN ('Link', 'Auto', 'Auto Pool', 'CabEconomy')

            ) as t
        CROSS JOIN UNNEST(t.id_array) AS u(id_array)
        
        WHERE  
            net_ride_number in (1)
        ),
        
        
        fe_tbl as (
        SELECT
            user_id as customer_id,
            fare_estimate_id,
            service_name,
            api_context,
            epoch,
            date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
            cast(final_amount AS double) AS fe_amount
        FROM
            pricing.fare_estimates_enriched
        WHERE
            yyyymmdd >= date_format(DATE('{start_date}'),'%Y%m%d')
            AND yyyymmdd <= date_format(DATE('{end_date}'),'%Y%m%d')
            AND service_name IN ('Link', 'Auto', 'Auto Pool', 'CabEconomy')
            AND user_id in (select customerid from all_customers)
        ),

        merged_tbl as (
        
        SELECT 
            *,
            CASE 
            WHEN amt_diff < -10 THEN '< -10'
            WHEN amt_diff >= -10 AND amt_diff < -7 THEN '-10 to -7'
            WHEN amt_diff >= -7 AND amt_diff < -5 THEN '-7 to -5'
            WHEN amt_diff >= -5 AND amt_diff < 0 THEN '-5 to 0'
            WHEN amt_diff > 0 AND amt_diff <= 5 THEN '0 to 5'
            WHEN amt_diff > 5 AND amt_diff <= 7 THEN '5 to 7'
            WHEN amt_diff > 7 AND amt_diff <= 10 THEN '7 to 10'
            WHEN amt_diff > 10 THEN '> 10'
            END AS amt_diff_bucket
        FROM
        (
        
        SELECT  
            o.orderdate,
            o.service_name,
            o.customer_id,
            o.order_id,
            o.estimate_id,
            api_context,
            o.fare_estimate_id,
            o.fare_recalculated_reason,
            fe_amount,
            o.order_amount,
            o.order_amount -  fe_amount AS amt_diff,
            fe_re_type,
            fe_re_diff_amount,
            row_number() over (partition by o.estimate_id order by f.epoch asc)  as updated_seq 
        From orders_tbl o
        INNER JOIN fe_tbl f
        ON o.fare_estimate_id = f.fare_estimate_id
        AND o.service_name = f.service_name
        )
        ),
        
        total_orders AS (
        
        SELECT
            min(orderdate) || ' to ' || max(orderdate) date_range,
            service_name,
            COUNT(DISTINCT order_id) total_orders,
            COUNT(DISTINCT customer_id) total_customer
        FROM
            orders_tbl
        GROUP BY 2
        ),
        
        price_shock AS (    
    
        SELECT
                service_name,
                'fare editOrder' AS price_shock_type,
                MIN(orderdate) || ' to ' || MAX(orderdate) date_range,
                COUNT(DISTINCT order_id) as total_shocked_orders,
                COUNT(DISTINCT customer_id) total_shocked_customer,

                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '< -10' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (< -10)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '-10 to -7' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (-10 to -7)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '-7 to -5' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (-7 to -5)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '-5 to 0' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (-5 to 0)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '0 to 5' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (0 to 5)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '5 to 7' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (5 to 7)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '7 to 10' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (7 to 10)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '> 10' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (10 <)"

        FROM 
            merged_tbl
        WHERE 
            order_id in (SELECT order_id from merged_tbl where  amt_diff!=0)
            AND order_id in (SELECT order_id from merged_tbl where api_context = '/fare/editOrder')
            AND updated_seq = 1
        GROUP BY 1

        UNION 

        SELECT
                service_name,
                'non-fare editOrder' AS price_shock_type,
                MIN(orderdate) || ' to ' || MAX(orderdate) date_range,
                COUNT(DISTINCT order_id) as total_shocked_orders,
                COUNT(DISTINCT customer_id) total_shocked_customer,

                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '< -10' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (< -10)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '-10 to -7' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (-10 to -7)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '-7 to -5' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (-7 to -5)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '-5 to 0' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (-5 to 0)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '0 to 5' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (0 to 5)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '5 to 7' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (5 to 7)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '7 to 10' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (7 to 10)",
                COALESCE(TRY(COUNT(DISTINCT CASE WHEN amt_diff_bucket = '> 10' THEN order_id END)*100.0/COUNT(DISTINCT order_id)), 0)  "amt (10 <)"

        FROM 
            merged_tbl
        WHERE 
            order_id in (SELECT order_id from merged_tbl where  amt_diff!=0)
            AND order_id not in (SELECT order_id from merged_tbl where api_context = '/fare/editOrder')
            AND updated_seq = 1
        GROUP BY 1
        )

    SELECT
        t.date_range,
        ps.price_shock_type,
        t.service_name,
        t.total_orders,
        total_customer,
        total_shocked_orders,
        total_shocked_customer,
        TRY(total_shocked_orders*100.0/total_orders) "shocked_orders%",
        "amt (< -10)",
        "amt (-10 to -7)",
        "amt (-7 to -5)",
        "amt (-5 to 0)",
        "amt (0 to 5)",
        "amt (5 to 7)",
        "amt (7 to 10)",
        "amt (10 <)"

    FROM 
        total_orders AS t

    JOIN 
        price_shock AS ps 
        ON t.service_name = ps.service_name

    ORDER BY 2 DESC, 3
"""


# Run the query over the shared Presto connection and display the summary.
df_first_ride_customers = pd.read_sql(first_ride_customers, connection)
df_first_ride_customers

Unnamed: 0,date_range,price_shock_type,service_name,total_orders,total_customer,total_shocked_orders,total_shocked_customer,shocked_orders%,amt (< -10),amt (-10 to -7),amt (-7 to -5),amt (-5 to 0),amt (0 to 5),amt (5 to 7),amt (7 to 10),amt (10 <)
0,2023-10-01 to 2023-10-31,non-fare editOrder,Auto,442270,442270,148274,148274,33.5,18.9,7.1,3.1,3.1,15.2,4.1,9.8,38.7
1,2023-10-01 to 2023-10-31,non-fare editOrder,CabEconomy,7305,7305,2238,2238,30.6,4.1,0.9,0.6,2.1,17.3,8.1,9.9,57.0
2,2023-10-01 to 2023-10-31,non-fare editOrder,Link,847430,847430,271486,271486,32.0,19.2,7.1,6.3,14.3,16.1,6.5,8.4,22.1
3,2023-10-01 to 2023-10-31,fare editOrder,Auto,442270,442270,13044,13044,2.9,18.4,4.7,2.6,4.6,14.3,3.8,7.5,42.9
4,2023-10-01 to 2023-10-31,fare editOrder,Link,847430,847430,26165,26165,3.1,22.6,10.6,4.6,12.5,12.4,4.7,6.0,25.5


In [31]:
# Keep the current query result under a month-specific name.
# NOTE(review): this binds whatever df_first_ride_customers currently holds;
# presumably the query cell was re-run with July start_date/end_date before
# this cell was executed - confirm, since a top-to-bottom run would store the
# October window here.
df_july = df_first_ride_customers

In [34]:
# Keep the current query result under a month-specific name.
# NOTE(review): presumably the query cell was re-run with August
# start_date/end_date before this cell was executed - confirm; the assignment
# itself only aliases whatever df_first_ride_customers currently holds.
df_aug = df_first_ride_customers

In [37]:
# Keep the current query result under a month-specific name.
# NOTE(review): presumably the query cell was re-run with September
# start_date/end_date before this cell was executed - confirm; the assignment
# itself only aliases whatever df_first_ride_customers currently holds.
df_sep = df_first_ride_customers

In [40]:
# Keep the current query result under a month-specific name.
# NOTE(review): presumably the query cell was last run with October
# start_date/end_date before this cell was executed - confirm; the assignment
# itself only aliases whatever df_first_ride_customers currently holds.
df_oct = df_first_ride_customers

In [41]:
# Stack the four monthly snapshots into one frame. The first entry used to be
# the undefined name `df_julyuly`, which raised NameError; the July snapshot is
# stored as `df_july`.
frames = [df_july, df_aug, df_sep, df_oct]
result = pd.concat(frames)

In [45]:
# Order the combined rows by shock type and then service, and copy them to the
# system clipboard without the index column.
(
    result
    .sort_values(by=['price_shock_type', 'service_name'])
    .to_clipboard(index=False)
)

## Extended analysis 

In [5]:
## Parameter
# Inclusive analysis window (YYYY-MM-DD strings) for the extended analysis;
# both values are interpolated into the SQL text of the query cell.
start_date, end_date = '2023-07-01', '2023-07-31'

In [16]:
## first ride customers
#
# Reduced variant of the price-shock summary: per service, the share of
# first-ride orders (customers whose taxi_lifetime_first_ride_date falls inside
# [start_date, end_date]) that count as price-shocked, without bucket columns.
#
#   all_customers - customers whose first taxi ride date lies in the window,
#                   read from the snapshot with run_date = end_date + 1 day.
#   orders_tbl    - dropped, non-fraud-flagged orders of those customers for
#                   the listed services; only each customer's earliest order
#                   in the window is kept (net_ride_number = 1) and
#                   estimate_ids is unnested to one row per fare estimate id.
#   fe_tbl        - fare estimates of the same customers / services / window.
#   merged_tbl    - orders joined to estimates on fare_estimate_id and
#                   service_name; amt_diff = order_amount - fe_amount.
#   total_orders  - distinct orders and customers per service.
#   price_shock   - per service, orders having at least one joined estimate with
#                   amt_diff != 0 and no joined estimate from api_context
#                   '/fare/editOrder'.
#
# The final join matches total_orders and price_shock on service_name only.
# date_range is a MIN/MAX(orderdate) string computed separately in each CTE, so
# it is not a stable key: when the shocked subset of a service has no order on
# the first or last day of the window the two strings differ, and joining on
# them would silently drop that service from the result.

first_ride_customers = f"""

    with all_customers as (

        select 
            customer_id as customerid,
            date_format(taxi_lifetime_first_ride_date,'%Y%m%d') as day
        from 
            datasets.iallocator_customer_segments
        where 
            date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') >= date_format(DATE('{start_date}'),'%Y-%m-%d')
            and date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') <= date_format(DATE('{end_date}'),'%Y-%m-%d')
            and run_date = date_format(DATE('{end_date}') + interval '1' day,'%Y-%m-%d')
            
        group by 1,2
        ),

        orders_tbl as (
        
        SELECT
            orderdate,
            city,
            service_name,
            estimate_id, 
            fare_recalculated_reason,
            customer_id,
            order_id,
            order_amount,
            fe_re_type,
            fe_re_diff_amount,
            u.id_array as fare_estimate_id
        FROM
            (
            SELECT   
                city_name as city,
                service_obj_service_name as service_name,
                customer_id,
                order_id,
                amount as order_amount,
                date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
                epoch,
                estimate_id,
                estimate_ids,
                fare_recalculated_reason,
                fare_recalculated_type AS fe_re_type,
                fare_recalculated_diff_amount AS fe_re_diff_amount,
                row_number() over(partition by customer_id order by epoch) as net_ride_number,
                cast(json_parse(estimate_ids) as array<varchar>) AS id_array
            FROM
                orders.order_logs_snapshot
            WHERE
                yyyymmdd >= date_format(DATE('{start_date}'),'%Y%m%d')
                AND yyyymmdd <= date_format(DATE('{end_date}'),'%Y%m%d')
                AND order_status = 'dropped'
                AND (spd_fraud_flag is null or spd_fraud_flag=false)
                AND customer_id in (select customerid from all_customers)
                AND service_obj_service_name IN ('Link', 'Auto', 'Auto Pool', 'CabEconomy')

            ) as t
        CROSS JOIN UNNEST(t.id_array) AS u(id_array)
        
        WHERE  
            net_ride_number in (1)
        ),
        
        
        fe_tbl as (
        SELECT
            user_id as customer_id,
            fare_estimate_id,
            service_name,
            api_context,
            epoch,
            date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
            cast(final_amount AS double) AS fe_amount
        FROM
            pricing.fare_estimates_enriched
        WHERE
            yyyymmdd >= date_format(DATE('{start_date}'),'%Y%m%d')
            AND yyyymmdd <= date_format(DATE('{end_date}'),'%Y%m%d')
            AND service_name IN ('Link', 'Auto', 'Auto Pool', 'CabEconomy')
            AND user_id in (select customerid from all_customers)
        ),

        merged_tbl as (
        
        SELECT  
            o.orderdate,
            o.service_name,
            o.customer_id,
            o.order_id,
            o.estimate_id,
            api_context,
            o.fare_estimate_id,
            o.fare_recalculated_reason,
            fe_amount,
            o.order_amount,
            o.order_amount -  fe_amount AS amt_diff,
            fe_re_type,
            fe_re_diff_amount,
            row_number() over (partition by o.estimate_id order by f.epoch asc)  as updated_seq 
        From orders_tbl o
        INNER JOIN fe_tbl f
        ON o.fare_estimate_id = f.fare_estimate_id
        AND o.service_name = f.service_name
        ),
        
        total_orders AS (
        
        SELECT
            min(orderdate) || ' to ' || max(orderdate) date_range,
            service_name,
            COUNT(DISTINCT order_id) total_orders,
            COUNT(DISTINCT customer_id) total_customer
        FROM
            orders_tbl
        GROUP BY 2
        ),
        
        price_shock AS (    

        SELECT
                service_name,
                MIN(orderdate) || ' to ' || MAX(orderdate) date_range,
                COUNT(DISTINCT order_id) as total_shocked_orders,
                COUNT(DISTINCT customer_id) total_shocked_customer
        FROM 
            merged_tbl
        WHERE 
            order_id in (SELECT order_id from merged_tbl where  amt_diff!=0)
            AND order_id not in (SELECT order_id from merged_tbl where api_context = '/fare/editOrder')
            AND updated_seq = 1
        GROUP BY 1
        )

    SELECT
        t.date_range,
        t.service_name,
        t.total_orders,
        total_customer,
        total_shocked_orders,
        total_shocked_customer,
        TRY(total_shocked_orders*100.0/total_orders) "shocked_orders%"

    FROM 
        total_orders AS t

    JOIN 
        price_shock AS ps 
        ON t.service_name = ps.service_name

    ORDER BY 2
"""


# Run the query over the shared Presto connection and display the summary.
df_first_ride_customers = pd.read_sql(first_ride_customers, connection)
df_first_ride_customers

Unnamed: 0,date_range,service_name,total_orders,total_customer,total_shocked_orders,total_shocked_customer,shocked_orders%
0,2023-10-01 to 2023-10-31,Auto,442270,442270,148274,148274,33.5
1,2023-10-01 to 2023-10-31,CabEconomy,7305,7305,2238,2238,30.6
2,2023-10-01 to 2023-10-31,Link,847430,847430,271486,271486,32.0
