## Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as datetime

from scipy import stats
from pyhive import presto
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 300)

In [3]:
## Connection
connection = presto.connect(
        host='presto-gateway.serving.data.production.internal',
        port=80,
        protocol='http',
        catalog='hive',
        username='manoj.ravirajan@rapido.bike'
)

## Parameter 

In [94]:
## Parameter 
start_date = '2023-07-01'
end_date = '2023-07-31'

## Analysis

In [95]:
first_ride_customers = f"""
    with iallocator_customer_segments as (

        select 
            date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') as ftu_day,
            customer_id as customerid
        from 
            datasets.iallocator_customer_segments
        where 
            date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') >= date_format(DATE('{start_date}'),'%Y-%m-%d')
            and date_format(taxi_lifetime_first_ride_date,'%Y-%m-%d') <= date_format(DATE('{end_date}'),'%Y-%m-%d')
            and run_date = date_format(DATE('{end_date}') + interval '2' day,'%Y-%m-%d')
            
        group by 1,2
        ),
        
        order_logs_snapshot AS (
            
        SELECT
            orderdate,
            ftu_day,
            city,
            service_name,
            estimate_id, 
            fare_recalculated_reason,
            customer_id,
            order_id,
            net_ride_number,
            order_amount,
            fe_re_type,
            fe_re_diff_amount,
            u.id_array as fare_estimate_id
        FROM
            (
            SELECT   
                city_name as city,
                service_obj_service_name as service_name,
                customer_id,
                order_id,
                amount as order_amount,
                date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
                ftu_day,
                epoch,
                estimate_id,
                estimate_ids,
                fare_recalculated_reason,
                fare_recalculated_type AS fe_re_type,
                fare_recalculated_diff_amount AS fe_re_diff_amount,
                row_number() over(partition by customer_id order by epoch) as net_ride_number,
                cast(json_parse(estimate_ids) as array<varchar>) AS id_array
            FROM
                orders.order_logs_snapshot
            INNER JOIN 
                iallocator_customer_segments cs
                ON cs.customerid = customer_id
            
            WHERE
                yyyymmdd >= date_format(DATE('{start_date}'),'%Y%m%d')
                AND yyyymmdd <= date_format(DATE('{end_date}') + INTERVAL '45' DAY,'%Y%m%d')
                AND order_status = 'dropped'
                AND (spd_fraud_flag is null or spd_fraud_flag=false)
                AND service_obj_service_name IN ('Link', 'Auto', 'Auto Pool', 'CabEconomy')
            ) as t
        CROSS JOIN UNNEST(t.id_array) AS u(id_array)
        
        WHERE  
            net_ride_number in (1,2,3,4)
    ),

    fe_tbl as (
        SELECT
            user_id as customer_id,
            fare_estimate_id,
            service_name,
            api_context,
            epoch,
            date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
            cast(final_amount AS double) AS fe_amount
        FROM
            pricing.fare_estimates_enriched
        WHERE
            yyyymmdd >= date_format(DATE('{start_date}'),'%Y%m%d')
            AND yyyymmdd <= date_format(DATE('{end_date}') + INTERVAL '45' DAY,'%Y%m%d')
            AND service_name IN ('Link', 'Auto', 'Auto Pool', 'CabEconomy')
            AND user_id in (SELECT customerid FROM iallocator_customer_segments)
    ),

    merged_tbl as (
        SELECT 
            *
        FROM 
        (
        SELECT  
            o.orderdate,
            o.ftu_day,
            o.service_name,
            o.customer_id,
            o.order_id,
            o.estimate_id,
            o.net_ride_number,
            api_context,
            o.fare_estimate_id,
            o.fare_recalculated_reason,
            fe_amount,
            o.order_amount,
            o.order_amount -  fe_amount AS amt_diff,
            fe_re_type,
            fe_re_diff_amount,
            row_number() over (partition by o.estimate_id order by f.epoch asc)  as updated_seq 
        FROM 
            order_logs_snapshot o
        INNER JOIN 
            fe_tbl f
            ON o.fare_estimate_id = f.fare_estimate_id
            AND o.service_name = f.service_name
        )
        WHERE updated_seq = 1
    ),

    price_shock AS (    

        SELECT
            orderdate ftu_date,
            customer_id,
            'price_shock_in_ltr1' type 
        FROM 
            merged_tbl
        WHERE 
            order_id in (SELECT order_id from merged_tbl where  amt_diff!=0)
            AND order_id not in (SELECT order_id from merged_tbl where api_context = '/fare/editOrder')
            AND net_ride_number = 1
        ),
        
    ftu_service AS (
        
        SELECT
            service_name,
            customer_id
        FROM
            order_logs_snapshot
        WHERE 
            net_ride_number = 1
        GROUP BY 1,2
        )

    
        SELECT 
            date_format(DATE('{start_date}'),'%Y-%m-%d') || ' to ' || date_format(DATE('{end_date}'),'%Y-%m-%d') date_range,
            COALESCE(ps.type, 'non-price_shock_in_ltr1') type,
            ftu_service.service_name,
            COUNT(DISTINCT mt.customer_id) customer_ltr1,
            COUNT(DISTINCT CASE WHEN net_ride_number = 2 AND ftu_day <= orderdate THEN mt.customer_id END) customer_ltr2,
            COUNT(DISTINCT CASE WHEN net_ride_number = 3 AND ftu_day <= orderdate THEN mt.customer_id END) customer_ltr3,
            COUNT(DISTINCT CASE WHEN net_ride_number = 4 AND ftu_day <= orderdate THEN mt.customer_id END) customer_ltr4

        FROM 
            merged_tbl mt
        LEFT JOIN 
            price_shock ps   
            ON mt.customer_id = ps.customer_id
        LEFT JOIN
            ftu_service 
            ON ftu_service.customer_id = mt.customer_id
        
        GROUP BY 1,2,3

"""

df_first_ride_customers = pd.read_sql(first_ride_customers, connection)
df_first_ride_customers

Unnamed: 0,date_range,type,service_name,customer_ltr1,customer_ltr2,customer_ltr3,customer_ltr4
0,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,CabEconomy,2988,1758,2078,1901
1,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Auto Pool,4,3,4,3
2,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Auto,163054,89749,60641,44252
3,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Link,585526,336286,227370,167787
4,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Link,290064,153506,101331,73468
5,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Auto Pool,1,0,1,1
6,2023-07-01 to 2023-07-31,price_shock_in_ltr1,CabEconomy,1108,636,761,697
7,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Auto,277357,166052,116042,86225


In [96]:
df_first_ride_customers.customer_ltr1.sum()

1320102

In [97]:
df_july = df_first_ride_customers

In [98]:
df_july

Unnamed: 0,date_range,type,service_name,customer_ltr1,customer_ltr2,customer_ltr3,customer_ltr4
0,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,CabEconomy,2988,1758,2078,1901
1,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Auto Pool,4,3,4,3
2,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Auto,163054,89749,60641,44252
3,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Link,585526,336286,227370,167787
4,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Link,290064,153506,101331,73468
5,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Auto Pool,1,0,1,1
6,2023-07-01 to 2023-07-31,price_shock_in_ltr1,CabEconomy,1108,636,761,697
7,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Auto,277357,166052,116042,86225


In [90]:
df_aug

Unnamed: 0,date_range,type,service_name,customer_ltr1,customer_ltr2,customer_ltr3,customer_ltr4
0,2023-08-01 to 2023-08-31,non-price_shock_in_ltr1,Link,606230,350491,236980,174810
1,2023-08-01 to 2023-08-31,price_shock_in_ltr1,Link,305947,164766,108723,78562
2,2023-08-01 to 2023-08-31,non-price_shock_in_ltr1,Auto Pool,20,13,14,13
3,2023-08-01 to 2023-08-31,non-price_shock_in_ltr1,Auto,277679,164032,114432,84538
4,2023-08-01 to 2023-08-31,price_shock_in_ltr1,Auto Pool,3,3,2,1
5,2023-08-01 to 2023-08-31,non-price_shock_in_ltr1,CabEconomy,2543,1454,1709,1557
6,2023-08-01 to 2023-08-31,price_shock_in_ltr1,CabEconomy,1330,794,908,800
7,2023-08-01 to 2023-08-31,price_shock_in_ltr1,Auto,158927,86393,57676,41849


In [83]:
df_sep

Unnamed: 0,date_range,type,service_name,customer_ltr1,customer_ltr2,customer_ltr3,customer_ltr4
0,2023-09-01 to 2023-09-30,price_shock_in_ltr1,CabEconomy,2063,1280,1397,1195
1,2023-09-01 to 2023-09-30,price_shock_in_ltr1,Link,291891,151555,98052,69920
2,2023-09-01 to 2023-09-30,non-price_shock_in_ltr1,Link,543980,301349,199547,144531
3,2023-09-01 to 2023-09-30,non-price_shock_in_ltr1,CabEconomy,3794,2252,2545,2209
4,2023-09-01 to 2023-09-30,non-price_shock_in_ltr1,Auto,274360,155560,106115,77422
5,2023-09-01 to 2023-09-30,non-price_shock_in_ltr1,Auto Pool,58,39,44,42
6,2023-09-01 to 2023-09-30,price_shock_in_ltr1,Auto,159174,82537,54039,38480
7,2023-09-01 to 2023-09-30,price_shock_in_ltr1,Auto Pool,5,4,3,3


In [79]:
df_oct

Unnamed: 0,date_range,type,service_name,customer_ltr1,customer_ltr2,customer_ltr3,customer_ltr4
0,2023-10-01 to 2023-10-31,price_shock_in_ltr1,Auto,161205,67279,37317,23191
1,2023-10-01 to 2023-10-31,non-price_shock_in_ltr1,Auto,281162,130521,75659,48800
2,2023-10-01 to 2023-10-31,non-price_shock_in_ltr1,Link,550162,246827,140033,90957
3,2023-10-01 to 2023-10-31,price_shock_in_ltr1,CabEconomy,2448,1345,1349,1095
4,2023-10-01 to 2023-10-31,non-price_shock_in_ltr1,CabEconomy,4868,2839,2690,2149
5,2023-10-01 to 2023-10-31,price_shock_in_ltr1,Link,297428,122375,67082,42680
6,2023-10-01 to 2023-10-31,non-price_shock_in_ltr1,Auto Pool,90,70,54,38


In [100]:
frames = [df_july,df_aug,df_sep,df_oct]
result = pd.concat(frames)

In [102]:
result.columns

Index(['date_range', 'type', 'service_name', 'customer_ltr1', 'customer_ltr2',
       'customer_ltr3', 'customer_ltr4'],
      dtype='object')

In [103]:
result[['date_range', 'type', 'service_name', 'customer_ltr1', 'customer_ltr2',
       'customer_ltr3', 'customer_ltr4']]

Unnamed: 0,date_range,type,service_name,customer_ltr1,customer_ltr2,customer_ltr3,customer_ltr4
0,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,CabEconomy,2988,1758,2078,1901
1,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Auto Pool,4,3,4,3
2,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Auto,163054,89749,60641,44252
3,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Link,585526,336286,227370,167787
4,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Link,290064,153506,101331,73468
5,2023-07-01 to 2023-07-31,price_shock_in_ltr1,Auto Pool,1,0,1,1
6,2023-07-01 to 2023-07-31,price_shock_in_ltr1,CabEconomy,1108,636,761,697
7,2023-07-01 to 2023-07-31,non-price_shock_in_ltr1,Auto,277357,166052,116042,86225
0,2023-08-01 to 2023-08-31,non-price_shock_in_ltr1,Link,606230,350491,236980,174810
1,2023-08-01 to 2023-08-31,price_shock_in_ltr1,Link,305947,164766,108723,78562


In [104]:
result[['date_range', 'type', 'service_name', 'customer_ltr1', 'customer_ltr2',
       'customer_ltr3', 'customer_ltr4']].to_clipboard(index=False)