## Imports

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as datetime

from scipy import stats
from pyhive import presto
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Widen the pandas display limits so wide / long frames render in full.
for option_name, option_limit in (('display.max_columns', 50),
                                  ('display.max_rows', 300)):
    pd.set_option(option_name, option_limit)

In [3]:
## Connection
# Presto connection used by every pd.read_sql call in this notebook.
# The user name can be overridden with the PRESTO_USERNAME environment variable
# so the notebook is not tied to a single person's account; the previously
# hard-coded value is kept as the fallback, so existing runs behave the same.
connection = presto.connect(
        host='presto-gateway.serving.data.production.internal',
        port=80,
        protocol='http',
        catalog='hive',
        username=os.environ.get('PRESTO_USERNAME', 'manoj.ravirajan@rapido.bike')
)

## Parameter

In [4]:
## Parameter
# Analysis window as yyyymmdd strings; the queries compare the yyyymmdd
# partition column with >= start_date and <= end_date (both ends inclusive).
start_date, end_date = '20231019', '20231027'

## Datasets

### clevertap_customer_orderactivity

##clevertap_customer_orderactivity

# Distinct (userId, currentCity, screen_version) combinations that fired the
# Android `orderactivity` event with order_activity_source = 'appOpen' between
# start_date and end_date, limited to the four listed cities and to screen
# versions M0 / M0.5. userId is selected twice, once under the alias
# appOpen_userId. GROUP BY 1,2,3,4 only de-duplicates the rows.
clevertap_customer_orderactivity = f"""

    SELECT
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') userId,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') appOpen_userId,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') currentCity,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') screen_version
        
    FROM 
        raw.clevertap_customer_events_master
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND profile_platform = 'Android'
        AND eventname = 'orderactivity'
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') IN ('M0.5', 'M0')
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.order_activity_source') = 'appOpen' 
    GROUP BY 1,2,3,4
"""

# Run the query over the Presto connection, then persist the result as a CSV
# whose name carries the start/end dates (written without the frame index).
df_clevertap_customer_orderactivity = pd.read_sql(clevertap_customer_orderactivity, connection)
df_clevertap_customer_orderactivity \
                            .to_csv('clevertap_customer_orderactivity_{}_{}.csv' \
                                           .format(start_date,end_date), index=False)

In [5]:
## read local extract
# Reload the date-stamped CSV written by the extraction cell.
orderactivity_extract_path = 'clevertap_customer_orderactivity_{}_{}.csv'.format(start_date,end_date)
df_clevertap_customer_orderactivity = pd.read_csv(orderactivity_extract_path)

In [6]:
# Dimensions first, then a three-row preview of the appOpen extract.
print(df_clevertap_customer_orderactivity.shape)
df_clevertap_customer_orderactivity.iloc[:3]

(120508, 4)


Unnamed: 0,userId,appOpen_userId,currentCity,screen_version
0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,Bangalore,M0
1,6320ed1ddc7efa3b21179c4f,6320ed1ddc7efa3b21179c4f,Bangalore,M0.5
2,645212d3a3c24ec3b4ecdf38,645212d3a3c24ec3b4ecdf38,Hyderabad,M0


In [7]:
# Distinct appOpen users per city and screen version.
(
    df_clevertap_customer_orderactivity
    .groupby(['currentCity', 'screen_version'])[['appOpen_userId']]
    .nunique()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,appOpen_userId
currentCity,screen_version,Unnamed: 2_level_1
Bangalore,M0,13135
Bangalore,M0.5,11485
Delhi,M0,15044
Delhi,M0.5,13686
Hyderabad,M0,32280
Hyderabad,M0.5,26604
Kolkata,M0,4376
Kolkata,M0.5,3898


### clevertap_customer_searchaddress

##clevertap_customer_searchaddress

# Distinct (userId, currentCity, screen_version) combinations that fired the
# `searchaddress` event between start_date and end_date, limited to the four
# listed cities and to screen versions M0 / M0.5. userId is selected twice,
# once under the alias searchaddress_userId.
# NOTE(review): unlike the orderactivity query, there is no
# profile_platform = 'Android' filter here - confirm whether that is intended.
clevertap_customer_searchaddress = f"""

    SELECT 
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') userId,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') searchaddress_userId,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') currentCity,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') screen_version
    FROM 
        raw.clevertap_customer_events_master
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND eventname = 'searchaddress'
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') IN ('M0.5', 'M0')

    GROUP BY 1,2,3,4
"""

# Run the query over the Presto connection, then persist the result as a CSV
# whose name carries the start/end dates (written without the frame index).
df_clevertap_customer_searchaddress = pd.read_sql(clevertap_customer_searchaddress, connection)
df_clevertap_customer_searchaddress \
                            .to_csv('clevertap_customer_searchaddress_{}_{}.csv' \
                            .format(start_date,end_date), index=False)

In [8]:
## read local extract
# Reload the date-stamped CSV written by the extraction cell.
searchaddress_extract_path = 'clevertap_customer_searchaddress_{}_{}.csv'.format(start_date,end_date)
df_clevertap_customer_searchaddress = pd.read_csv(searchaddress_extract_path)

In [9]:
# Dimensions first, then a three-row preview of the searchaddress extract.
print(df_clevertap_customer_searchaddress.shape)
df_clevertap_customer_searchaddress.iloc[:3]

(101837, 4)


Unnamed: 0,userId,searchaddress_userId,currentCity,screen_version
0,5cb338de54bc7263ff38e407,5cb338de54bc7263ff38e407,Hyderabad,M0
1,61aa42b217dd6a8bcc090c76,61aa42b217dd6a8bcc090c76,Hyderabad,M0
2,573f28ff9b0ffc2836771da1,573f28ff9b0ffc2836771da1,Bangalore,M0.5


In [10]:
# Distinct searchaddress users per city and screen version.
(
    df_clevertap_customer_searchaddress
    .groupby(['currentCity', 'screen_version'])[['searchaddress_userId']]
    .nunique()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,searchaddress_userId
currentCity,screen_version,Unnamed: 2_level_1
Bangalore,M0,10398
Bangalore,M0.5,10376
Delhi,M0,12386
Delhi,M0.5,12411
Hyderabad,M0,24787
Hyderabad,M0.5,24705
Kolkata,M0,3395
Kolkata,M0.5,3379


##code snippet 

# Build a comma-separated list of single-quoted user ids from the distinct
# userId values of the appOpen extract, then print it.
selected_userId = df_clevertap_customer_orderactivity['userId'].unique()
unique_selected_userId = selected_userId.tolist()

unique_selected_userId_string = ', '.join(
    f"'{user_id}'" for user_id in unique_selected_userId
)

print(unique_selected_userId_string)

### CT FE Data (Option 1)
#### clevertap_customer_fare_estimate

##clevertap_customer_fare_estimate & order_logs_snapshot

# Fare estimates joined to ride requests for the appOpen cohort, with
# canonical.clevertap_customer_fare_estimate as the fare-estimate source.
#   orderactivity_customer           - distinct userIds of the Android appOpen
#                                      cohort (same filters as the appOpen extract)
#   clevertap_customer_fare_estimate - distinct Android fare-estimate rows of
#                                      those users in the four cities
#   order_logs_snapshot              - orders of those users; estimate_ids is
#                                      parsed into an array and UNNESTed, giving
#                                      one row per (order, estimate id)
# The final SELECT left-joins estimates to orders on fare_estimate_id AND
# service_detail_id, so estimates without a matching order keep NULL rr_* columns.
fare_estimate_to_request_ride = f"""

    WITH orderactivity_customer AS (
    
    SELECT 
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') userId
    FROM 
        raw.clevertap_customer_events_master
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND profile_platform = 'Android'
        AND eventname = 'orderactivity'
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.order_activity_source') = 'appOpen' 
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') IN ('M0.5', 'M0')
    GROUP BY 1
    ),
    
    clevertap_customer_fare_estimate AS (
    
    SELECT 
        fe.user_id fe_userId,
        city,
        fare_estimate_id,
        service_name,
        service_details_id service_detail_id
    FROM 
        canonical.clevertap_customer_fare_estimate fe
        
    JOIN
        orderactivity_customer ao
        ON ao.userId = fe.user_id
        
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND platform = 'Android'
        AND city IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        
    GROUP BY 1,2,3,4,5
    ),
    
    order_logs_snapshot AS (
    
    SELECT
        city,
        service_name,
        service_detail_id,
        customer_id,
        u.id_array as fare_estimate_id,
        order_id
    FROM
        (
        SELECT   
            -- date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y%m%d') AS orderdate,
            city_name as city,
            service_obj_service_name as service_name,
            service_detail_id,
            customer_id,
            order_id,
            estimate_id,
            estimate_ids,
            cast(json_parse(estimate_ids) as array<varchar>) AS id_array
        FROM
            orders.order_logs_snapshot

        WHERE
            yyyymmdd >= '{start_date}'
            AND yyyymmdd <= '{end_date}'
            AND city_name IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')

        ) as t
    CROSS JOIN UNNEST(t.id_array) AS u(id_array)
    
    JOIN 
        orderactivity_customer ao
        ON ao.userId = t.customer_id
    )
    
    SELECT 
        fe.*,
        rr.customer_id rr_customer,
        rr.fare_estimate_id rr_fare_estimate_id,
        rr.service_name rr_service_name,
        rr.order_id
        
    FROM 
        clevertap_customer_fare_estimate fe
    LEFT JOIN 
        order_logs_snapshot rr
        ON fe.fare_estimate_id = rr.fare_estimate_id
        AND fe.service_detail_id = rr.service_detail_id
    
"""

# Run the query over the Presto connection, then persist the result as a CSV
# whose name carries the start/end dates (written without the frame index).
df_fare_estimate_to_request_ride = pd.read_sql(fare_estimate_to_request_ride, connection)
df_fare_estimate_to_request_ride \
                            .to_csv('fare_estimate_to_request_ride_{}_{}.csv' \
                            .format(start_date,end_date), index=False)

In [11]:
## read local extract
# Reload the date-stamped CSV written by the extraction cell.
fare_estimate_extract_path = 'fare_estimate_to_request_ride_{}_{}.csv'.format(start_date,end_date)
df_fare_estimate_to_request_ride = pd.read_csv(fare_estimate_extract_path)

In [12]:
# Dimensions first, then a three-row preview of the CleverTap FE -> RR extract.
print(df_fare_estimate_to_request_ride.shape)
df_fare_estimate_to_request_ride.iloc[:3]

(1218086, 9)


Unnamed: 0,fe_userId,city,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id
0,652cfad521ea527fa52f89c5,Hyderabad,6538cc68ab40188444cce478,Car,,,,,
1,5fcde79e3ab0f7f797f380ff,Hyderabad,653a95c7c6aed60610270dc7,Bike,,,,,
2,6353b560d58af7c0a2d5a4a7,Delhi,6535f2390e639c012cb55c71,Link,5da4660028af187d8d52cc3c,6353b560d58af7c0a2d5a4a7,6535f2390e639c012cb55c71,Link,6535f28b1853204196819f35


In [13]:
# Distinct fare-estimate users and ride-request customers per city.
# ('fare_estimate_id' and 'order_id' can be added to the column list to count
# estimates and orders as well.)
(
    df_fare_estimate_to_request_ride
    .groupby(['city'])[['fe_userId', 'rr_customer']]
    .nunique()
)

Unnamed: 0_level_0,fe_userId,rr_customer
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangalore,5353,3908
Delhi,12110,5330
Hyderabad,40096,6343
Kolkata,5771,1571


### CT FE Data (Option 2)
#### fare_estimates_enriched

##pricing.fare_estimates_enriched

# Fare estimates joined to ride requests for the appOpen cohort, with
# pricing.fare_estimates_enriched as the fare-estimate source.
#   orderactivity_customer  - distinct userIds of the Android appOpen cohort
#                             (same filters as the appOpen extract)
#   fare_estimates_enriched - distinct fare-estimate rows of those users in the
#                             four cities (no platform filter in this CTE)
#   order_logs_snapshot     - orders of those users; estimate_ids is parsed into
#                             an array and UNNESTed, giving one row per
#                             (order, estimate id)
# The final SELECT left-joins estimates to orders on fare_estimate_id AND
# service_detail_id, so estimates without a matching order keep NULL rr_* columns.
pricing_fare_estimates_to_request_ride = f"""

    WITH orderactivity_customer AS (
    
    SELECT 
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') userId
    FROM 
        raw.clevertap_customer_events_master
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND profile_platform = 'Android'
        AND eventname = 'orderactivity'
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.order_activity_source') = 'appOpen' 
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') IN ('M0.5', 'M0')
    GROUP BY 1
    ),
    
    fare_estimates_enriched AS (
    
    SELECT 
        fe.user_id fe_userId,
        city,
        fare_estimate_id,
        service_name,
        service_detail_id
    FROM 
        pricing.fare_estimates_enriched fe
        
    JOIN
        orderactivity_customer ao
        ON ao.userId = fe.user_id
        
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND city IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')

    GROUP BY 1,2,3,4,5
    ),
    
    order_logs_snapshot AS (
    
    SELECT
        city,
        service_name,
        service_detail_id,
        customer_id,
        u.id_array as fare_estimate_id,
        order_id
    FROM
        (
        SELECT   
            -- date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y%m%d') AS orderdate,
            city_name as city,
            service_obj_service_name as service_name,
            service_detail_id,
            customer_id,
            order_id,
            estimate_id,
            estimate_ids,
            cast(json_parse(estimate_ids) as array<varchar>) AS id_array
        FROM
            orders.order_logs_snapshot

        WHERE
            yyyymmdd >= '{start_date}'
            AND yyyymmdd <= '{end_date}'
            AND city_name IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')

        ) as t
    CROSS JOIN UNNEST(t.id_array) AS u(id_array)
    
    JOIN 
        orderactivity_customer ao
        ON ao.userId = t.customer_id
    )
    
    SELECT 
        fe.*,
        rr.customer_id rr_customer,
        rr.fare_estimate_id rr_fare_estimate_id,
        rr.service_name rr_service_name,
        rr.order_id
        
    FROM 
        fare_estimates_enriched fe
    LEFT JOIN 
        order_logs_snapshot rr
        ON fe.fare_estimate_id = rr.fare_estimate_id
        AND fe.service_detail_id = rr.service_detail_id
    
    
"""

# Run the query over the Presto connection, then persist the result as a CSV
# whose name carries the start/end dates (written without the frame index).
df_pricing_fare_estimates_to_request_ride = pd.read_sql(pricing_fare_estimates_to_request_ride, connection)
df_pricing_fare_estimates_to_request_ride \
                            .to_csv('pricing_fare_estimates_to_request_ride_{}_{}.csv' \
                            .format(start_date,end_date), index=False)

In [14]:
## read local extract
# Reload the date-stamped CSV written by the extraction cell.
pricing_extract_path = 'pricing_fare_estimates_to_request_ride_{}_{}.csv'.format(start_date,end_date)
df_pricing_fare_estimates_to_request_ride = pd.read_csv(pricing_extract_path)

In [15]:
# Dimensions first, then a three-row preview of the pricing FE -> RR extract.
print(df_pricing_fare_estimates_to_request_ride.shape)
df_pricing_fare_estimates_to_request_ride.iloc[:3]

(4423128, 9)


Unnamed: 0,fe_userId,city,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id
0,623c26c9cbae733350bb3201,Delhi,6533859b83e7882223ba7ffe,Link,5da4660028af187d8d52cc3c,623c26c9cbae733350bb3201,6533859b83e7882223ba7ffe,Link,653385a00bf3de00b3d749b9
1,6532720064ed43d3270ceddf,Bangalore,6533c63e5f4e3196c604be72,Link,57370b61a6855d70057417d1,6532720064ed43d3270ceddf,6533c63e5f4e3196c604be72,Link,6533c647e2dd8540d4e2e47a
2,62c8ebb49adb4ef855302688,Hyderabad,6538f7c06cd8d201fe3d1c40,Auto,5ef2bc5b85846b775f97d170,,,,


In [16]:
# Distinct fare-estimate users and ride-request customers per city.
# ('fare_estimate_id' and 'order_id' can be added to the column list to count
# estimates and orders as well.)
(
    df_pricing_fare_estimates_to_request_ride
    .groupby(['city'])[['fe_userId', 'rr_customer']]
    .nunique()
)

Unnamed: 0_level_0,fe_userId,rr_customer
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangalore,21789,18054
Delhi,25611,20656
Hyderabad,50889,42083
Kolkata,7076,5127


### CT vs. Pricing FE dataset

In [17]:
# Label the table, then show distinct FE users / RR customers per city for the
# CleverTap fare-estimate source.
print('canonical.clevertap_customer_fare_estimate \n ---------------------------------')

(
    df_fare_estimate_to_request_ride
    .groupby(['city'])[['fe_userId', 'rr_customer']]
    .nunique()
)

canonical.clevertap_customer_fare_estimate 
 ---------------------------------


Unnamed: 0_level_0,fe_userId,rr_customer
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangalore,5353,3908
Delhi,12110,5330
Hyderabad,40096,6343
Kolkata,5771,1571


In [18]:
# Label the table, then show distinct FE users / RR customers per city for the
# pricing fare-estimate source.
print('pricing.fare_estimates_enriched \n ---------------------------------')

(
    df_pricing_fare_estimates_to_request_ride
    .groupby(['city'])[['fe_userId', 'rr_customer']]
    .nunique()
)

pricing.fare_estimates_enriched 
 ---------------------------------


Unnamed: 0_level_0,fe_userId,rr_customer
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangalore,21789,18054
Delhi,25611,20656
Hyderabad,50889,42083
Kolkata,7076,5127


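- pricing.fare_estimates_enriched has the more complete data points (more FE users and RR customers in every city than canonical.clevertap_customer_fare_estimate), so it is the fare-estimate source used in the analysis below.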

## iallocator_customer_segments

##iallocator_customer_segments

# Customer segment snapshot for the run_date equal to start_date (reformatted
# from yyyymmdd to yyyy-mm-dd): customer_id plus taxi_service_affinity (aliased
# service_affinity) for customers whose taxi_recency_segment is not 'INACTIVE'
# and who have more than one lifetime taxi ride. The city filter is disabled.
iallocator_customer_segments = f"""

    SELECT 
        customer_id,
        taxi_service_affinity service_affinity  
    FROM 
        datasets.iallocator_customer_segments
    WHERE 
        run_date = DATE_FORMAT(DATE_PARSE('{start_date}', '%Y%m%d'), '%Y-%m-%d')
        AND taxi_recency_segment != 'INACTIVE'
        AND taxi_lifetime_rides > 1
        -- AND taxi_lifetime_last_ride_city IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
"""

# Run the query over the Presto connection, then persist the result as a CSV
# whose name carries start_date (written without the frame index).
df_iallocator_customer_segments = pd.read_sql(iallocator_customer_segments, connection)
df_iallocator_customer_segments \
                            .to_csv('iallocator_customer_segments_{}.csv' \
                            .format(start_date), index=False)

In [19]:
## read local extract
# Reload the start_date-stamped CSV written by the extraction cell.
segments_extract_path = 'iallocator_customer_segments_{}.csv'.format(start_date)
df_iallocator_customer_segments = pd.read_csv(segments_extract_path)

In [20]:
# Dimensions first, then a three-row preview of the customer segment extract.
print(df_iallocator_customer_segments.shape)
df_iallocator_customer_segments.iloc[:3]

(13306192, 2)


Unnamed: 0,customer_id,service_affinity
0,629906d81b57e40f50e51220,ONLY_LINK
1,62990dd392702e47b04ddbf5,ONLY_LINK
2,6299626a1b57e47fb2e539b8,ONLY_LINK


## Analysis

### Initial Funnel

In [21]:
# Column inventory of every extract that feeds the funnel joins.
for extract_frame in (df_clevertap_customer_orderactivity,
                      df_clevertap_customer_searchaddress,
                      df_fare_estimate_to_request_ride,
                      df_pricing_fare_estimates_to_request_ride,
                      df_iallocator_customer_segments):
    print(extract_frame.columns)

Index(['userId', 'appOpen_userId', 'currentCity', 'screen_version'], dtype='object')
Index(['userId', 'searchaddress_userId', 'currentCity', 'screen_version'], dtype='object')
Index(['fe_userId', 'city', 'fare_estimate_id', 'service_name',
       'service_detail_id', 'rr_customer', 'rr_fare_estimate_id',
       'rr_service_name', 'order_id'],
      dtype='object')
Index(['fe_userId', 'city', 'fare_estimate_id', 'service_name',
       'service_detail_id', 'rr_customer', 'rr_fare_estimate_id',
       'rr_service_name', 'order_id'],
      dtype='object')
Index(['customer_id', 'service_affinity'], dtype='object')


In [22]:
## orderactivity & searchaddress
# Left join: every appOpen row is kept, and searchaddress_userId is filled only
# where the same user / screen version / city also appears in searchaddress.
funnel_join_keys = ['userId', 'screen_version', 'currentCity']
df_ao_sa = df_clevertap_customer_orderactivity.merge(
    df_clevertap_customer_searchaddress,
    how='left',
    on=funnel_join_keys,
)

funnel_columns = ['currentCity', 'screen_version', 'appOpen_userId', 'searchaddress_userId']
df_ao_sa = df_ao_sa[funnel_columns]
df_ao_sa.iloc[:3]

Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId
0,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4
1,Bangalore,M0.5,6320ed1ddc7efa3b21179c4f,6320ed1ddc7efa3b21179c4f
2,Hyderabad,M0,645212d3a3c24ec3b4ecdf38,645212d3a3c24ec3b4ecdf38


In [23]:
# Distinct appOpen users and, of those, distinct searchaddress users per
# city and screen version.
(
    df_ao_sa
    .groupby(['currentCity', 'screen_version'])[['appOpen_userId', 'searchaddress_userId']]
    .nunique()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,appOpen_userId,searchaddress_userId
currentCity,screen_version,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangalore,M0,13135,10113
Bangalore,M0.5,11485,9768
Delhi,M0,15044,11999
Delhi,M0.5,13686,11866
Hyderabad,M0,32280,24071
Hyderabad,M0.5,26604,23147
Kolkata,M0,4376,3244
Kolkata,M0.5,3898,3187


In [24]:
# Column names of the pricing FE -> RR extract.
df_pricing_fare_estimates_to_request_ride.columns

Index(['fe_userId', 'city', 'fare_estimate_id', 'service_name',
       'service_detail_id', 'rr_customer', 'rr_fare_estimate_id',
       'rr_service_name', 'order_id'],
      dtype='object')

In [25]:
## appOpen -> searchaddress funnel + fare estimates / ride requests + customer segment

# Attach the pricing FE -> RR rows to the funnel. The join key is
# searchaddress_userId, so only users who reached searchaddress pick up
# fare-estimate rows; the others keep NaN in the fe_* / rr_* columns.
df_ao_to_rr = pd.merge(df_ao_sa,
                        df_pricing_fare_estimates_to_request_ride,
                        how='left',
                        left_on=['searchaddress_userId', 'currentCity'],
                        right_on=['fe_userId',  'city']
                   )

# Attach the service affinity of each appOpen user.
df_ao_to_rr = pd.merge(df_ao_to_rr,
                       df_iallocator_customer_segments,
                       how='left',
                       left_on=['appOpen_userId'],
                       right_on=['customer_id']
                      )

# Users missing from the segment extract get an explicit 'UNKNOWN' label.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: that chained form works on a temporary object and does not update
# the frame under pandas Copy-on-Write.
df_ao_to_rr['service_affinity'] = df_ao_to_rr['service_affinity'].fillna('UNKNOWN')

df_ao_to_rr = df_ao_to_rr[['currentCity', 'screen_version', 'appOpen_userId', 'searchaddress_userId',
                           'fe_userId', 'fare_estimate_id', 'service_name', 'service_detail_id',
                           'rr_customer', 'rr_fare_estimate_id', 'rr_service_name', 'order_id',
                           'service_affinity'
                          ]]
df_ao_to_rr.head(3)

Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id,service_affinity
0,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bbfab401821b8ac67ec,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK
1,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,57370b61a6855d70057417d1,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,65359bd64c28820c53ac3784,ONLY_LINK
2,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359c32fd3e49203c7fdee7,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK


In [26]:
# Distinct service-affinity labels present after the merge.
df_ao_to_rr['service_affinity'].unique()

array(['ONLY_LINK', 'UNKNOWN', 'ONLY_AUTO', 'BOTH'], dtype=object)

In [27]:
## Refined df
# First five rows of the merged funnel frame.
df_ao_to_rr.head(5)

Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id,service_affinity
0,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bbfab401821b8ac67ec,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK
1,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,57370b61a6855d70057417d1,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,65359bd64c28820c53ac3784,ONLY_LINK
2,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359c32fd3e49203c7fdee7,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK
3,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bbfab401821b8ac67ec,C2C,5e8a19c23c89412b94731fbc,,,,,ONLY_LINK
4,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359c32fd3e49203c7fdee7,Auto,5c53562fceb6fc9241980547,61716eb34c6ba16017db28e4,65359c32fd3e49203c7fdee7,Auto,65359c3adc9b7e3eae70aebf,ONLY_LINK


## Sampling bias

- The objective here is to double-check whether there is any sampling bias between the test and control groups.
- Compare customer behaviour between M0 and M0.5.
- The check can be done on the distribution of customer segments, FE, RR, net orders and other metrics.

### Procedure <br>
Take the test/control (tc) group customers and look at the desired distributions over the pre-period.

In [28]:
## Pre-period Parameter

## Month of OCT
# NOTE(review): both names are rebound at notebook scope. After this cell
# end_date equals start_date, so the sampling-bias query (which filters the
# cohort on start_date/end_date) uses a single-day cohort window, and any
# earlier cell re-run after this one will build its CSV file name from the new
# end_date. Confirm the single-day window is intended rather than keeping the
# original end_date.
start_date = start_date
end_date = start_date

## Month of SEP (2 weeks)
# Pre-period window (yyyymmdd strings) used for the fare-estimate and order
# filters of the sampling-bias query.
pre_start_date = '20230904'
pre_end_date = '20230917'

##sampling_bias

# Pre-period behaviour of the test/control cohort.
#   orderactivity_customer  - distinct (userId, screen_version) of the Android
#                             appOpen cohort between start_date and end_date
#   fare_estimates_enriched - distinct fare-estimate rows of those users between
#                             pre_start_date and pre_end_date, tagged with the
#                             cohort screen_version
#   order_logs_snapshot     - pre-period orders of those users; estimate_ids is
#                             parsed into an array and UNNESTed, giving one row
#                             per (order, estimate id)
# The final SELECT left-joins estimates to orders and to the customer segment
# snapshot; a missing segment is returned as the literal string 'NA'.
# The segment snapshot date is derived from pre_end_date (yyyymmdd -> yyyy-mm-dd)
# instead of being hard-coded, so it follows the pre-period parameters; for
# pre_end_date = '20230917' it evaluates to the previous literal '2023-09-17'.
sampling_bias = f"""

    WITH orderactivity_customer AS (
    
    SELECT 
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.userId') userId,
        JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') screen_version
    FROM 
        raw.clevertap_customer_events_master
    WHERE 
        yyyymmdd >= '{start_date}'
        AND yyyymmdd <= '{end_date}'
        AND profile_platform = 'Android'
        AND eventname = 'orderactivity'
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.order_activity_source') = 'appOpen' 
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.currentCity') IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        AND JSON_EXTRACT_SCALAR(eventProps, '$.eventProps.screen_version') IN ('M0.5', 'M0')
    GROUP BY 1,2
    ),
    
    fare_estimates_enriched AS (
    
    SELECT 
        ao.screen_version,
        fe.user_id fe_userId,
        city,
        fare_estimate_id,
        service_name,
        service_detail_id
    FROM 
        pricing.fare_estimates_enriched fe
        
    JOIN
        orderactivity_customer ao
        ON ao.userId = fe.user_id
        
    WHERE 
        yyyymmdd >= '{pre_start_date}'
        AND yyyymmdd <= '{pre_end_date}'
        AND city IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')

    GROUP BY 1,2,3,4,5,6
    ),
    
    order_logs_snapshot AS (
    
    SELECT
        city,
        service_name,
        service_detail_id,
        customer_id,
        u.id_array as fare_estimate_id,
        order_id,
        order_status,
        spd_fraud_flag
    FROM
        (
        SELECT   
            -- date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y%m%d') AS orderdate,
            city_name as city,
            service_obj_service_name as service_name,
            service_detail_id,
            customer_id,
            order_id,
            order_status,
            spd_fraud_flag,
            estimate_id,
            estimate_ids,
            cast(json_parse(estimate_ids) as array<varchar>) AS id_array
        FROM
            orders.order_logs_snapshot

        WHERE
            yyyymmdd >= '{pre_start_date}'
            AND yyyymmdd <= '{pre_end_date}'
            AND city_name IN ('Hyderabad', 'Delhi', 'Kolkata', 'Bangalore')
        ) as t
    CROSS JOIN UNNEST(t.id_array) AS u(id_array)
    
    JOIN 
        orderactivity_customer ao
        ON ao.userId = t.customer_id
    )
    
    SELECT 
        fe.*,
        rr.customer_id rr_customer,
        rr.fare_estimate_id rr_fare_estimate_id,
        rr.service_name rr_service_name,
        rr.order_id,
        rr.order_status,
        rr.spd_fraud_flag,
        COALESCE(taxi_recency_segment, 'NA') recency_segment,
        COALESCE(taxi_lifetime_stage, 'NA') lifetime_stage,
        COALESCE(taxi_service_affinity, 'NA') service_affinity
        
    FROM 
        fare_estimates_enriched fe
    
    LEFT JOIN 
        order_logs_snapshot rr
        ON fe.fare_estimate_id = rr.fare_estimate_id
        AND fe.service_detail_id = rr.service_detail_id
    
    LEFT JOIN 
        datasets.iallocator_customer_segments cs
        ON fe.fe_userId = cs.customer_id
        AND run_date = DATE_FORMAT(DATE_PARSE('{pre_end_date}', '%Y%m%d'), '%Y-%m-%d')
        AND taxi_recency_segment != 'INACTIVE'
        AND taxi_lifetime_rides > 1 
        
"""

# Run the query over the Presto connection, then persist the result as a CSV
# whose name carries both date windows (written without the frame index).
df_sampling_bias = pd.read_sql(sampling_bias, connection)
df_sampling_bias \
            .to_csv('sampling_bias_{}_{}_{}_{}.csv' \
            .format(start_date,end_date,pre_start_date,pre_end_date), index=False)

In [29]:
## read local extract
df_sampling_bias = pd.read_csv('sampling_bias_{}_{}_{}_{}.csv'.format(start_date,end_date,pre_start_date,pre_end_date))

# The sampling-bias query writes the literal string 'NA' (via COALESCE) for
# customers without a segment row, but read_csv treats 'NA' as a missing-value
# marker by default. Those cells come back as NaN, and a later groupby on a
# segment column would silently drop the customers. The three columns are never
# NULL in the query output, so every NaN here is a parsed sentinel: restore it.
segment_columns = ['recency_segment', 'lifetime_stage', 'service_affinity']
df_sampling_bias[segment_columns] = df_sampling_bias[segment_columns].fillna('NA')

In [30]:
# Dimensions first, then a three-row preview of the sampling-bias extract.
print(df_sampling_bias.shape)
df_sampling_bias.iloc[:3]

(2258649, 15)


Unnamed: 0,screen_version,fe_userId,city,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id,order_status,spd_fraud_flag,recency_segment,lifetime_stage,service_affinity
0,M0,5c30b70d4a267149c766a703,Bangalore,65013d3cc1c1287360d9a31b,Link,57370b61a6855d70057417d1,5c30b70d4a267149c766a703,65013d3cc1c1287360d9a31b,Link,65013d439be64968d5bc19d4,dropped,False,RECENT,COMMITTED,ONLY_LINK
1,M0.5,63f9c293f102ae570533b68e,Hyderabad,64f73546a533ecd462000633,Auto,5ef2bc5b85846b775f97d170,63f9c293f102ae570533b68e,64f73546a533ecd462000633,Auto,64f7358d3055877469514d57,expired,,RECENT,DETOX,ONLY_LINK
2,M0.5,63f9c293f102ae570533b68e,Hyderabad,64f73546a533ecd462000633,Auto,5ef2bc5b85846b775f97d170,63f9c293f102ae570533b68e,64f73546a533ecd462000633,Auto,64f737f0a00648479af970b6,customerCancelled,,RECENT,DETOX,ONLY_LINK


In [31]:
# Take a real copy: plain assignment would only bind a second name to the same
# DataFrame, so an in-place change to either name would show up in both and the
# "backup" would protect nothing.
sampling_bias_backup = df_sampling_bias.copy()

### Funnel metrics

In [32]:
# Funnel metrics per city x screen_version.
#
# A "net" row is one whose order_status is 'dropped' and whose spd_fraud_flag
# is not True. The mask is built once on the full frame and used to blank out
# rr_customer / order_id on every other row; nunique ignores the resulting
# NaN values, so the net_* metrics count only qualifying rows. This replaces
# per-group lambdas that indexed each group with the full-frame mask and relied
# on pandas realigning that mask to the group for every group and metric.
is_net_order = (
    (sampling_bias_backup['order_status'] == 'dropped')
    & (sampling_bias_backup['spd_fraud_flag'] != True)
)

df_analysis1 = (
    sampling_bias_backup
    .assign(net_customer=sampling_bias_backup['rr_customer'].where(is_net_order),
            net_order_id=sampling_bias_backup['order_id'].where(is_net_order))
    .groupby(['city', 'screen_version'])
    .agg(fe_customers=pd.NamedAgg('fe_userId', 'nunique'),
         rr_customers=pd.NamedAgg('rr_customer', 'nunique'),
         net_customers=pd.NamedAgg('net_customer', 'nunique'),
         fe_count=pd.NamedAgg('fare_estimate_id', 'nunique'),
         rr_count=pd.NamedAgg('order_id', 'nunique'),
         net_count=pd.NamedAgg('net_order_id', 'nunique'))
    .reset_index()
)

## Adding new columns

# Conversion rates (%) at customer level ...
df_analysis1['FE2RR-User'] = df_analysis1['rr_customers']*100.0/df_analysis1['fe_customers']
df_analysis1['G2N-User'] = df_analysis1['net_customers']*100.0/df_analysis1['rr_customers']
df_analysis1['FE2NET-User'] = df_analysis1['net_customers']*100.0/df_analysis1['fe_customers']

# ... and at fare-estimate / order level.
df_analysis1['FE2RR'] = df_analysis1['rr_count']*100.0/df_analysis1['fe_count']
df_analysis1['G2N'] = df_analysis1['net_count']*100.0/df_analysis1['rr_count']
df_analysis1['FE2NET'] = df_analysis1['net_count']*100.0/df_analysis1['fe_count']

In [33]:
# Funnel table rounded to two decimals for display.
df_analysis1.round(decimals=2)

Unnamed: 0,city,screen_version,fe_customers,rr_customers,net_customers,fe_count,rr_count,net_count,FE2RR-User,G2N-User,FE2NET-User,FE2RR,G2N,FE2NET
0,Bangalore,M0,2351,2151,1886,71063,29685,10993,91.49,87.68,80.22,41.77,37.03,15.47
1,Bangalore,M0.5,2087,1934,1724,67601,28272,10303,92.67,89.14,82.61,41.82,36.44,15.24
2,Delhi,M0,2614,2374,2241,75282,28164,17924,90.82,94.4,85.73,37.41,63.64,23.81
3,Delhi,M0.5,2653,2453,2321,79045,29577,18759,92.46,94.62,87.49,37.42,63.42,23.73
4,Hyderabad,M0,5720,5136,4574,142575,54339,24117,89.79,89.06,79.97,38.11,44.38,16.92
5,Hyderabad,M0.5,5048,4620,4145,132054,51189,22735,91.52,89.72,82.11,38.76,44.41,17.22
6,Kolkata,M0,617,518,472,11660,5153,2726,83.95,91.12,76.5,44.19,52.9,23.38
7,Kolkata,M0.5,569,485,443,11393,4940,2486,85.24,91.34,77.86,43.36,50.32,21.82


### recency_segment

In [56]:
# Funnel metrics per city x screen_version x recency_segment.
#
# A "net" row is one whose order_status is 'dropped' and whose spd_fraud_flag
# is not True. The mask is built once on the full frame and used to blank out
# rr_customer / order_id on every other row; nunique ignores the resulting
# NaN values, so the net_* metrics count only qualifying rows. This replaces
# per-group lambdas that indexed each group with the full-frame mask and relied
# on pandas realigning that mask to the group for every group and metric.
is_net_order = (
    (sampling_bias_backup['order_status'] == 'dropped')
    & (sampling_bias_backup['spd_fraud_flag'] != True)
)

df_analysis2 = (
    sampling_bias_backup
    .assign(net_customer=sampling_bias_backup['rr_customer'].where(is_net_order),
            net_order_id=sampling_bias_backup['order_id'].where(is_net_order))
    .groupby(['city', 'screen_version', 'recency_segment'])
    .agg(fe_customers=pd.NamedAgg('fe_userId', 'nunique'),
         rr_customers=pd.NamedAgg('rr_customer', 'nunique'),
         net_customers=pd.NamedAgg('net_customer', 'nunique'),
         fe_count=pd.NamedAgg('fare_estimate_id', 'nunique'),
         rr_count=pd.NamedAgg('order_id', 'nunique'),
         net_count=pd.NamedAgg('net_order_id', 'nunique'))
    .reset_index()
)

## Adding new columns
# Per-(city, screen_version) totals: the segment-level metrics summed over all
# recency segments. They are the denominators for the distribution shares.
df_analysis2a = df_analysis2 \
                    .groupby(['city', 'screen_version']) \
                    .agg(total_fe_customer = pd.NamedAgg('fe_customers' , 'sum'),
                         total_rr_customer = pd.NamedAgg('rr_customers' , 'sum'),
                         total_net_customer = pd.NamedAgg('net_customers' , 'sum'),
                         total_fe_count = pd.NamedAgg('fe_count' , 'sum'),
                         total_rr_count = pd.NamedAgg('rr_count' , 'sum'),
                         total_net_count = pd.NamedAgg('net_count' , 'sum')
                        ) \
                    .reset_index()

In [57]:
# Put the per-(city, screen_version) totals next to every segment row.
df_analysis2b = df_analysis2a.merge(
    df_analysis2,
    how='left',
    on=['city', 'screen_version'],
)

# Share (%) of each recency segment within its city x screen_version group:
# (new column, segment-level metric, group total).
distribution_specs = [
    ('fe_customers distr', 'fe_customers', 'total_fe_customer'),
    ('rr_customers distr', 'rr_customers', 'total_rr_customer'),
    ('net_customers distr', 'net_customers', 'total_net_customer'),
    ('fe distr', 'fe_count', 'total_fe_count'),
    ('rr distr', 'rr_count', 'total_rr_count'),
    ('net distr', 'net_count', 'total_net_count'),
]
for share_column, metric_column, total_column in distribution_specs:
    df_analysis2b[share_column] = df_analysis2b[metric_column]*100.0/df_analysis2b[total_column]

# Show the shares only, ordered so M0 and M0.5 sit next to each other per segment.
distribution_columns = ['city', 'screen_version', 'recency_segment',
                        'fe_customers distr', 'rr_customers distr', 'net_customers distr',
                        'fe distr', 'rr distr', 'net distr']
(
    df_analysis2b[distribution_columns]
    .sort_values(['city', 'recency_segment', 'screen_version'])
    .round(2)
    .head()
)

Unnamed: 0,city,screen_version,recency_segment,fe_customers distr,rr_customers distr,net_customers distr,fe distr,rr distr,net distr
0,Bangalore,M0,DORMANT,9.87,6.23,0.22,2.38,1.38,0.05
3,Bangalore,M0.5,DORMANT,7.18,4.3,0.18,1.4,0.82,0.03
1,Bangalore,M0,RECENT,70.03,73.87,81.69,87.69,90.41,93.62
4,Bangalore,M0.5,RECENT,73.56,77.1,83.52,89.81,91.27,93.97
2,Bangalore,M0,STATIONARY,20.1,19.9,18.09,9.94,8.21,6.33


### lifetime_stage

In [58]:
## Funnel counts per (city, screen_version, lifetime_stage).
## The net_* columns count only rows with order_status == 'dropped' and
## spd_fraud_flag != True (on a plain bool/object column a missing flag passes that test).
##
## The row mask depends only on sampling_bias_backup, not on the group, so it is
## built once here instead of being recomputed inside each lambda for every group.
net_order_mask = (
    (sampling_bias_backup['order_status'] == 'dropped')
    & (sampling_bias_backup['spd_fraud_flag'] != True)
)

df_analysis3 = (
    sampling_bias_backup
    .groupby(['city', 'screen_version', 'lifetime_stage'])
    .agg(
        fe_customers=pd.NamedAgg('fe_userId', 'nunique'),
        rr_customers=pd.NamedAgg('rr_customer', 'nunique'),
        # x is one group's column; boolean indexing aligns the full-frame mask
        # on x's index, so only this group's rows are tested.
        net_customers=('rr_customer', lambda x: x[net_order_mask].nunique()),
        fe_count=pd.NamedAgg('fare_estimate_id', 'nunique'),
        rr_count=pd.NamedAgg('order_id', 'nunique'),
        net_count=('order_id', lambda x: x[net_order_mask].nunique()),
    )
    .reset_index()
)

## Totals per (city, screen_version), summed over the lifetime_stage rows of df_analysis3.
## These are the denominators used for the "... distr" share columns.
df_analysis3a = (
    df_analysis3
    .groupby(['city', 'screen_version'])
    .agg(
        total_fe_customer=('fe_customers', 'sum'),
        total_rr_customer=('rr_customers', 'sum'),
        total_net_customer=('net_customers', 'sum'),
        total_fe_count=('fe_count', 'sum'),
        total_rr_count=('rr_count', 'sum'),
        total_net_count=('net_count', 'sum'),
    )
    .reset_index()
)

In [59]:
## Put the (city, screen_version) totals next to every lifetime_stage row
df_analysis3b = df_analysis3a.merge(
    df_analysis3,
    how='left',
    on=['city', 'screen_version'],
)

## Share (%) of each lifetime stage within its (city, screen_version) total
for share_col, part_col, total_col in [
    ('fe_customers distr', 'fe_customers', 'total_fe_customer'),
    ('rr_customers distr', 'rr_customers', 'total_rr_customer'),
    ('net_customers distr', 'net_customers', 'total_net_customer'),
    ('fe distr', 'fe_count', 'total_fe_count'),
    ('rr distr', 'rr_count', 'total_rr_count'),
    ('net distr', 'net_count', 'total_net_count'),
]:
    df_analysis3b[share_col] = df_analysis3b[part_col] * 100.0 / df_analysis3b[total_col]


(
    df_analysis3b[
        ['city', 'screen_version', 'lifetime_stage',
         'fe_customers distr', 'rr_customers distr', 'net_customers distr',
         'fe distr', 'rr distr', 'net distr']
    ]
    .sort_values(['city', 'lifetime_stage', 'screen_version'])
    .round(2)
    .head()
)

Unnamed: 0,city,screen_version,lifetime_stage,fe_customers distr,rr_customers distr,net_customers distr,fe distr,rr distr,net distr
0,Bangalore,M0,CHURN_OTB,3.06,2.77,2.88,1.16,0.82,1.09
9,Bangalore,M0.5,CHURN_OTB,3.82,3.49,3.28,1.12,0.89,0.97
1,Bangalore,M0,COMMITTED,46.32,49.15,53.94,67.78,72.61,74.09
10,Bangalore,M0.5,COMMITTED,48.24,50.81,54.31,68.67,72.59,73.63
2,Bangalore,M0,DETOX,7.58,7.83,8.31,10.68,10.05,10.04


<br>
<br>

## Sampling bias - Summary

### Pre-period funnel metric comparison
### Recency Segment
### Lifetime Stage

In [66]:
## Summary table: pre-period funnel metrics per city and screen version
summary_heading = '\n\nHome M0 vs. M0.5 test/control pre-period funnel metric comparision\nSep-04 to Sep-17'
print(summary_heading)
df_analysis1.round(decimals=2)



Home M0 vs. M0.5 test/control pre-period funnel metric comparision
Sep-04 to Sep-17


Unnamed: 0,city,screen_version,fe_customers,rr_customers,net_customers,fe_count,rr_count,net_count,FE2RR-User,G2N-User,FE2NET-User,FE2RR,G2N,FE2NET
0,Bangalore,M0,2351,2151,1886,71063,29685,10993,91.49,87.68,80.22,41.77,37.03,15.47
1,Bangalore,M0.5,2087,1934,1724,67601,28272,10303,92.67,89.14,82.61,41.82,36.44,15.24
2,Delhi,M0,2614,2374,2241,75282,28164,17924,90.82,94.4,85.73,37.41,63.64,23.81
3,Delhi,M0.5,2653,2453,2321,79045,29577,18759,92.46,94.62,87.49,37.42,63.42,23.73
4,Hyderabad,M0,5720,5136,4574,142575,54339,24117,89.79,89.06,79.97,38.11,44.38,16.92
5,Hyderabad,M0.5,5048,4620,4145,132054,51189,22735,91.52,89.72,82.11,38.76,44.41,17.22
6,Kolkata,M0,617,518,472,11660,5153,2726,83.95,91.12,76.5,44.19,52.9,23.38
7,Kolkata,M0.5,569,485,443,11393,4940,2486,85.24,91.34,77.86,43.36,50.32,21.82


#### Take 

- Order level metrics 
    
    FE2RR/FE2NET -> Similar behaviour on M0 vs M0.5 (slight difference in Kolkata)
    
    
- Customer level metrics
    
    FE2RR User/FE2NET User -> Consistent ~2% delta between M0 and M0.5

In [63]:
## Full recency_segment share table (all cities), sorted for side-by-side M0 / M0.5 reading
print('\n\nHome M0 vs. M0.5 test/control pre-period recency_segment comparision')
recency_view_cols = [
    'city', 'screen_version', 'recency_segment',
    'fe_customers distr', 'rr_customers distr', 'net_customers distr',
    'fe distr', 'rr distr', 'net distr',
]
(
    df_analysis2b[recency_view_cols]
    .sort_values(['city', 'recency_segment', 'screen_version'])
    .round(2)
)



Home M0 vs. M0.5 test/control pre-period recency_segment comparision


Unnamed: 0,city,screen_version,recency_segment,fe_customers distr,rr_customers distr,net_customers distr,fe distr,rr distr,net distr
0,Bangalore,M0,DORMANT,9.87,6.23,0.22,2.38,1.38,0.05
3,Bangalore,M0.5,DORMANT,7.18,4.3,0.18,1.4,0.82,0.03
1,Bangalore,M0,RECENT,70.03,73.87,81.69,87.69,90.41,93.62
4,Bangalore,M0.5,RECENT,73.56,77.1,83.52,89.81,91.27,93.97
2,Bangalore,M0,STATIONARY,20.1,19.9,18.09,9.94,8.21,6.33
5,Bangalore,M0.5,STATIONARY,19.26,18.6,16.3,8.79,7.91,6.0
6,Delhi,M0,DORMANT,6.28,2.66,0.59,1.24,0.53,0.21
9,Delhi,M0.5,DORMANT,5.58,2.31,0.61,1.09,0.57,0.31
7,Delhi,M0,RECENT,80.04,84.71,88.22,93.67,95.59,96.48
10,Delhi,M0.5,RECENT,80.88,85.07,88.35,93.35,95.16,96.44


#### Take 
    
    M0 DORMANT > M0.5 DORMANT 
    M0 RECENT < M0.5 RECENT (Except Kolkata)
    
    
    Bangalore -> 
        Similar NET distribution 
      
    Delhi -> 
        Similar distribution 
        
    Hyderabad -> 
        Similar distribution 
    
    Kolkata -> 
        NET distr -> M0 RECENT > M0.5 RECENT

In [62]:
## Full lifetime_stage share table (all cities), sorted for side-by-side M0 / M0.5 reading
print('\n\nHome M0 vs. M0.5 test/control pre-period lifetime_stage comparision')
lifetime_view_cols = [
    'city', 'screen_version', 'lifetime_stage',
    'fe_customers distr', 'rr_customers distr', 'net_customers distr',
    'fe distr', 'rr distr', 'net distr',
]
(
    df_analysis3b[lifetime_view_cols]
    .sort_values(['city', 'lifetime_stage', 'screen_version'])
    .round(2)
)



Home M0 vs. M0.5 test/control pre-period lifetime_stage comparision


Unnamed: 0,city,screen_version,lifetime_stage,fe_customers distr,rr_customers distr,net_customers distr,fe distr,rr distr,net distr
0,Bangalore,M0,CHURN_OTB,3.06,2.77,2.88,1.16,0.82,1.09
9,Bangalore,M0.5,CHURN_OTB,3.82,3.49,3.28,1.12,0.89,0.97
1,Bangalore,M0,COMMITTED,46.32,49.15,53.94,67.78,72.61,74.09
10,Bangalore,M0.5,COMMITTED,48.24,50.81,54.31,68.67,72.59,73.63
2,Bangalore,M0,DETOX,7.58,7.83,8.31,10.68,10.05,10.04
11,Bangalore,M0.5,DETOX,8.91,9.09,9.15,12.18,11.23,11.71
3,Bangalore,M0,DORMANT,9.87,6.23,0.22,2.38,1.38,0.05
12,Bangalore,M0.5,DORMANT,7.18,4.3,0.18,1.4,0.82,0.03
4,Bangalore,M0,HANDHOLDING,3.61,3.7,3.86,1.79,1.13,1.24
13,Bangalore,M0.5,HANDHOLDING,3.67,3.66,3.7,1.85,1.46,1.26


<br>
<br>
<br>

## Validate the Home M0.5 Funnel Data Point

## Claim

In [41]:
## Preview the first rows of the app-open -> ride-request frame
df_ao_to_rr.head(n=3)

Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id,service_affinity
0,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bbfab401821b8ac67ec,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK
1,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,57370b61a6855d70057417d1,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,65359bd64c28820c53ac3784,ONLY_LINK
2,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359c32fd3e49203c7fdee7,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK


In [67]:
## Distinct users seen at each funnel stage, per city and screen version
dfAgg_ao_to_rr = (
    df_ao_to_rr
    .groupby(['currentCity', 'screen_version'])
    [['appOpen_userId', 'searchaddress_userId', 'fe_userId', 'rr_customer']]
    .nunique()
    .reset_index()
)
dfAgg_ao_to_rr

Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,rr_customer
0,Bangalore,M0,13135,10113,9982,8356
1,Bangalore,M0.5,11485,9768,9657,8158
2,Delhi,M0,15044,11999,11822,9545
3,Delhi,M0.5,13686,11866,11732,9702
4,Hyderabad,M0,32280,24071,23714,19716
5,Hyderabad,M0.5,26604,23147,22871,19385
6,Kolkata,M0,4376,3244,3179,2271
7,Kolkata,M0.5,3898,3187,3112,2319


In [68]:
## Stage-to-stage conversion (%) computed from the distinct-user counts
for ratio_col, numer_col, denom_col in [
    ('AO2RR', 'rr_customer', 'appOpen_userId'),
    ('AO2FE', 'fe_userId', 'appOpen_userId'),
    ('AO2SA', 'searchaddress_userId', 'appOpen_userId'),
    ('SA2FE', 'fe_userId', 'searchaddress_userId'),
    ('FE2RR', 'rr_customer', 'fe_userId'),
]:
    dfAgg_ao_to_rr[ratio_col] = dfAgg_ao_to_rr[numer_col] * 100.0 / dfAgg_ao_to_rr[denom_col]

print('Unique customer \n --------------------')
dfAgg_ao_to_rr.round(2)

Unique customer 
 --------------------


Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,rr_customer,AO2RR,AO2FE,AO2SA,SA2FE,FE2RR
0,Bangalore,M0,13135,10113,9982,8356,63.62,76.0,76.99,98.7,83.71
1,Bangalore,M0.5,11485,9768,9657,8158,71.03,84.08,85.05,98.86,84.48
2,Delhi,M0,15044,11999,11822,9545,63.45,78.58,79.76,98.52,80.74
3,Delhi,M0.5,13686,11866,11732,9702,70.89,85.72,86.7,98.87,82.7
4,Hyderabad,M0,32280,24071,23714,19716,61.08,73.46,74.57,98.52,83.14
5,Hyderabad,M0.5,26604,23147,22871,19385,72.86,85.97,87.01,98.81,84.76
6,Kolkata,M0,4376,3244,3179,2271,51.9,72.65,74.13,98.0,71.44
7,Kolkata,M0.5,3898,3187,3112,2319,59.49,79.84,81.76,97.65,74.52


![Shared Image](images/reference_image_1.png)

## Hypothesis - 1

### H0 - Users are not aware of all the services available; with this redesign, discovery has improved, and this leads to an increase in AO2FE. <br><br> Check customers with affinity to only one service - has their AO2FE or FE2RR also improved?

In [44]:
## Preview the first rows again before slicing by service affinity
df_ao_to_rr.head(n=3)

Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,fare_estimate_id,service_name,service_detail_id,rr_customer,rr_fare_estimate_id,rr_service_name,order_id,service_affinity
0,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bbfab401821b8ac67ec,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK
1,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,57370b61a6855d70057417d1,61716eb34c6ba16017db28e4,65359bc73a9dc26fbbd3c1ff,Link,65359bd64c28820c53ac3784,ONLY_LINK
2,Bangalore,M0,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,61716eb34c6ba16017db28e4,65359c32fd3e49203c7fdee7,Link,57370b61a6855d70057417d1,,,,,ONLY_LINK


In [45]:
## Distinct service_affinity labels present in the frame
df_ao_to_rr['service_affinity'].unique()

array(['ONLY_LINK', 'UNKNOWN', 'ONLY_AUTO', 'BOTH'], dtype=object)

In [46]:
## Keep only rows whose service_affinity is a single service (Link-only or Auto-only)
single_service_rows = df_ao_to_rr['service_affinity'].isin(['ONLY_LINK', 'ONLY_AUTO'])
df_ao_to_rr_service_aff = df_ao_to_rr.loc[single_service_rows]
df_ao_to_rr_service_aff['service_affinity'].unique()

array(['ONLY_LINK', 'ONLY_AUTO'], dtype=object)

In [47]:
## Distinct users per funnel stage for single-service-affinity rows,
## followed by stage-to-stage conversion (%)
df_ao2rr_srvc_aff = (
    df_ao_to_rr_service_aff
    .groupby(['currentCity', 'screen_version'])
    .agg(
        appOpen_userId=('appOpen_userId', 'nunique'),
        searchaddress_userId=('searchaddress_userId', 'nunique'),
        fe_userId=('fe_userId', 'nunique'),
        rr_customer=('rr_customer', 'nunique'),
    )
    .reset_index()
    .assign(
        AO2RR=lambda d: d['rr_customer'] * 100.0 / d['appOpen_userId'],
        AO2FE=lambda d: d['fe_userId'] * 100.0 / d['appOpen_userId'],
        AO2SA=lambda d: d['searchaddress_userId'] * 100.0 / d['appOpen_userId'],
        SA2FE=lambda d: d['fe_userId'] * 100.0 / d['searchaddress_userId'],
        FE2RR=lambda d: d['rr_customer'] * 100.0 / d['fe_userId'],
    )
)


print('\n\nOnly Link & Only Auto Service Affinity customers \n --------------------')
df_ao2rr_srvc_aff.round(2)



Only Link & Only Auto Service Affinity customers 
 --------------------


Unnamed: 0,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,rr_customer,AO2RR,AO2FE,AO2SA,SA2FE,FE2RR
0,Bangalore,M0,8880,7120,7044,6098,68.67,79.32,80.18,98.93,86.57
1,Bangalore,M0.5,7715,6767,6709,5896,76.42,86.96,87.71,99.14,87.88
2,Delhi,M0,10523,8833,8746,7566,71.9,83.11,83.94,99.02,86.51
3,Delhi,M0.5,9815,8893,8827,7772,79.18,89.93,90.61,99.26,88.05
4,Hyderabad,M0,22947,17697,17466,14886,64.87,76.11,77.12,98.69,85.23
5,Hyderabad,M0.5,19011,16946,16785,14577,76.68,88.29,89.14,99.05,86.85
6,Kolkata,M0,2843,2221,2193,1742,61.27,77.14,78.12,98.74,79.43
7,Kolkata,M0.5,2577,2234,2207,1809,70.2,85.64,86.69,98.79,81.97



       Improvement in both AO2FE and FE2RR 

In [48]:
## Same funnel as the previous cell, additionally split by service_affinity
df1_ao2rr_srvc_aff = (
    df_ao_to_rr_service_aff
    .groupby(['service_affinity', 'currentCity', 'screen_version'])
    .agg(
        appOpen_userId=('appOpen_userId', 'nunique'),
        searchaddress_userId=('searchaddress_userId', 'nunique'),
        fe_userId=('fe_userId', 'nunique'),
        rr_customer=('rr_customer', 'nunique'),
    )
    .reset_index()
    .assign(
        AO2RR=lambda d: d['rr_customer'] * 100.0 / d['appOpen_userId'],
        AO2FE=lambda d: d['fe_userId'] * 100.0 / d['appOpen_userId'],
        AO2SA=lambda d: d['searchaddress_userId'] * 100.0 / d['appOpen_userId'],
        SA2FE=lambda d: d['fe_userId'] * 100.0 / d['searchaddress_userId'],
        FE2RR=lambda d: d['rr_customer'] * 100.0 / d['fe_userId'],
    )
)


print('\n\nOnly Link & Only Auto Service Affinity customers \n --------------------')
df1_ao2rr_srvc_aff.round(2)



Only Link & Only Auto Service Affinity customers 
 --------------------


Unnamed: 0,service_affinity,currentCity,screen_version,appOpen_userId,searchaddress_userId,fe_userId,rr_customer,AO2RR,AO2FE,AO2SA,SA2FE,FE2RR
0,ONLY_AUTO,Bangalore,M0,3996,3176,3135,2721,68.09,78.45,79.48,98.71,86.79
1,ONLY_AUTO,Bangalore,M0.5,3434,3010,2986,2633,76.67,86.95,87.65,99.2,88.18
2,ONLY_AUTO,Delhi,M0,1226,985,972,767,62.56,79.28,80.34,98.68,78.91
3,ONLY_AUTO,Delhi,M0.5,1100,982,971,800,72.73,88.27,89.27,98.88,82.39
4,ONLY_AUTO,Hyderabad,M0,10077,7740,7650,6547,64.97,75.92,76.81,98.84,85.58
5,ONLY_AUTO,Hyderabad,M0.5,8239,7314,7235,6260,75.98,87.81,88.77,98.92,86.52
6,ONLY_AUTO,Kolkata,M0,74,48,46,29,39.19,62.16,64.86,95.83,63.04
7,ONLY_AUTO,Kolkata,M0.5,69,52,50,28,40.58,72.46,75.36,96.15,56.0
8,ONLY_LINK,Bangalore,M0,4884,3944,3909,3377,69.14,80.04,80.75,99.11,86.39
9,ONLY_LINK,Bangalore,M0.5,4281,3757,3723,3263,76.22,86.97,87.76,99.1,87.64


<br>
<br>

## Other 
### Check service funnel baselines from the M0 version: in M0.5 we should see a better funnel only for new services

In [49]:
## appOpen & searchaddress
## App-open and search-address distinct users (not split by service)
df_hypo2a = (
    df_ao_to_rr
    .groupby(['currentCity', 'screen_version'])
    .agg(
        appOpen=('appOpen_userId', 'nunique'),
        searchaddress=('searchaddress_userId', 'nunique'),
    )
    .reset_index()
)

## Fare-estimate distinct users per service_name
df_hypo2b = (
    df_ao_to_rr
    .groupby(['currentCity', 'service_name', 'screen_version'])
    .agg(fe_customers=('fe_userId', 'nunique'))
    .reset_index()
)

## Ride-request distinct users per rr_service_name
df_hypo2c = (
    df_ao_to_rr
    .groupby(['currentCity', 'rr_service_name', 'screen_version'])
    .agg(rr_customers=('rr_customer', 'nunique'))
    .reset_index()
)

In [50]:
## City-level counts joined to the per-service FE counts
df_initial_merge = df_hypo2a.merge(
    df_hypo2b,
    how='left',
    on=['currentCity', 'screen_version'],
)

## Add per-service RR counts (FE service_name matched to rr_service_name),
## then discard every row that has a missing value in any column
df_final_merge = (
    df_initial_merge
    .merge(
        df_hypo2c,
        how='left',
        left_on=['currentCity', 'screen_version', 'service_name'],
        right_on=['currentCity', 'screen_version', 'rr_service_name'],
    )
    .dropna()
)

In [51]:
## Per-service stage-to-stage conversion (%)
for ratio_col, numer_col, denom_col in [
    ('AO2RR', 'rr_customers', 'appOpen'),
    ('AO2FE', 'fe_customers', 'appOpen'),
    ('AO2SA', 'searchaddress', 'appOpen'),
    ('SA2FE', 'fe_customers', 'searchaddress'),
    ('FE2RR', 'rr_customers', 'fe_customers'),
]:
    df_final_merge[ratio_col] = df_final_merge[numer_col] * 100.0 / df_final_merge[denom_col]

(
    df_final_merge[
        ['currentCity', 'service_name', 'screen_version',
         'appOpen', 'searchaddress', 'fe_customers', 'rr_customers',
         'AO2SA', 'AO2FE', 'AO2RR', 'SA2FE', 'FE2RR']
    ]
    .sort_values(['service_name', 'currentCity', 'screen_version'])
    .round(2)
)

Unnamed: 0,currentCity,service_name,screen_version,appOpen,searchaddress,fe_customers,rr_customers,AO2SA,AO2FE,AO2RR,SA2FE,FE2RR
0,Bangalore,Auto,M0,13135,10113,9934,5547.0,76.99,75.63,42.23,98.23,55.84
7,Bangalore,Auto,M0.5,11485,9768,9606,5472.0,85.05,83.64,47.64,98.34,56.96
14,Delhi,Auto,M0,15044,11999,11076,3308.0,79.76,73.62,21.99,92.31,29.87
20,Delhi,Auto,M0.5,13686,11866,11030,3281.0,86.7,80.59,23.97,92.95,29.75
26,Hyderabad,Auto,M0,32280,24071,23510,11616.0,74.57,72.83,35.99,97.67,49.41
32,Hyderabad,Auto,M0.5,26604,23147,22696,11351.0,87.01,85.31,42.67,98.05,50.01
2,Bangalore,AutoPremium,M0,13135,10113,3490,306.0,76.99,26.57,2.33,34.51,8.77
9,Bangalore,AutoPremium,M0.5,11485,9768,3508,312.0,85.05,30.54,2.72,35.91,8.89
27,Hyderabad,AutoPremium,M0,32280,24071,23405,1032.0,74.57,72.51,3.2,97.23,4.41
33,Hyderabad,AutoPremium,M0.5,26604,23147,22601,1015.0,87.01,84.95,3.82,97.64,4.49


Take 

AO2FE
- M0 AO2FE < M0.5 AO2FE



FE2RR
- Link FE2RR Improved across all cities  
- Approximately significant difference -> (Kolkata-C2C, Bangalore-Auto)
- No significant difference -> (Auto, AutoPremium, C2C, CabEconomy) 
- Dip in FE2RR -> (Bike Lite, Bike Metro)

### Only one service customers

In [52]:
## Keep only rows whose service_affinity is a single service (Link-only or Auto-only)
one_service_rows = df_ao_to_rr['service_affinity'].isin(['ONLY_LINK', 'ONLY_AUTO'])
df_hypo2_service_aff = df_ao_to_rr.loc[one_service_rows]
df_hypo2_service_aff['service_affinity'].unique()

array(['ONLY_LINK', 'ONLY_AUTO'], dtype=object)

In [53]:
## appOpen & searchaddress
## App-open and search-address distinct users (not split by service)
df_hypo2a_ser_aff = (
    df_hypo2_service_aff
    .groupby(['currentCity', 'screen_version'])
    .agg(
        appOpen=('appOpen_userId', 'nunique'),
        searchaddress=('searchaddress_userId', 'nunique'),
    )
    .reset_index()
)

## Fare-estimate distinct users per service_name
df_hypo2b_ser_aff = (
    df_hypo2_service_aff
    .groupby(['currentCity', 'service_name', 'screen_version'])
    .agg(fe_customers=('fe_userId', 'nunique'))
    .reset_index()
)

## Ride-request distinct users per rr_service_name
df_hypo2c_ser_aff = (
    df_hypo2_service_aff
    .groupby(['currentCity', 'rr_service_name', 'screen_version'])
    .agg(rr_customers=('rr_customer', 'nunique'))
    .reset_index()
)

In [54]:
## City-level counts joined to the per-service FE counts (single-service-affinity rows)
df_initial_merge_ser_aff = df_hypo2a_ser_aff.merge(
    df_hypo2b_ser_aff,
    how='left',
    on=['currentCity', 'screen_version'],
)

## Add per-service RR counts (FE service_name matched to rr_service_name),
## then discard every row that has a missing value in any column
df_final_merge_ser_aff = (
    df_initial_merge_ser_aff
    .merge(
        df_hypo2c_ser_aff,
        how='left',
        left_on=['currentCity', 'screen_version', 'service_name'],
        right_on=['currentCity', 'screen_version', 'rr_service_name'],
    )
    .dropna()
)

In [55]:
## Per-service stage-to-stage conversion (%) for single-service-affinity rows
for ratio_col, numer_col, denom_col in [
    ('AO2RR', 'rr_customers', 'appOpen'),
    ('AO2FE', 'fe_customers', 'appOpen'),
    ('AO2SA', 'searchaddress', 'appOpen'),
    ('SA2FE', 'fe_customers', 'searchaddress'),
    ('FE2RR', 'rr_customers', 'fe_customers'),
]:
    df_final_merge_ser_aff[ratio_col] = (
        df_final_merge_ser_aff[numer_col] * 100.0 / df_final_merge_ser_aff[denom_col]
    )

(
    df_final_merge_ser_aff[
        ['currentCity', 'service_name', 'screen_version',
         'appOpen', 'searchaddress', 'fe_customers', 'rr_customers',
         'AO2SA', 'AO2FE', 'AO2RR', 'SA2FE', 'FE2RR']
    ]
    .sort_values(['service_name', 'currentCity', 'screen_version'])
    .round(2)
)

Unnamed: 0,currentCity,service_name,screen_version,appOpen,searchaddress,fe_customers,rr_customers,AO2SA,AO2FE,AO2RR,SA2FE,FE2RR
0,Bangalore,Auto,M0,8880,7120,7007,4090.0,80.18,78.91,46.06,98.41,58.37
7,Bangalore,Auto,M0.5,7715,6767,6670,3984.0,87.71,86.45,51.64,98.57,59.73
14,Delhi,Auto,M0,10523,8833,8274,2460.0,83.94,78.63,23.38,93.67,29.73
20,Delhi,Auto,M0.5,9815,8893,8403,2428.0,90.61,85.61,24.74,94.49,28.89
26,Hyderabad,Auto,M0,22947,17697,17300,8729.0,77.12,75.39,38.04,97.76,50.46
32,Hyderabad,Auto,M0.5,19011,16946,16643,8513.0,89.14,87.54,44.78,98.21,51.15
2,Bangalore,AutoPremium,M0,8880,7120,2512,231.0,80.18,28.29,2.6,35.28,9.2
9,Bangalore,AutoPremium,M0.5,7715,6767,2493,231.0,87.71,32.31,2.99,36.84,9.27
27,Hyderabad,AutoPremium,M0,22947,17697,17223,762.0,77.12,75.06,3.32,97.32,4.42
33,Hyderabad,AutoPremium,M0.5,19011,16946,16568,781.0,89.14,87.15,4.11,97.77,4.71
