## <center> Packages & Connection </center>

In [1]:
import os
import h3 as h3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from pyhive import presto
from keplergl import KeplerGl
from datetime import datetime, timedelta

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# pandas warnings (e.g. SettingWithCopyWarning) raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Raise the pandas display limits so wide / long previews are not truncated.
for _option_name, _option_value in (('display.max_columns', 50),
                                    ('display.max_rows', 300)):
    pd.set_option(_option_name, _option_value)

## <center> Local extracted files </center>

In [3]:
# Record (and print) the directory the notebook kernel is running from.
notebook_path = os.getcwd()
print(notebook_path)

# Reference note, shown as the cell output: the directory the raw CSV extracts are read from.
'/Users/rapido/local-datasets/affluence/raw/'

/Users/rapido/code-repository/pricing/affluence


'/Users/rapido/local-datasets/affluence/raw/'

In [4]:
# Load the four locally extracted raw tables.
# The dataset root defaults to the original location on the author's machine and can be
# redirected elsewhere by setting the AFFLUENCE_DATA_DIR environment variable.
_data_root = os.environ.get('AFFLUENCE_DATA_DIR', '/Users/rapido/local-datasets/affluence')
_raw_dir = os.path.join(_data_root, 'raw')

# NOTE(review): the `10to16` suffix presumably denotes the extract's date window - confirm.
raw_order_logs_snapshot = pd.read_csv(os.path.join(_raw_dir, 'raw_order_logs_snapshot_10to16.csv'))
raw_order_logs_immutable = pd.read_csv(os.path.join(_raw_dir, 'raw_order_logs_immutable_10to16.csv'))
raw_fare_estimates_enriched = pd.read_csv(os.path.join(_raw_dir, 'raw_fare_estimates_enriched_10to16.csv'))
raw_iallocator_customer_segments = pd.read_csv(os.path.join(_raw_dir, 'raw_iallocator_customer_segments_10to16.csv'))

In [5]:
# Work on deep copies so the raw frames loaded from disk stay untouched.
df_oli_main_data = raw_order_logs_immutable.copy(deep=True)
df_ols_main_data = raw_order_logs_snapshot.copy(deep=True)
df_fee_main_data = raw_fare_estimates_enriched.copy(deep=True)
df_segment_main_data = raw_iallocator_customer_segments.copy(deep=True)

# Report (rows, columns) for each working copy, in the order: oli, ols, fee, segments.
for working_frame in (df_oli_main_data, df_ols_main_data,
                      df_fee_main_data, df_segment_main_data):
    print(working_frame.shape)

(3536034, 16)
(1149637, 12)
(15117222, 17)
(14739326, 15)


In [6]:
# Preview the first two rows of the fare-estimates working copy.
df_fee_main_data.head(2)

Unnamed: 0,yyyymmdd,city,service_name,quarter_hour,time_bucket,pickup_cluster,pickup_hex_8,drop_cluster,drop_hex_8,customer_id,fare_estimate_id,surge_strategy,surge_percentage,dynamic_surge,dynamic_fare,sub_total,discount_amount
0,20230710,Bangalore,Auto,2015,Evening_Peak,Bellandur,8861892569fffff,Varthur,88618920ebfffff,5e591dcc1275a31e1e5833c9,64ac1af42a70c7ac0b2b56f8,surge_dashboard_rule,10.204082,0.0,10.0,108.0,0.0
1,20230710,Bangalore,Link,1800,Evening_Peak,Basavanagudi,8861892597fffff,Cambridge Layout Ulsoor,8861892ed1fffff,62c843ef0eef7f6e35f11140,64abfc344f76e21a1142b7dc,rain,39.517694,42.77,0.0,151.0,10.0


In [7]:
# Preview the first two rows of the order-logs snapshot working copy.
df_ols_main_data.head(2)

Unnamed: 0,yyyymmdd,city,service_name,pickup_cluster,pickup_hex_8,drop_cluster,drop_hex_8,estimate_id,order_id,geo_distance,discount,sub_total
0,20230713,Bangalore,Link,Ramaiah College,8860145941fffff,Metro - Sandal Soap Factory,8860145b37fffff,64aff1369c5b35742a429410,64afee9fb82ebd52fc9898da,0.086537,14.0,67.0
1,20230713,Bangalore,Auto,RTO HSR Sec 1,8861892553fffff,Kudlu Gate,886189246bfffff,64aff32f918567f17f3cfa3c,64aff3345332de3f2f5b9e0f,0.013049,0.0,79.0


In [8]:
# Preview the first two rows of the immutable order-logs working copy.
df_oli_main_data.head(2)

Unnamed: 0,yyyymmdd,city,service_name,pickup_cluster,pickup_hex_8,drop_cluster,drop_hex_8,estimate_id,gross_orders,requested_orders,cobrm,expiry_mapped,cobra,accepted_orders,ocara,net_orders
0,20230710,Bangalore,Link,HSR Sec 2,886189242dfffff,BTM,88618925ddfffff,64ac0b367912742548bb4557,1,1,0,1,0,0,0,0
1,20230710,Bangalore,Auto,Electronic City,8861892663fffff,Electronic City Phase 2,8861892665fffff,64ac094676627a108e685bd2,1,1,0,0,1,0,0,0


In [9]:
# Preview the first two rows of the customer-segments working copy.
df_segment_main_data.head(2)

Unnamed: 0,yyyymmdd,customer_id,taxi_high_income,taxi_medium_income,taxi_low_income,link_only_service,auto_only_service,both_service,link_ps,link_nps,auto_ps,auto_nps,fe_intent_stable,fe_intent_increasing,fe_intent_declining
0,20230710,5737c6baddbec2203f7331d9,,,,,,5737c6baddbec2203f7331d9,,5737c6baddbec2203f7331d9,5737c6baddbec2203f7331d9,,5737c6baddbec2203f7331d9,,
1,20230710,5737c6c1ddbec2203f73321e,,5737c6c1ddbec2203f73321e,,5737c6c1ddbec2203f73321e,,,,,,,,,5737c6c1ddbec2203f73321e


In [10]:
## fee & oli
# Keep only the 'Link' service on both sides, then left-join so that every fare estimate
# survives; the order-log columns are NaN where no order-log row matches.
fee_link_rows = df_fee_main_data[df_fee_main_data['service_name'].isin(['Link'])]
oli_link_rows = df_oli_main_data[df_oli_main_data['service_name'].isin(['Link'])]

df_fe_rr_data = fee_link_rows.merge(
    oli_link_rows,
    how='left',
    left_on=['yyyymmdd', 'city', 'service_name', 'fare_estimate_id'],
    right_on=['yyyymmdd', 'city', 'service_name', 'estimate_id'],
)

In [11]:
# Spot-check the join result for two specific fare_estimate_id values.
df_fe_rr_data[df_fe_rr_data['fare_estimate_id'].isin(['64ace765126ec564e4a789ef','64b201d727b0e89d3cda1123'])]

Unnamed: 0,yyyymmdd,city,service_name,quarter_hour,time_bucket,pickup_cluster_x,pickup_hex_8_x,drop_cluster_x,drop_hex_8_x,customer_id,fare_estimate_id,surge_strategy,surge_percentage,dynamic_surge,dynamic_fare,sub_total,discount_amount,pickup_cluster_y,pickup_hex_8_y,drop_cluster_y,drop_hex_8_y,estimate_id,gross_orders,requested_orders,cobrm,expiry_mapped,cobra,accepted_orders,ocara,net_orders
1999736,20230711,Bangalore,Link,1045,Morning_Peak,Columbia Asia Whitefield,88618920e7fffff,Whitefield,88618921d3fffff,5fb36b6d7a3a3547d26d5744,64ace765126ec564e4a789ef,mismatch_gradient,21.95122,9.0,0.0,50.0,0.0,Columbia Asia Whitefield,88618920e7fffff,Whitefield,88618921d3fffff,64ace765126ec564e4a789ef,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
6074461,20230715,Bangalore,Link,745,Rest_Morning,Varthur,88618920c1fffff,Kadubeesanahalli,8861892095fffff,625125b349bc1231f17f0c33,64b201d727b0e89d3cda1123,hot_cold_surge,36.843671,24.77,0.0,92.0,0.0,Varthur,88618920c1fffff,Kadubeesanahalli,8861892095fffff,64b201d727b0e89d3cda1123,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [12]:
# List the merged columns; non-key names present on both sides carry _x / _y suffixes.
df_fe_rr_data.columns

Index(['yyyymmdd', 'city', 'service_name', 'quarter_hour', 'time_bucket',
       'pickup_cluster_x', 'pickup_hex_8_x', 'drop_cluster_x', 'drop_hex_8_x',
       'customer_id', 'fare_estimate_id', 'surge_strategy', 'surge_percentage',
       'dynamic_surge', 'dynamic_fare', 'sub_total', 'discount_amount',
       'pickup_cluster_y', 'pickup_hex_8_y', 'drop_cluster_y', 'drop_hex_8_y',
       'estimate_id', 'gross_orders', 'requested_orders', 'cobrm',
       'expiry_mapped', 'cobra', 'accepted_orders', 'ocara', 'net_orders'],
      dtype='object')

In [13]:
# Give the fare-estimate side (_x) location columns their plain names back and
# mark the fare-estimate discount with an _fe suffix.
fe_rr_column_names = {
    'pickup_cluster_x': 'pickup_cluster',
    'pickup_hex_8_x': 'pickup_hex_8',
    'drop_cluster_x': 'drop_cluster',
    'drop_hex_8_x': 'drop_hex_8',
    'discount_amount': 'discount_amount_fe',
}
df_fe_rr_data = df_fe_rr_data.rename(columns=fe_rr_column_names)

In [14]:
# Keep the fare-estimate attributes plus the order-log funnel counters; this drops the
# order-log (_y) copies of the location columns.
fe_rr_keep_columns = [
    # fare-estimate attributes
    'yyyymmdd', 'city', 'service_name', 'quarter_hour', 'time_bucket',
    'pickup_cluster', 'pickup_hex_8', 'drop_cluster', 'drop_hex_8',
    'customer_id', 'fare_estimate_id', 'surge_strategy', 'surge_percentage',
    'dynamic_surge', 'dynamic_fare', 'sub_total', 'discount_amount_fe',
    # order-log columns
    'estimate_id', 'gross_orders', 'requested_orders', 'cobrm',
    'expiry_mapped', 'cobra', 'accepted_orders', 'ocara', 'net_orders',
]
df_fe_rr_data = df_fe_rr_data[fe_rr_keep_columns]

In [15]:
# Preview the trimmed fare-estimate / order-log frame.
df_fe_rr_data.head(2)

Unnamed: 0,yyyymmdd,city,service_name,quarter_hour,time_bucket,pickup_cluster,pickup_hex_8,drop_cluster,drop_hex_8,customer_id,fare_estimate_id,surge_strategy,surge_percentage,dynamic_surge,dynamic_fare,sub_total,discount_amount_fe,estimate_id,gross_orders,requested_orders,cobrm,expiry_mapped,cobra,accepted_orders,ocara,net_orders
0,20230710,Bangalore,Link,1800,Evening_Peak,Basavanagudi,8861892597fffff,Cambridge Layout Ulsoor,8861892ed1fffff,62c843ef0eef7f6e35f11140,64abfc344f76e21a1142b7dc,rain,39.517694,42.77,0.0,151.0,10.0,,,,,,,,,
1,20230710,Bangalore,Link,1015,Morning_Peak,Basaveshwar Nagar,886014586bfffff,Yeshwanthpur,886014595bfffff,6399734853245a499edb0326,64ab8e968d1303f14c3e0cf9,no_surge,0.0,0.0,0.0,47.0,0.0,,,,,,,,,


In [16]:
## fe_rr & ols
# Left-join the 'Link' rows of the order-logs snapshot onto the fare-estimate frame,
# matching the snapshot's estimate_id against fare_estimate_id.
# NOTE(review): if the snapshot can hold more than one row per estimate_id, this join
# repeats the fare-estimate row and would inflate later sums - confirm key uniqueness.
ols_link_rows = df_ols_main_data[df_ols_main_data['service_name'].isin(['Link'])]

df_fe_net_data = df_fe_rr_data.merge(
    ols_link_rows,
    how='left',
    left_on=['yyyymmdd', 'city', 'service_name', 'fare_estimate_id'],
    right_on=['yyyymmdd', 'city', 'service_name', 'estimate_id'],
)

In [17]:
# List the columns after the second join to see which names received _x / _y suffixes.
df_fe_net_data.columns

Index(['yyyymmdd', 'city', 'service_name', 'quarter_hour', 'time_bucket',
       'pickup_cluster_x', 'pickup_hex_8_x', 'drop_cluster_x', 'drop_hex_8_x',
       'customer_id', 'fare_estimate_id', 'surge_strategy', 'surge_percentage',
       'dynamic_surge', 'dynamic_fare', 'sub_total_x', 'discount_amount_fe',
       'estimate_id_x', 'gross_orders', 'requested_orders', 'cobrm',
       'expiry_mapped', 'cobra', 'accepted_orders', 'ocara', 'net_orders',
       'pickup_cluster_y', 'pickup_hex_8_y', 'drop_cluster_y', 'drop_hex_8_y',
       'estimate_id_y', 'order_id', 'geo_distance', 'discount', 'sub_total_y'],
      dtype='object')

In [18]:
# Normalise the column names produced by the second join:
#   * _x columns come from the fare-estimate side and get their plain names back;
#   * sub_total / discount are split into a fare-estimate (_fe) and an order (_gross) variant.
# The former 'discount_amount' -> 'discount_amount_fe' entry was dropped: that column has
# already been renamed before this join, so the mapping never matched anything.
fe_net_column_names = {
    'pickup_cluster_x': 'pickup_cluster',
    'pickup_hex_8_x': 'pickup_hex_8',
    'drop_cluster_x': 'drop_cluster',
    'drop_hex_8_x': 'drop_hex_8',
    'estimate_id_x': 'estimate_id',
    'sub_total_x': 'sub_total_fe',
    'sub_total_y': 'sub_total_gross',
    'discount': 'discount_gross',
}
# Rebind instead of mutating in place so the cell reads as a plain transformation.
df_fe_net_data = df_fe_net_data.rename(columns=fe_net_column_names)

In [19]:
# New columns
# Surge strategies whose fare estimates are counted as "surged".
surged_strategies = ['rain', 'mismatch_generic', 'hot_cold_surge',
                     'mismatch_gradient', 'circuit_broken',
                     'surge_dashboard_rule']
is_surged = df_fe_net_data['surge_strategy'].isin(surged_strategies)

# Carry the fare_estimate_id on surged rows only (None elsewhere), so that a later
# nunique over this column counts surged fare estimates.
df_fe_net_data['surge_strategy_applied'] = np.where(is_surged,
                                                    df_fe_net_data['fare_estimate_id'],
                                                    None)

# Plain copy of geo_distance, so two different aggregations can be applied to the same values.
df_fe_net_data['geo_distance_1'] = df_fe_net_data['geo_distance']

In [20]:
# Collapse the joined rows to one row per (pickup_cluster, pickup_hex_8).
# Named aggregation yields the final column names directly, which removes the follow-up
# in-place rename, the dead mid-cell head(5) call and the need for a duplicated
# geo_distance column just to compute both its mean and its max.
df_refined_data = (
    df_fe_net_data
    .groupby(['pickup_cluster', 'pickup_hex_8'])
    .agg(
        fe_count=('fare_estimate_id', 'nunique'),          # distinct fare estimates
        gross_orders=('gross_orders', 'sum'),
        net_orders=('order_id', 'nunique'),                # distinct snapshot order ids
        surged_fe=('surge_strategy_applied', 'nunique'),   # distinct surged fare estimates
        surge_percentage=('surge_percentage', 'mean'),
        discount_amount_fe=('discount_amount_fe', 'sum'),
        discount_gross=('discount_gross', 'sum'),
        sub_total_fe=('sub_total_fe', 'sum'),
        fe_customer_count=('customer_id', 'nunique'),      # distinct customers per hex
        mean_geo_distance=('geo_distance', 'mean'),
        max_geo_distance=('geo_distance', 'max'),
    )
    .reset_index()
)

In [21]:
# Preview the per-hex aggregates.
df_refined_data.head(5)

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance
0,AECS Layout Brookefield,88618920a3fffff,40894,9296.0,4076,17523,12.15035,84670.0,27540.0,4382022.0,10354,0.033276,8.421555
1,Adugodi,8861892581fffff,16022,2800.0,1016,4570,6.840293,35883.0,7374.0,1498201.0,3934,0.038787,3.909074
2,Adugodi,886189258bfffff,19192,3468.0,1049,6057,8.488439,46275.0,6850.0,1840822.0,4230,0.050087,1.307781
3,Adugodi Traffic Station,886189258dfffff,25303,4315.0,1498,4062,3.597522,52512.0,3191.0,2135209.0,5771,0.035588,0.886427
4,Agara Lake,8861892425fffff,18454,3483.0,1375,6368,8.956572,38155.0,11506.0,1724176.0,4783,0.028703,1.060272


In [22]:
# new columns
# Per-hex funnel conversion rates, expressed as percentages rounded to two decimals:
#   fe2rr  = gross orders / fare estimates
#   fe2net = net orders   / fare estimates
#   g2n    = net orders   / gross orders
fe_total = df_refined_data['fe_count']
gross_total = df_refined_data['gross_orders']
net_total = df_refined_data['net_orders']

df_refined_data['fe2rr'] = (gross_total * 100.00 / fe_total).round(2)
df_refined_data['fe2net'] = (net_total * 100.00 / fe_total).round(2)
df_refined_data['g2n'] = (net_total * 100.00 / gross_total).round(2)

In [23]:
# Preview the per-hex aggregates together with the new ratio columns.
df_refined_data.head(5)

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance,fe2rr,fe2net,g2n
0,AECS Layout Brookefield,88618920a3fffff,40894,9296.0,4076,17523,12.15035,84670.0,27540.0,4382022.0,10354,0.033276,8.421555,22.73,9.97,43.85
1,Adugodi,8861892581fffff,16022,2800.0,1016,4570,6.840293,35883.0,7374.0,1498201.0,3934,0.038787,3.909074,17.48,6.34,36.29
2,Adugodi,886189258bfffff,19192,3468.0,1049,6057,8.488439,46275.0,6850.0,1840822.0,4230,0.050087,1.307781,18.07,5.47,30.25
3,Adugodi Traffic Station,886189258dfffff,25303,4315.0,1498,4062,3.597522,52512.0,3191.0,2135209.0,5771,0.035588,0.886427,17.05,5.92,34.72
4,Agara Lake,8861892425fffff,18454,3483.0,1375,6368,8.956572,38155.0,11506.0,1724176.0,4783,0.028703,1.060272,18.87,7.45,39.48


In [24]:
## low and high aff tag
# Hex-level lookup of income_signal / density_signal, read relative to the working directory.
raw_list_hex_8 = pd.read_csv('list_hex_8.csv')
df_list_hex_8 = raw_list_hex_8.copy()  # deep copy is the default
print(df_list_hex_8.shape)
df_list_hex_8.head(2)

(2684, 3)


Unnamed: 0,pickup_hex_8,income_signal,density_signal
0,88618920a3fffff,High Income,High FE
1,8861892581fffff,High Income,High FE


In [25]:
# Attach the income / density tags to every aggregated hex (left join keeps untagged hexes),
# then keep only hexes that recorded at least one gross order.
tagged_hexes = df_refined_data.merge(df_list_hex_8, how='left', on='pickup_hex_8')
has_gross_orders = tagged_hexes['gross_orders'] > 0
df_refined_data_with_tag = tagged_hexes[has_gross_orders]

In [26]:
# Hex count and funnel volumes for every (income_signal, density_signal) combination.
tag_level_aggregations = {
    'pickup_hex_8': 'count',
    'fe_count': 'sum',
    'gross_orders': 'sum',
    'net_orders': 'sum',
    'surged_fe': 'sum',
}
df_refined_data_with_tag.groupby(['income_signal', 'density_signal']).agg(tag_level_aggregations)

Unnamed: 0_level_0,Unnamed: 1_level_0,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe
income_signal,density_signal,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
High Income,High FE,490,6320789,1162054.0,456850,2744102
High Income,Low FE,521,473153,89756.0,35846,171127
Low Income,High FE,45,382073,64620.0,25012,156438
Low Income,Low FE,945,339698,61600.0,22672,124795


In [27]:
# Split the 'High FE' hexes by income signal: v1 = High Income, v2 = Low Income.
is_high_density = df_refined_data_with_tag['density_signal'].isin(['High FE'])
is_high_income = df_refined_data_with_tag['income_signal'].isin(['High Income'])
is_low_income = df_refined_data_with_tag['income_signal'].isin(['Low Income'])

df_refined_data_with_tag_v1 = df_refined_data_with_tag[is_high_density & is_high_income]
df_refined_data_with_tag_v2 = df_refined_data_with_tag[is_high_density & is_low_income]

## Low affluence: 45 hexes

In [28]:
# Rank the Low Income / High FE hexes by fe2net (best first) and alternate the assignment
# down the ranking: even positions -> 'test', odd positions -> 'control'.
df_low_income_group = df_refined_data_with_tag_v2.sort_values(['fe2net'], ascending=False)
rank_position = np.arange(len(df_low_income_group))
df_low_income_group['group_tc'] = np.where(rank_position % 2 == 0, 'test', 'control')
df_low_income_group.head(5)

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance,fe2rr,fe2net,g2n,income_signal,density_signal,group_tc
2683,Yeshwanthpur - Mathikere,886014594bfffff,11222,2501.0,1228,2145,3.664665,25445.0,12323.0,1297564.0,3833,0.055065,2.787774,22.29,10.94,49.1,Low Income,High FE,test
579,Basaveshwar Nagar,8860145863fffff,3781,862.0,393,1343,7.768575,7498.0,4107.0,398086.0,1192,0.043937,2.414475,22.8,10.39,45.59,Low Income,High FE,control
2498,Thanisandra,8861892ca3fffff,4282,967.0,441,1756,8.710967,9130.0,4627.0,531822.0,1178,0.038976,0.687053,22.58,10.3,45.6,Low Income,High FE,test
2031,Peenya,8860145829fffff,3246,729.0,330,1041,6.384303,6361.0,3312.0,353466.0,1120,0.039365,0.707666,22.46,10.17,45.27,Low Income,High FE,control
1792,Mysore rd,8860145a05fffff,3992,897.0,395,768,4.1657,8352.0,4409.0,394007.0,1268,0.02921,0.34459,22.47,9.89,44.04,Low Income,High FE,test


In [29]:
# Compare test vs control for the low-income hexes: hex count, volumes, and funnel ratios
# recomputed from the summed volumes (not averaged from the per-hex ratios).
low_income_totals = df_low_income_group.groupby(['group_tc']).agg({
    'pickup_hex_8': 'count',
    'net_orders': 'sum',
    'fe_count': 'sum',
    'gross_orders': 'sum',
})
df_low_income_group_summary = low_income_totals.assign(
    g2n=low_income_totals['net_orders'] * 100.0 / low_income_totals['gross_orders'],
    fe2rr=low_income_totals['gross_orders'] * 100.0 / low_income_totals['fe_count'],
    fe2net=low_income_totals['net_orders'] * 100.0 / low_income_totals['fe_count'],
)
df_low_income_group_summary

Unnamed: 0_level_0,pickup_hex_8,net_orders,fe_count,gross_orders,g2n,fe2rr,fe2net
group_tc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
control,22,12546,188181,32635.0,38.443389,17.342346,6.666986
test,23,12466,193892,31985.0,38.974519,16.496297,6.429352


In [30]:
# Display the low-income hexes with their test/control assignment.
df_low_income_group

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance,fe2rr,fe2net,g2n,income_signal,density_signal,group_tc
2683,Yeshwanthpur - Mathikere,886014594bfffff,11222,2501.0,1228,2145,3.664665,25445.0,12323.0,1297564.0,3833,0.055065,2.787774,22.29,10.94,49.1,Low Income,High FE,test
579,Basaveshwar Nagar,8860145863fffff,3781,862.0,393,1343,7.768575,7498.0,4107.0,398086.0,1192,0.043937,2.414475,22.8,10.39,45.59,Low Income,High FE,control
2498,Thanisandra,8861892ca3fffff,4282,967.0,441,1756,8.710967,9130.0,4627.0,531822.0,1178,0.038976,0.687053,22.58,10.3,45.6,Low Income,High FE,test
2031,Peenya,8860145829fffff,3246,729.0,330,1041,6.384303,6361.0,3312.0,353466.0,1120,0.039365,0.707666,22.46,10.17,45.27,Low Income,High FE,control
1792,Mysore rd,8860145a05fffff,3992,897.0,395,768,4.1657,8352.0,4409.0,394007.0,1268,0.02921,0.34459,22.47,9.89,44.04,Low Income,High FE,test
1478,Jakkur,8860169669fffff,3937,846.0,366,1542,8.272049,8802.0,614.0,474878.0,1514,0.060707,1.578752,21.49,9.3,43.26,Low Income,High FE,control
1713,Mathikere Lake,8860145943fffff,4090,837.0,379,805,3.114422,9994.0,3813.0,456494.0,1321,0.054119,5.816615,20.46,9.27,45.28,Low Income,High FE,test
888,Chikkagobbi,8861892dd5fffff,3257,603.0,297,1223,8.320583,8356.0,3626.0,482374.0,1164,0.061894,3.836679,18.51,9.12,49.25,Low Income,High FE,control
2603,Vijayanagar,886014584dfffff,3675,698.0,331,1192,7.806637,9278.0,3706.0,426919.0,1286,0.03234,0.667984,18.99,9.01,47.42,Low Income,High FE,test
2020,Parappana Agrahara,886189244dfffff,10326,2072.0,911,3736,7.253473,22059.0,9339.0,1121522.0,3040,0.042807,3.189051,20.07,8.82,43.97,Low Income,High FE,control


## High affluence: 25 hexes

In [31]:
# Preview the High Income / High FE hexes before sampling.
df_refined_data_with_tag_v1.head(5)

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance,fe2rr,fe2net,g2n,income_signal,density_signal
0,AECS Layout Brookefield,88618920a3fffff,40894,9296.0,4076,17523,12.15035,84670.0,27540.0,4382022.0,10354,0.033276,8.421555,22.73,9.97,43.85,High Income,High FE
1,Adugodi,8861892581fffff,16022,2800.0,1016,4570,6.840293,35883.0,7374.0,1498201.0,3934,0.038787,3.909074,17.48,6.34,36.29,High Income,High FE
2,Adugodi,886189258bfffff,19192,3468.0,1049,6057,8.488439,46275.0,6850.0,1840822.0,4230,0.050087,1.307781,18.07,5.47,30.25,High Income,High FE
3,Adugodi Traffic Station,886189258dfffff,25303,4315.0,1498,4062,3.597522,52512.0,3191.0,2135209.0,5771,0.035588,0.886427,17.05,5.92,34.72,High Income,High FE
4,Agara Lake,8861892425fffff,18454,3483.0,1375,6368,8.956572,38155.0,11506.0,1724176.0,4783,0.028703,1.060272,18.87,7.45,39.48,High Income,High FE


In [32]:
# Systematic sample of the High Income / High FE hexes, followed by a test/control split.
#
# The original three passes kept every 10th hex of the fe2net ranking (level 1), then every
# 2nd of those (level 2), then alternated test/control (level 3). Keeping every 10th and then
# every 2nd row is the same as keeping every 20th row, so the two sampling levels are
# expressed as a single positional stride. Taking an explicit copy also avoids assigning
# columns onto a filtered slice, which raised a (globally silenced) SettingWithCopyWarning.
LEVEL_1_STEP = 10  # level 1: keep every 10th hex of the ranking
LEVEL_2_STEP = 2   # level 2: keep every 2nd hex of the level-1 sample

high_income_ranked = df_refined_data_with_tag_v1.sort_values(['fe2net'], ascending=False)
df_high_income_group = high_income_ranked.iloc[::LEVEL_1_STEP * LEVEL_2_STEP].copy()

# level 3: alternate down the sampled ranking - even positions 'test', odd positions 'control'
sample_position = np.arange(len(df_high_income_group))
df_high_income_group['group_tc'] = np.where(sample_position % 2 == 0, 'test', 'control')
df_high_income_group.head(5)

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance,fe2rr,fe2net,g2n,income_signal,density_signal,group_tc
2616,Whitefield,88618921d3fffff,10108,2598.0,1448,3837,8.186765,21671.0,14457.0,1090114.0,2462,0.039674,4.096818,25.7,14.33,55.74,High Income,High FE,test
828,Chanasandra,88618921c7fffff,4915,1125.0,514,1678,7.238399,9962.0,5670.0,624205.0,1399,0.04087,1.086779,22.89,10.46,45.69,High Income,High FE,control
1049,Electronic City Phase 2,8861892665fffff,5644,1126.0,552,2558,8.943856,13872.0,6175.0,770193.0,1516,0.033054,1.625998,19.95,9.78,49.02,High Income,High FE,test
1795,Mysore rd,8860145a33fffff,3301,663.0,310,780,4.632172,7068.0,3498.0,361820.0,1213,0.042547,0.983851,20.08,9.39,46.76,High Income,High FE,control
1711,Mathikere,8861892c97fffff,4597,930.0,414,1811,9.419203,10498.0,4371.0,540113.0,1407,0.06862,13.226011,20.23,9.01,44.52,High Income,High FE,test


In [33]:
# Compare test vs control for the sampled high-income hexes: hex count, volumes, and funnel
# ratios recomputed from the summed volumes (not averaged from the per-hex ratios).
high_income_totals = df_high_income_group.groupby(['group_tc']).agg({
    'pickup_hex_8': 'count',
    'net_orders': 'sum',
    'fe_count': 'sum',
    'gross_orders': 'sum',
})
df_high_income_group_summary = high_income_totals.assign(
    g2n=high_income_totals['net_orders'] * 100.0 / high_income_totals['gross_orders'],
    fe2rr=high_income_totals['gross_orders'] * 100.0 / high_income_totals['fe_count'],
    fe2net=high_income_totals['net_orders'] * 100.0 / high_income_totals['fe_count'],
)
df_high_income_group_summary

Unnamed: 0_level_0,pickup_hex_8,net_orders,fe_count,gross_orders,g2n,fe2rr,fe2net
group_tc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
control,12,12005,167314,29967.0,40.060733,17.910635,7.175132
test,13,11272,159772,29755.0,37.882709,18.623413,7.055053


In [34]:
# Re-display the low-income summary for comparison with the high-income summary.
df_low_income_group_summary

Unnamed: 0_level_0,pickup_hex_8,net_orders,fe_count,gross_orders,g2n,fe2rr,fe2net
group_tc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
control,22,12546,188181,32635.0,38.443389,17.342346,6.666986
test,23,12466,193892,31985.0,38.974519,16.496297,6.429352


In [35]:
# Sanity check: (rows, columns) of the high-income and low-income groups, in that order.
for experiment_group in (df_high_income_group, df_low_income_group):
    print(experiment_group.shape)

(25, 19)
(45, 19)


In [36]:
## Final group

In [37]:
# Stack the two experiment groups row-wise (high-income rows first); original index labels are kept.
frames = [df_high_income_group, df_low_income_group]
final_grouper = pd.concat(frames, axis=0)
final_grouper.head(5)

Unnamed: 0,pickup_cluster,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,surge_percentage,discount_amount_fe,discount_gross,sub_total_fe,fe_customer_count,mean_geo_distance,max_geo_distance,fe2rr,fe2net,g2n,income_signal,density_signal,group_tc
2616,Whitefield,88618921d3fffff,10108,2598.0,1448,3837,8.186765,21671.0,14457.0,1090114.0,2462,0.039674,4.096818,25.7,14.33,55.74,High Income,High FE,test
828,Chanasandra,88618921c7fffff,4915,1125.0,514,1678,7.238399,9962.0,5670.0,624205.0,1399,0.04087,1.086779,22.89,10.46,45.69,High Income,High FE,control
1049,Electronic City Phase 2,8861892665fffff,5644,1126.0,552,2558,8.943856,13872.0,6175.0,770193.0,1516,0.033054,1.625998,19.95,9.78,49.02,High Income,High FE,test
1795,Mysore rd,8860145a33fffff,3301,663.0,310,780,4.632172,7068.0,3498.0,361820.0,1213,0.042547,0.983851,20.08,9.39,46.76,High Income,High FE,control
1711,Mathikere,8861892c97fffff,4597,930.0,414,1811,9.419203,10498.0,4371.0,540113.0,1407,0.06862,13.226011,20.23,9.01,44.52,High Income,High FE,test


In [38]:
# Derive the affluence label from income_signal ('High Income' -> 'High Affluence',
# anything else -> 'Low Affluence') and drop the density tag.
is_high_income_hex = final_grouper['income_signal'].isin(['High Income'])
final_grouper['affluence_tag'] = np.where(is_high_income_hex, 'High Affluence', 'Low Affluence')
final_grouper = final_grouper.drop(columns=['density_signal'])

In [39]:
# Balance check across affluence tag x test/control: hex count, volumes, and funnel ratios
# recomputed from the summed volumes.
# fe_customer_count is a sum of per-hex distinct-customer counts, so a customer seen in
# several hexes is counted once per hex.
final_totals = final_grouper.groupby(['affluence_tag', 'group_tc']).agg({
    'pickup_hex_8': 'count',
    'fe_count': 'sum',
    'gross_orders': 'sum',
    'net_orders': 'sum',
    'surged_fe': 'sum',
    'fe_customer_count': 'sum',
})
final_grouper_summary = final_totals.assign(
    fe2rr=final_totals['gross_orders'] * 100.0 / final_totals['fe_count'],
    g2n=final_totals['net_orders'] * 100.0 / final_totals['gross_orders'],
    fe2net=final_totals['net_orders'] * 100.0 / final_totals['fe_count'],
)
final_grouper_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,pickup_hex_8,fe_count,gross_orders,net_orders,surged_fe,fe_customer_count,fe2rr,g2n,fe2net
affluence_tag,group_tc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
High Affluence,control,12,167314,29967.0,12005,73982,49060,17.910635,40.060733,7.175132
High Affluence,test,13,159772,29755.0,11272,68565,40879,18.623413,37.882709,7.055053
Low Affluence,control,22,188181,32635.0,12546,75303,58673,17.342346,38.443389,6.666986
Low Affluence,test,23,193892,31985.0,12466,81135,64661,16.496297,38.974519,6.429352


In [43]:
# Persist the final hex list with its test/control assignment (index labels are not written).
# The dataset root defaults to the original location on the author's machine and can be
# redirected elsewhere by setting the AFFLUENCE_DATA_DIR environment variable.
_data_root = os.environ.get('AFFLUENCE_DATA_DIR', '/Users/rapido/local-datasets/affluence')
_export_path = os.path.join(_data_root, 'experiment',
                            'exp_affluence_bangalore_link_circuit_break_hex_8_list_v1.csv')
final_grouper.to_csv(_export_path, index=False)