# Objective

To create the segments based on demand conversion(FE2RR) and fulfilment (G2N) as 4 quardrants:
1. Low-High
2. High-High
3. High-Low
4. Low-L

# Import Packages

In [64]:
import pandas as pd
import glob
import numpy as np

import math
from datetime import datetime

from pyhive import presto

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go

import statsmodels.api as sm

import statsmodels.formula.api as smf

# Configs

In [2]:
city = 'Chennai'
service = 'Link'
start_date = '20230710'
end_date = '20230723'
PATH_TO_ROOT_DIR = '/Users/pallavichandra/commit_repo/lchf_super_clusters/dse-demand-analysis/'

# Connection 

In [3]:
# conn = presto.connect(host='prime-trino.serving.data.production.internal',
#                       port=80,
#                       username='pallavi.chandra@rapido.bike')

# conn = presto.connect(host='presto.yoda.run',
#                       port=8080,
#                       username='pallavi.chandra@rapido.bike')


conn = presto.connect(
    host='presto-gateway.serving.data.plectrum.dev',
    port='443',
    protocol='https',
    catalog='hive',
    username='pallavi.chandra@rapido.bike'
)

# SQL Queries

In [4]:
fe_query = '''with cluster_hex_map as (  
    select
        cluster,
        hex_id,
        executiondate
    from 
            datasets.city_cluster_hex
        where
            city = '{city}'
            and 
                resolution = 8
            
    ),

fe_tbl as (
    select 
        fe.city,
        fe.yyyymmdd,
        date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
        cluster as pickup_cluster,
        quarter_hour,
        count(distinct fare_estimate_id) AS fe_count
    from 
        hive.pricing.fare_estimates_enriched fe
    left join 
        datasets.city_cluster_hex cls
        on 
            fe.pickup_location_hex_8 = cls.hex_id
    where
        yyyymmdd BETWEEN '{start_date}' AND '{end_date}'
        and 
            fe.city = '{city}'
        and 
            service_name IN ('{service_name}')
        and 
            api_context = '/fare/estimate' 
    group by
        1,2,3,4,5
),

rr_tbl as (
    SELECT   
        city_name as city,
        ols.yyyymmdd,
        date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
        cluster AS pickup_cluster,
        quarter_hour,
        COUNT(distinct order_id) AS rr_count,
        COUNT(DISTINCT CASE WHEN order_status = 'dropped' THEN order_id END) AS net_count
    FROM 
        orders.order_logs_snapshot ols
    LEFT JOIN 
        datasets.city_cluster_hex cls
        ON 
            ols.pickup_location_hex_8 = cls.hex_id
    WHERE
        yyyymmdd BETWEEN '{start_date}' AND '{end_date}'
        AND service_obj_service_name IN ('{service_name}')
        AND city_name = '{city}'

    GROUP BY 1,2,3,4,5
),

fe_merged AS (
    SELECT
        fe_tbl.city,
        fe_tbl.yyyymmdd,
        fe_tbl.orderdate,
        fe_tbl.pickup_cluster,
        fe_tbl.quarter_hour,
        fe_count,
        COALESCE(rr_tbl.rr_count,0) AS rr_count,
        COALESCE(rr_tbl.net_count,0) AS net_count
    FROM
        fe_tbl
    LEFT JOIN
        rr_tbl
        ON fe_tbl.city = rr_tbl.city
        AND fe_tbl.orderdate = rr_tbl.orderdate
        AND fe_tbl.pickup_cluster = rr_tbl.pickup_cluster
        AND fe_tbl.quarter_hour = rr_tbl.quarter_hour
),
    
order_stat AS (
    SELECT
        city,
        yyyymmdd,
        orderdate,
        pickup_cluster,
        quarter_hour, 
        SUM(fe_count) AS fe_count,
        SUM(rr_count) AS rr_count,
        SUM(net_count) AS net_count,
        100.0*SUM(rr_count)/SUM(fe_count) AS FE2RR,
        100.0*SUM(net_count)/SUM(rr_count) AS G2N,
        100.0*SUM(net_count)/SUM(fe_count) AS FE2Net
        -- ROUND(SUM(CAST(rr_count AS DOUBLE))/SUM(NULLIF(fe_count,0)),4) as FE_RR_nullif,
        -- ROUND(SUM(CAST(net_count AS DOUBLE))/SUM(NULLIF(rr_count,0)),4) as G2N_nullif,
        -- ROUND(SUM(CAST(net_count AS DOUBLE))/SUM(NULLIF(fe_count,0)),4) as FE2Net_nullif
        
    FROM 
        fe_merged 
    where fe_count > 0 
    and rr_count > 0 
    GROUP BY 
        1,2,3,4,5
)

SELECT * FROM order_stat ORDER BY 1,2,3,4
 
'''.format(
    start_date = start_date,
    end_date = end_date, 
    service_name = service,
    city = city)

In [15]:
print(fe_query)

with cluster_hex_map as (  
    select
        cluster,
        hex_id,
        executiondate
    from 
            datasets.city_cluster_hex
        where
            city = 'Chennai'
            and 
                resolution = 8
            
    ),

fe_tbl as (
    select 
        fe.city,
        fe.yyyymmdd,
        date_format(from_unixtime(epoch / 1000, 'Asia/Kolkata'), '%Y-%m-%d') AS orderdate,
        cluster as pickup_cluster,
        quarter_hour,
        count(distinct fare_estimate_id) AS fe_count
    from 
        hive.pricing.fare_estimates_enriched fe
    left join 
        datasets.city_cluster_hex cls
        on 
            fe.pickup_location_hex_8 = cls.hex_id
    where
        yyyymmdd BETWEEN '20230710' AND '20230723'
        and 
            fe.city = 'Chennai'
        and 
            service_name IN ('Link')
        and 
            api_context = '/fare/estimate' 
    group by
        1,2,3,4,5
),

rr_tbl as (
    SELECT   
        city_name as city,
        o

# Read SQL Queries

In [16]:
#rr_data = pd.read_sql(rr_query, conn)

#rr_data = pd.read_csv('~/Downloads/rr_Hyd_20220905_20221128.csv')

fe_data = pd.read_sql(fe_query, conn)

fe_data

Unnamed: 0,city,yyyymmdd,orderdate,pickup_cluster,quarter_hour,fe_count,rr_count,net_count,FE2RR,G2N,FE2Net
0,Chennai,20230710,2023-07-10,,1445,8,4,1,50.0,25.0,12.5
1,Chennai,20230710,2023-07-10,,1700,11,2,0,18.2,0.0,0.0
2,Chennai,20230710,2023-07-10,,1030,3,1,1,33.3,100.0,33.3
3,Chennai,20230710,2023-07-10,,1015,7,2,0,28.6,0.0,0.0
4,Chennai,20230710,2023-07-10,,0830,8,3,2,37.5,66.7,25.0
...,...,...,...,...,...,...,...,...,...,...,...
162543,Chennai,20230723,2023-07-23,puzhal,1930,2,1,1,50.0,100.0,50.0
162544,Chennai,20230723,2023-07-23,puzhal,1415,5,1,0,20.0,0.0,0.0
162545,Chennai,20230723,2023-07-23,puzhal,1845,3,1,0,33.3,0.0,0.0
162546,Chennai,20230723,2023-07-23,puzhal,2215,5,1,0,20.0,0.0,0.0


In [17]:
fe_data.to_csv('~/Downloads/fe_data_Chn_non_peak_analysis.csv', index = False)

In [5]:
fe_data = pd.read_csv('~/Downloads/fe_data_Chn_non_peak_analysis.csv')
fe_data.head()

Unnamed: 0,city,yyyymmdd,orderdate,pickup_cluster,quarter_hour,fe_count,rr_count,net_count,FE2RR,G2N,FE2Net
0,Chennai,20230710,2023-07-10,,1445,8,4,1,50.0,25.0,12.5
1,Chennai,20230710,2023-07-10,,1700,11,2,0,18.2,0.0,0.0
2,Chennai,20230710,2023-07-10,,1030,3,1,1,33.3,100.0,33.3
3,Chennai,20230710,2023-07-10,,1015,7,2,0,28.6,0.0,0.0
4,Chennai,20230710,2023-07-10,,830,8,3,2,37.5,66.7,25.0


## Get time period

In [6]:
def get_period(hour):
    hour = int(hour)
    if (hour >= 8 and hour <= 11):
        return "morning_peak"
    elif (hour >= 17 and hour <= 21):
        return "evening_peak"
    elif (hour > 11 and hour < 17):
        return "afternoon"
    else:
        return "rest"

In [7]:
fe_data['quarter_hour'] = fe_data['quarter_hour'].apply(lambda x: str(x).zfill(4))

fe_data['time_period'] = fe_data['quarter_hour'].apply(
    lambda x: get_period(x[0:2])
)

In [8]:
fe_data['week_period'] = fe_data['yyyymmdd'].map(lambda x: datetime.strptime(str(x), '%Y%m%d').strftime('%A')
)

fe_data

Unnamed: 0,city,yyyymmdd,orderdate,pickup_cluster,quarter_hour,fe_count,rr_count,net_count,FE2RR,G2N,FE2Net,time_period,week_period
0,Chennai,20230710,2023-07-10,,1445,8,4,1,50.0,25.0,12.5,afternoon,Monday
1,Chennai,20230710,2023-07-10,,1700,11,2,0,18.2,0.0,0.0,evening_peak,Monday
2,Chennai,20230710,2023-07-10,,1030,3,1,1,33.3,100.0,33.3,morning_peak,Monday
3,Chennai,20230710,2023-07-10,,1015,7,2,0,28.6,0.0,0.0,morning_peak,Monday
4,Chennai,20230710,2023-07-10,,0830,8,3,2,37.5,66.7,25.0,morning_peak,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162543,Chennai,20230723,2023-07-23,puzhal,1930,2,1,1,50.0,100.0,50.0,evening_peak,Sunday
162544,Chennai,20230723,2023-07-23,puzhal,1415,5,1,0,20.0,0.0,0.0,afternoon,Sunday
162545,Chennai,20230723,2023-07-23,puzhal,1845,3,1,0,33.3,0.0,0.0,evening_peak,Sunday
162546,Chennai,20230723,2023-07-23,puzhal,2215,5,1,0,20.0,0.0,0.0,rest,Sunday


In [9]:
fe_data_filtered = fe_data[fe_data['time_period'] != 'rest']

In [10]:
fe_data_filtered['yyyymmdd'] = fe_data_filtered['yyyymmdd'].astype(str)
fe_data_filtered['quarter_hour'] = fe_data_filtered['quarter_hour'].astype(str)
fe_data_filtered['pickup_cluster'] = fe_data_filtered['pickup_cluster'].astype(str)

fe_data_filtered = fe_data_filtered.fillna(0)
fe_data_filtered = fe_data_filtered.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
fe_data.head()

Unnamed: 0,city,yyyymmdd,orderdate,pickup_cluster,quarter_hour,fe_count,rr_count,net_count,FE2RR,G2N,FE2Net,time_period,week_period
0,Chennai,20230710,2023-07-10,,1445,8,4,1,50.0,25.0,12.5,afternoon,Monday
1,Chennai,20230710,2023-07-10,,1700,11,2,0,18.2,0.0,0.0,evening_peak,Monday
2,Chennai,20230710,2023-07-10,,1030,3,1,1,33.3,100.0,33.3,morning_peak,Monday
3,Chennai,20230710,2023-07-10,,1015,7,2,0,28.6,0.0,0.0,morning_peak,Monday
4,Chennai,20230710,2023-07-10,,830,8,3,2,37.5,66.7,25.0,morning_peak,Monday


In [12]:
fe_data['quarter_hour'] = fe_data['quarter_hour'].apply(lambda x: str(x).zfill(4))

fe_data['quarter_hour'] = fe_data['quarter_hour'].astype(str)

fe_data['hour'] = fe_data['quarter_hour'].str[0:2]

In [14]:
cluster_temporal_stats = fe_data.groupby(
        ['pickup_cluster', 'hour', 'week_period']).agg(
        total_fe_count=pd.NamedAgg('fe_count', 'sum'),
        total_rr_count=pd.NamedAgg('rr_count', 'sum'),
        total_net_orders=pd.NamedAgg('net_count', 'sum'),
    ).reset_index().sort_values('hour')

In [16]:
cluster_temporal_stats['fe_rr_ratio'] = (
        100 * cluster_temporal_stats['total_rr_count'] / (cluster_temporal_stats['total_fe_count'] + 1)
)

cluster_temporal_stats['g2n_ratio'] = (
        100 * cluster_temporal_stats['total_net_orders'] / (cluster_temporal_stats['total_rr_count'] + 1)
)

In [17]:
cluster_temporal_stats.head()

Unnamed: 0,pickup_cluster,hour,week_period,total_fe_count,total_rr_count,total_net_orders,fe_rr_ratio,g2n_ratio
0,Adambakkam,0,Friday,19,10,5,50.0,45.454545
23058,Vadapalani Thirunagar,0,Monday,46,15,9,31.914894,56.25
23059,Vadapalani Thirunagar,0,Saturday,63,25,18,39.0625,69.230769
23060,Vadapalani Thirunagar,0,Sunday,82,22,16,26.506024,69.565217
23061,Vadapalani Thirunagar,0,Thursday,42,21,14,48.837209,63.636364


In [18]:
fe2rr_below_25_percentile = cluster_temporal_stats[
        cluster_temporal_stats['fe_rr_ratio'] < cluster_temporal_stats['fe_rr_ratio'].quantile(0.25)]


In [21]:
cluster_temporal_stats = fe_data.groupby(
        ['pickup_cluster', 'time_period', 'week_period']).agg(
        total_fe_count=pd.NamedAgg('fe_count', 'sum'),
        total_rr_count=pd.NamedAgg('rr_count', 'sum'),
        total_net_orders=pd.NamedAgg('net_count', 'sum'),
    ).reset_index().sort_values('pickup_cluster')


cluster_temporal_stats.head()

Unnamed: 0,pickup_cluster,time_period,week_period,total_fe_count,total_rr_count,total_net_orders
0,Adambakkam,afternoon,Friday,1204,290,194
27,Adambakkam,rest,Wednesday,468,171,124
26,Adambakkam,rest,Tuesday,501,183,116
25,Adambakkam,rest,Thursday,698,238,111
24,Adambakkam,rest,Sunday,450,129,93


In [22]:
cluster_temporal_stats['fe_rr_ratio'] = (
        100 * cluster_temporal_stats['total_rr_count'] / (cluster_temporal_stats['total_fe_count'] + 1)
)

cluster_temporal_stats['g2n_ratio'] = (
        100 * cluster_temporal_stats['total_net_orders'] / (cluster_temporal_stats['total_rr_count'] + 1)
)

fe2rr_below_25_percentile = cluster_temporal_stats[
        cluster_temporal_stats['fe_rr_ratio'] < cluster_temporal_stats['fe_rr_ratio'].quantile(0.25)]


In [27]:
lower_quantile_threshold_list = [0.25, 0.20, 0.15, 0.12, 0.1]
higher_quantile_threshold_list = [0.75, 0.80, 0.85, 0.87, 0.9]
temporal_stats_for_threshold = pd.DataFrame({})

In [130]:
## check 
temporal_stats_for_each_threshold = cluster_temporal_stats.groupby(
        ['pickup_cluster', 'time_period']).agg(
        lower_g2n_threshold=pd.NamedAgg('g2n_ratio', lambda x: x.quantile(0.25)),
        higher_g2n_threshold=pd.NamedAgg('g2n_ratio', lambda x: x.quantile(0.75)),
        lower_fe2rr_threshold=pd.NamedAgg('fe_rr_ratio', lambda x: x.quantile(0.20)),
        higher_fe2rr_threshold=pd.NamedAgg('fe_rr_ratio', lambda x: x.quantile(0.80))
    ).reset_index()
    
temporal_stats_for_each_threshold.head()

Unnamed: 0,pickup_cluster,time_period,lower_g2n_threshold,higher_g2n_threshold,lower_fe2rr_threshold,higher_fe2rr_threshold
0,Adambakkam,afternoon,70.314526,74.067164,19.517234,23.884211
1,Adambakkam,evening_peak,73.437118,75.843394,18.132159,26.583157
2,Adambakkam,morning_peak,39.799506,58.195963,21.956587,23.416348
3,Adambakkam,rest,52.823747,68.69606,31.848441,36.043616
4,Adyar,afternoon,66.763127,69.43514,18.87414,22.839259


In [93]:
temporal_stats_for_threshold = pd.DataFrame()

for i in range(0, len(lower_quantile_threshold_list), 1):
    
    lower_g2n_threshold_str = 'lower_g2n_threshold_' + str(lower_quantile_threshold_list[i])
    higher_g2n_threshold_str = 'higher_g2n_threshold_' + str(higher_quantile_threshold_list[i])
    lower_fe2rr_threshold_str = 'lower_fe2rr_threshold_' + str(lower_quantile_threshold_list[i])
    higher_fe2rr_threshold_str = 'higher_fe2rr_threshold_' + str(higher_quantile_threshold_list[i])
    print(lower_quantile_threshold_list[i])

    temporal_stats_for_each_threshold = cluster_temporal_stats.groupby(
        ['pickup_cluster', 'time_period']).agg(
        lower_g2n_threshold=pd.NamedAgg('g2n_ratio', lambda x: x.quantile(lower_quantile_threshold_list[i])),
        higher_g2n_threshold=pd.NamedAgg('g2n_ratio', lambda x: x.quantile(higher_quantile_threshold_list[i])),
        lower_fe2rr_threshold=pd.NamedAgg('fe_rr_ratio', lambda x: x.quantile(lower_quantile_threshold_list[i])),
        higher_fe2rr_threshold=pd.NamedAgg('fe_rr_ratio', lambda x: x.quantile(higher_quantile_threshold_list[i]))
    ).reset_index()

    temporal_stats_for_each_threshold.rename(
        columns={'lower_g2n_threshold': lower_g2n_threshold_str, 
                 'higher_g2n_threshold': higher_g2n_threshold_str,
                 'lower_fe2rr_threshold': lower_fe2rr_threshold_str,
                 'higher_fe2rr_threshold': higher_fe2rr_threshold_str}, 
        inplace=True)
    
    #print(temporal_stats_for_each_threshold.head())
    
    #temporal_stats_for_threshold = pd.concat([temporal_stats_for_threshold, temporal_stats_for_each_threshold], axis=1)
    
    temporal_stats_for_each_threshold.to_csv(
        '/Users/pallavichandra/commit_repo/latest_demand_repo/dse-demand-analysis/experiments/price_sensitivity/traffic_surge/data/non_peak_demand_thresholds/temporal_stats_for_each_threshold_'+str(
        lower_quantile_threshold_list[i])+'_'+str(higher_quantile_threshold_list[i])+'.csv', index=False)
    
filepath = "/Users/pallavichandra/commit_repo/latest_demand_repo/dse-demand-analysis/experiments/price_sensitivity/traffic_surge/data/non_peak_demand_thresholds"
filenames = glob.glob(filepath + "/*.csv")


li = []
for filename in filenames:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)
    
    print(li)

frame = pd.concat(li, axis=1)

        


0.25
0.2
0.15
0.12
0.1
[    pickup_cluster   time_period  lower_g2n_threshold_0.1  \
0       Adambakkam     afternoon                68.109966   
1       Adambakkam  evening_peak                72.423712   
2       Adambakkam  morning_peak                35.108549   
3       Adambakkam          rest                46.120077   
4            Adyar     afternoon                65.765439   
..             ...           ...                      ...   
631        kilpauk          rest                53.128312   
632         puzhal     afternoon                27.513228   
633         puzhal  evening_peak                30.000000   
634         puzhal  morning_peak                30.750000   
635         puzhal          rest                 4.285714   

     higher_g2n_threshold_0.9  lower_fe2rr_threshold_0.1  \
0                   75.055004                  17.567823   
1                   76.467444                  15.736812   
2                   73.684280                  19.495067   
3  

In [94]:
frame.describe()

Unnamed: 0,lower_g2n_threshold_0.1,higher_g2n_threshold_0.9,lower_fe2rr_threshold_0.1,higher_fe2rr_threshold_0.9,lower_g2n_threshold_0.12,higher_g2n_threshold_0.87,lower_fe2rr_threshold_0.12,higher_fe2rr_threshold_0.87,lower_g2n_threshold_0.2,higher_g2n_threshold_0.8,lower_fe2rr_threshold_0.2,higher_fe2rr_threshold_0.8,lower_g2n_threshold_0.15,higher_g2n_threshold_0.85,lower_fe2rr_threshold_0.15,higher_fe2rr_threshold_0.85,lower_g2n_threshold_0.25,higher_g2n_threshold_0.75,lower_fe2rr_threshold_0.25,higher_fe2rr_threshold_0.75
count,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0,636.0
mean,49.645598,66.688813,21.447067,30.477133,50.327987,65.998862,21.833826,30.097639,52.574106,64.278086,23.269323,29.343415,51.351571,65.538895,22.413966,29.844643,53.559649,62.984227,24.061964,28.914042
std,16.057502,12.869375,5.694221,8.618626,16.044767,13.07359,5.752193,8.350981,16.003882,13.391482,5.868917,7.824805,16.077489,13.231485,5.874695,8.185566,15.850308,13.453417,5.77767,7.521809
min,0.0,0.0,6.962242,18.210217,0.0,0.0,7.124085,17.850913,0.0,0.0,8.836915,17.347703,0.0,0.0,7.366851,17.611378,0.0,0.0,10.839707,17.251609
25%,40.945942,63.765656,17.123214,24.666127,41.893388,62.963659,17.481982,24.39676,44.432275,61.645768,18.940736,23.923435,43.130835,62.695897,17.953533,24.202038,45.814786,59.978678,19.834389,23.753319
50%,53.163677,70.135631,20.439333,28.640596,54.025032,69.462315,20.754575,28.329545,56.628546,67.7163,22.34202,27.754501,55.600352,69.133661,21.435671,28.104497,57.94978,66.284719,23.03554,27.484405
75%,61.755674,74.703645,25.19139,33.768744,62.176908,74.220028,25.585121,33.36517,63.74681,72.548457,27.071023,32.606305,62.971191,73.852063,26.230161,33.144764,64.70321,71.781584,27.516494,32.235779
max,75.47619,86.646181,45.0,79.166667,76.404762,86.610623,46.0,77.916667,78.779473,86.14262,50.0,75.0,77.797619,86.586918,47.5,77.083333,79.09154,85.505804,50.0,71.25


In [95]:


frame.head(100)

Unnamed: 0,pickup_cluster,time_period,lower_g2n_threshold_0.1,higher_g2n_threshold_0.9,lower_fe2rr_threshold_0.1,higher_fe2rr_threshold_0.9,pickup_cluster.1,time_period.1,lower_g2n_threshold_0.12,higher_g2n_threshold_0.87,...,lower_g2n_threshold_0.15,higher_g2n_threshold_0.85,lower_fe2rr_threshold_0.15,higher_fe2rr_threshold_0.85,pickup_cluster.2,time_period.2,lower_g2n_threshold_0.25,higher_g2n_threshold_0.75,lower_fe2rr_threshold_0.25,higher_fe2rr_threshold_0.75
0,Adambakkam,afternoon,68.109966,75.055004,17.567823,24.026556,Adambakkam,afternoon,68.398625,74.862342,...,68.831615,74.733900,18.328898,24.006639,Adambakkam,afternoon,70.314526,74.067164,20.919202,23.710526
1,Adambakkam,evening_peak,72.423712,76.467444,15.736812,27.489526,Adambakkam,evening_peak,72.506122,76.428953,...,72.629738,76.403293,16.374773,27.018974,Adambakkam,evening_peak,73.437118,75.843394,20.449256,26.164707
2,Adambakkam,morning_peak,35.108549,73.684280,19.495067,24.828063,Adambakkam,morning_peak,35.228851,72.053300,...,35.409303,70.965980,21.276051,23.779719,Adambakkam,morning_peak,39.799506,58.195963,22.086899,23.395463
3,Adambakkam,rest,46.120077,71.760286,30.519143,36.456732,Adambakkam,rest,46.217108,71.660465,...,46.362655,71.593918,31.477163,36.454820,Adambakkam,rest,52.823747,68.696060,31.926348,35.427766
4,Adyar,afternoon,65.765439,72.209962,17.846346,23.280761,Adyar,afternoon,65.954241,71.578833,...,66.237445,71.158081,18.199988,23.093946,Adyar,afternoon,66.763127,69.435140,19.708547,22.550636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Beasant Avenue,rest,23.968719,55.000000,17.082015,30.331183,Beasant Avenue,rest,24.520039,52.750000,...,25.347019,51.250000,19.504879,30.163441,Beasant Avenue,rest,29.884358,46.739130,21.606632,28.149002
96,Beasant Nagar,afternoon,65.417237,74.423780,19.188742,24.468824,Beasant Nagar,afternoon,65.559508,74.214329,...,65.772914,74.074695,20.753311,24.348773,Beasant Nagar,afternoon,66.205648,72.620192,22.482281,24.297235
97,Beasant Nagar,evening_peak,57.689511,65.478581,13.024812,25.418574,Beasant Nagar,evening_peak,58.574147,64.746729,...,59.901101,64.258827,13.770222,24.853901,Beasant Nagar,evening_peak,60.799629,63.773255,16.453072,24.663919
98,Beasant Nagar,morning_peak,52.201273,73.212970,25.135690,30.254860,Beasant Nagar,morning_peak,52.584057,72.549742,...,53.158232,72.107590,25.282816,30.205224,Beasant Nagar,morning_peak,54.733401,66.617488,25.432669,29.657077


In [98]:
frame.columns

Index(['pickup_cluster', 'time_period', 'lower_g2n_threshold_0.1',
       'higher_g2n_threshold_0.9', 'lower_fe2rr_threshold_0.1',
       'higher_fe2rr_threshold_0.9', 'pickup_cluster', 'time_period',
       'lower_g2n_threshold_0.12', 'higher_g2n_threshold_0.87',
       'lower_fe2rr_threshold_0.12', 'higher_fe2rr_threshold_0.87',
       'pickup_cluster', 'time_period', 'lower_g2n_threshold_0.2',
       'higher_g2n_threshold_0.8', 'lower_fe2rr_threshold_0.2',
       'higher_fe2rr_threshold_0.8', 'pickup_cluster', 'time_period',
       'lower_g2n_threshold_0.15', 'higher_g2n_threshold_0.85',
       'lower_fe2rr_threshold_0.15', 'higher_fe2rr_threshold_0.85',
       'pickup_cluster', 'time_period', 'lower_g2n_threshold_0.25',
       'higher_g2n_threshold_0.75', 'lower_fe2rr_threshold_0.25',
       'higher_fe2rr_threshold_0.75'],
      dtype='object')

In [117]:
temporal_stats_for_threshold = frame.copy()

temporal_stats_for_threshold_filtered = temporal_stats_for_threshold[[
    'pickup_cluster','time_period','lower_g2n_threshold_0.1','higher_g2n_threshold_0.9',
    'lower_fe2rr_threshold_0.1','higher_fe2rr_threshold_0.9',
    'lower_g2n_threshold_0.12','higher_g2n_threshold_0.87','lower_fe2rr_threshold_0.12',
    'higher_fe2rr_threshold_0.87','lower_g2n_threshold_0.2','higher_g2n_threshold_0.8',
    'lower_fe2rr_threshold_0.2','higher_fe2rr_threshold_0.8','lower_g2n_threshold_0.15',
    'higher_g2n_threshold_0.85','lower_fe2rr_threshold_0.15','higher_fe2rr_threshold_0.85',
    'lower_g2n_threshold_0.25','higher_g2n_threshold_0.75','lower_fe2rr_threshold_0.25',
    'higher_fe2rr_threshold_0.75']]

temporal_stats_for_threshold_filtered.head()

Unnamed: 0,pickup_cluster,pickup_cluster.1,pickup_cluster.2,pickup_cluster.3,pickup_cluster.4,time_period,time_period.1,time_period.2,time_period.3,time_period.4,...,lower_fe2rr_threshold_0.2,higher_fe2rr_threshold_0.8,lower_g2n_threshold_0.15,higher_g2n_threshold_0.85,lower_fe2rr_threshold_0.15,higher_fe2rr_threshold_0.85,lower_g2n_threshold_0.25,higher_g2n_threshold_0.75,lower_fe2rr_threshold_0.25,higher_fe2rr_threshold_0.75
0,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,afternoon,afternoon,afternoon,afternoon,afternoon,...,19.517234,23.884211,68.831615,74.7339,18.328898,24.006639,70.314526,74.067164,20.919202,23.710526
1,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,evening_peak,evening_peak,evening_peak,evening_peak,evening_peak,...,18.132159,26.583157,72.629738,76.403293,16.374773,27.018974,73.437118,75.843394,20.449256,26.164707
2,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,morning_peak,morning_peak,morning_peak,morning_peak,morning_peak,...,21.956587,23.416348,35.409303,70.96598,21.276051,23.779719,39.799506,58.195963,22.086899,23.395463
3,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,rest,rest,rest,rest,rest,...,31.848441,36.043616,46.362655,71.593918,31.477163,36.45482,52.823747,68.69606,31.926348,35.427766
4,Adyar,Adyar,Adyar,Adyar,Adyar,afternoon,afternoon,afternoon,afternoon,afternoon,...,18.87414,22.839259,66.237445,71.158081,18.199988,23.093946,66.763127,69.43514,19.708547,22.550636


In [118]:
temporal_stats_for_threshold_filtered.head()

Unnamed: 0,pickup_cluster,pickup_cluster.1,pickup_cluster.2,pickup_cluster.3,pickup_cluster.4,time_period,time_period.1,time_period.2,time_period.3,time_period.4,...,lower_fe2rr_threshold_0.2,higher_fe2rr_threshold_0.8,lower_g2n_threshold_0.15,higher_g2n_threshold_0.85,lower_fe2rr_threshold_0.15,higher_fe2rr_threshold_0.85,lower_g2n_threshold_0.25,higher_g2n_threshold_0.75,lower_fe2rr_threshold_0.25,higher_fe2rr_threshold_0.75
0,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,afternoon,afternoon,afternoon,afternoon,afternoon,...,19.517234,23.884211,68.831615,74.7339,18.328898,24.006639,70.314526,74.067164,20.919202,23.710526
1,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,evening_peak,evening_peak,evening_peak,evening_peak,evening_peak,...,18.132159,26.583157,72.629738,76.403293,16.374773,27.018974,73.437118,75.843394,20.449256,26.164707
2,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,morning_peak,morning_peak,morning_peak,morning_peak,morning_peak,...,21.956587,23.416348,35.409303,70.96598,21.276051,23.779719,39.799506,58.195963,22.086899,23.395463
3,Adambakkam,Adambakkam,Adambakkam,Adambakkam,Adambakkam,rest,rest,rest,rest,rest,...,31.848441,36.043616,46.362655,71.593918,31.477163,36.45482,52.823747,68.69606,31.926348,35.427766
4,Adyar,Adyar,Adyar,Adyar,Adyar,afternoon,afternoon,afternoon,afternoon,afternoon,...,18.87414,22.839259,66.237445,71.158081,18.199988,23.093946,66.763127,69.43514,19.708547,22.550636


In [119]:
# drop duplicate column names
temporal_stats_for_threshold_filtered2 = temporal_stats_for_threshold_filtered.T.drop_duplicates().T

temporal_stats_for_threshold_filtered2.head()

Unnamed: 0,pickup_cluster,time_period,lower_g2n_threshold_0.1,higher_g2n_threshold_0.9,lower_fe2rr_threshold_0.1,higher_fe2rr_threshold_0.9,lower_g2n_threshold_0.12,higher_g2n_threshold_0.87,lower_fe2rr_threshold_0.12,higher_fe2rr_threshold_0.87,...,lower_fe2rr_threshold_0.2,higher_fe2rr_threshold_0.8,lower_g2n_threshold_0.15,higher_g2n_threshold_0.85,lower_fe2rr_threshold_0.15,higher_fe2rr_threshold_0.85,lower_g2n_threshold_0.25,higher_g2n_threshold_0.75,lower_fe2rr_threshold_0.25,higher_fe2rr_threshold_0.75
0,Adambakkam,afternoon,68.11,75.055,17.5678,24.0266,68.3986,74.8623,17.8723,24.0146,...,19.5172,23.8842,68.8316,74.7339,18.3289,24.0066,70.3145,74.0672,20.9192,23.7105
1,Adambakkam,evening_peak,72.4237,76.4674,15.7368,27.4895,72.5061,76.429,15.992,27.2072,...,18.1322,26.5832,72.6297,76.4033,16.3748,27.019,73.4371,75.8434,20.4493,26.1647
2,Adambakkam,morning_peak,35.1085,73.6843,19.4951,24.8281,35.2289,72.0533,20.2075,24.1991,...,21.9566,23.4163,35.4093,70.966,21.2761,23.7797,39.7995,58.196,22.0869,23.3955
3,Adambakkam,rest,46.1201,71.7603,30.5191,36.4567,46.2171,71.6605,30.9024,36.4556,...,31.8484,36.0436,46.3627,71.5939,31.4772,36.4548,52.8237,68.6961,31.9263,35.4278
4,Adyar,afternoon,65.7654,72.21,17.8463,23.2808,65.9542,71.5788,17.9878,23.1687,...,18.8741,22.8393,66.2374,71.1581,18.2,23.0939,66.7631,69.4351,19.7085,22.5506


In [120]:
cluster_temporal_stats_with_threshold = pd.merge(
    cluster_temporal_stats, temporal_stats_for_threshold_filtered2, left_on = [
    'pickup_cluster', 'time_period'], right_on = [
    'pickup_cluster', 'time_period'], how = 'left')

cluster_temporal_stats_with_threshold.head()

Unnamed: 0,pickup_cluster,time_period,week_period,total_fe_count,total_rr_count,total_net_orders,fe_rr_ratio,g2n_ratio,lower_g2n_threshold_0.1,higher_g2n_threshold_0.9,...,lower_fe2rr_threshold_0.2,higher_fe2rr_threshold_0.8,lower_g2n_threshold_0.15,higher_g2n_threshold_0.85,lower_fe2rr_threshold_0.15,higher_fe2rr_threshold_0.85,lower_g2n_threshold_0.25,higher_g2n_threshold_0.75,lower_fe2rr_threshold_0.25,higher_fe2rr_threshold_0.75
0,Adambakkam,afternoon,Friday,1204,290,194,24.06639,66.666667,68.11,75.055,...,19.5172,23.8842,68.8316,74.7339,18.3289,24.0066,70.3145,74.0672,20.9192,23.7105
1,Adambakkam,rest,Wednesday,468,171,124,36.460554,72.093023,46.1201,71.7603,...,31.8484,36.0436,46.3627,71.5939,31.4772,36.4548,52.8237,68.6961,31.9263,35.4278
2,Adambakkam,rest,Tuesday,501,183,116,36.454183,63.043478,46.1201,71.7603,...,31.8484,36.0436,46.3627,71.5939,31.4772,36.4548,52.8237,68.6961,31.9263,35.4278
3,Adambakkam,rest,Thursday,698,238,111,34.048641,46.443515,46.1201,71.7603,...,31.8484,36.0436,46.3627,71.5939,31.4772,36.4548,52.8237,68.6961,31.9263,35.4278
4,Adambakkam,rest,Sunday,450,129,93,28.603104,71.538462,46.1201,71.7603,...,31.8484,36.0436,46.3627,71.5939,31.4772,36.4548,52.8237,68.6961,31.9263,35.4278


In [121]:
cluster_temporal_stats_with_threshold.shape, cluster_temporal_stats.shape

((4432, 28), (4432, 8))

## Conclusion

We have list of thresholds that are at cluster, week_period, time_period level. 

In [133]:
lower_quantile_threshold_list, higher_quantile_threshold_list

([0.25, 0.2, 0.15, 0.12, 0.1], [0.75, 0.8, 0.85, 0.87, 0.9])

In [122]:
    for i in range(0, len(lower_quantile_threshold_list), 1):
        conversion_category_str = 'conversion_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
            higher_quantile_threshold_list[i])
        fulfilment_category_str = 'fulfil_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
            higher_quantile_threshold_list[i])

        cluster_temporal_stats_with_threshold.loc[
            cluster_temporal_stats_with_threshold['fe_rr_ratio'] < cluster_temporal_stats_with_threshold[
                'lower_fe2rr_threshold_' + str(lower_quantile_threshold_list[i])], conversion_category_str] = 'low'

        cluster_temporal_stats_with_threshold.loc[(cluster_temporal_stats_with_threshold.fe_rr_ratio) > (
            cluster_temporal_stats_with_threshold[
                'higher_fe2rr_threshold_' + str(higher_quantile_threshold_list[i])]), conversion_category_str] = 'high'

        cluster_temporal_stats_with_threshold.loc[((cluster_temporal_stats_with_threshold.fe_rr_ratio) <= (
            cluster_temporal_stats_with_threshold[
                'higher_fe2rr_threshold_' + str(higher_quantile_threshold_list[i])])) & ((
                (cluster_temporal_stats_with_threshold.fe_rr_ratio) >= (
            cluster_temporal_stats_with_threshold[
                'lower_fe2rr_threshold_' + str(
                    lower_quantile_threshold_list[i])]))), conversion_category_str] = 'medium'

        cluster_temporal_stats_with_threshold.loc[(cluster_temporal_stats_with_threshold.g2n_ratio) < (
            cluster_temporal_stats_with_threshold[
                'higher_g2n_threshold_' + str(higher_quantile_threshold_list[i])]), fulfilment_category_str] = 'low'

        cluster_temporal_stats_with_threshold.loc[(cluster_temporal_stats_with_threshold.g2n_ratio) > (
            cluster_temporal_stats_with_threshold[
                'higher_g2n_threshold_' + str(higher_quantile_threshold_list[i])]), fulfilment_category_str] = 'high'

        cluster_temporal_stats_with_threshold.loc[((cluster_temporal_stats_with_threshold.g2n_ratio) <= (
            cluster_temporal_stats_with_threshold[
                'higher_g2n_threshold_' + str(higher_quantile_threshold_list[i])])) & (
                                                      ((cluster_temporal_stats_with_threshold.g2n_ratio) >= (
                                                          cluster_temporal_stats_with_threshold[
                                                              'lower_g2n_threshold_' + str(
                                                                  lower_quantile_threshold_list[
                                                                      i])]))), fulfilment_category_str] = 'medium'

        cluster_temporal_stats_with_threshold['cf_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
            higher_quantile_threshold_list[i])] = (
                cluster_temporal_stats_with_threshold[
                    'conversion_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
                        higher_quantile_threshold_list[i])] + '-' +
                cluster_temporal_stats_with_threshold[
                    'fulfil_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
                        higher_quantile_threshold_list[i])]
        )
        print(cluster_temporal_stats_with_threshold.shape)
        print(cluster_temporal_stats_with_threshold[conversion_category_str].value_counts())
        print(cluster_temporal_stats_with_threshold[fulfilment_category_str].value_counts())
        print('----------------------')


(4432, 31)
medium    1903
low       1266
high      1263
Name: conversion_category_0.25_0.75, dtype: int64
medium    1940
high      1254
low       1238
Name: fulfil_category_0.25_0.75, dtype: int64
----------------------
(4432, 34)
medium    1921
low       1257
high      1254
Name: conversion_category_0.2_0.8, dtype: int64
medium    1954
high      1248
low       1230
Name: fulfil_category_0.2_0.8, dtype: int64
----------------------
(4432, 37)
medium    3166
high       633
low        633
Name: conversion_category_0.15_0.85, dtype: int64
medium    3189
high       626
low        617
Name: fulfil_category_0.15_0.85, dtype: int64
----------------------
(4432, 40)
medium    3166
high       633
low        633
Name: conversion_category_0.12_0.87, dtype: int64
medium    3189
high       626
low        617
Name: fulfil_category_0.12_0.87, dtype: int64
----------------------
(4432, 43)
medium    3166
high       633
low        633
Name: conversion_category_0.1_0.9, dtype: int64
medium    3187
high 

In [123]:
cluster_temporal_stats_with_threshold.head()

Unnamed: 0,pickup_cluster,time_period,week_period,total_fe_count,total_rr_count,total_net_orders,fe_rr_ratio,g2n_ratio,lower_g2n_threshold_0.1,higher_g2n_threshold_0.9,...,cf_category_0.2_0.8,conversion_category_0.15_0.85,fulfil_category_0.15_0.85,cf_category_0.15_0.85,conversion_category_0.12_0.87,fulfil_category_0.12_0.87,cf_category_0.12_0.87,conversion_category_0.1_0.9,fulfil_category_0.1_0.9,cf_category_0.1_0.9
0,Adambakkam,afternoon,Friday,1204,290,194,24.06639,66.666667,68.11,75.055,...,high-low,high,low,high-low,high,low,high-low,high,low,high-low
1,Adambakkam,rest,Wednesday,468,171,124,36.460554,72.093023,46.1201,71.7603,...,high-high,high,high,high-high,high,high,high-high,high,high,high-high
2,Adambakkam,rest,Tuesday,501,183,116,36.454183,63.043478,46.1201,71.7603,...,high-medium,medium,medium,medium-medium,medium,medium,medium-medium,medium,medium,medium-medium
3,Adambakkam,rest,Thursday,698,238,111,34.048641,46.443515,46.1201,71.7603,...,medium-low,medium,medium,medium-medium,medium,medium,medium-medium,medium,medium,medium-medium
4,Adambakkam,rest,Sunday,450,129,93,28.603104,71.538462,46.1201,71.7603,...,low-high,low,medium,low-medium,low,medium,low-medium,low,medium,low-medium


In [124]:
cluster_temporal_stats_with_threshold.to_csv(
    '~/Downloads/cluster_temporal_stats_with_threshold_cf_category_{city}_{service}_{start_date}_{end_date}.csv'.format(
        city = city, service = service, start_date = start_date, end_date = end_date))

## Conclusion

Categories created

In [129]:
for i in range(0, len(lower_quantile_threshold_list), 1):
    conversion_category_str = 'conversion_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
        higher_quantile_threshold_list[i])
    fulfilment_category_str = 'fulfil_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
        higher_quantile_threshold_list[i])
    cf_category_str = 'cf_category_' + str(lower_quantile_threshold_list[i]) + '_' + str(
        higher_quantile_threshold_list[i])

    category_wise_stats = cluster_temporal_stats_with_threshold.groupby(
        [cf_category_str, conversion_category_str, fulfilment_category_str]).agg(
        total_gross_orders=pd.NamedAgg('total_rr_count', 'sum'),
        total_net_orders=pd.NamedAgg('total_net_orders', 'sum'),
        total_fe_count=pd.NamedAgg('total_fe_count', 'sum')
    ).reset_index()

    print('Total Unique Clusters: ', cluster_temporal_stats_with_threshold.pickup_cluster.nunique())
    print(cluster_temporal_stats_with_threshold.pickup_cluster.nunique())
    print('Check count: time period level')
    print(cluster_temporal_stats_with_threshold.groupby([cf_category_str, 'time_period']).agg(
        count_combinations=pd.NamedAgg('pickup_cluster', 'count')))

    print('Check count: time period, week period level')
    print(cluster_temporal_stats_with_threshold.groupby([cf_category_str, 'time_period', 'week_period']).agg(
        count_combinations=pd.NamedAgg('pickup_cluster', 'count')))

    print('Category Wise Stats')
    print('-----------------------------------')
    print(category_wise_stats.shape)
    print(category_wise_stats[conversion_category_str].value_counts())
    print(category_wise_stats[fulfilment_category_str].value_counts())

    total_gross_orders_all_cats = category_wise_stats['total_gross_orders'].sum()
    total_net_orders_all_cats = category_wise_stats['total_net_orders'].sum()
    total_fe_count_all_cats = category_wise_stats['total_fe_count'].sum()

    print('Gross orders Summary')
    print('Total Gross Orders: ', total_gross_orders_all_cats)
    print('Total Net Orders: ', total_net_orders_all_cats)
    print('Total FE count: ', total_fe_count_all_cats)
    print('------------------------------------------------')

    # Put in log
    category_wise_stats['fe_contribution'] = category_wise_stats['total_fe_count'] / total_fe_count_all_cats
    category_wise_stats['gross_order_contribution'] = category_wise_stats[
                                                          'total_gross_orders'] / total_gross_orders_all_cats
    category_wise_stats['net_order_contribution'] = category_wise_stats[
                                                        'total_net_orders'] / total_net_orders_all_cats

Total Unique Clusters:  159
159
Check count: time period level
                                    count_combinations
cf_category_0.25_0.75 time_period                     
high-high             afternoon                     67
                      evening_peak                 121
                      morning_peak                  31
                      rest                         109
high-low              afternoon                     97
                      evening_peak                  34
                      morning_peak                 127
                      rest                          72
high-medium           afternoon                    152
                      evening_peak                 161
                      morning_peak                 158
                      rest                         134
low-high              afternoon                    108
                      evening_peak                  44
                      morning_peak                 190
  

-----