In [270]:
# This notebook illustrates the variance reduction concept with trip GB as the statistic

# Experiments are typically evaluated on finite samples, the results are subject to sampling error (or generalization error). 
# Standard error tells you how accurate is the statistic estimated from the sample compared to the full population
# When the population is not homogeneous, random sampling approach can often have limited representativeness and samples can be poorly balanced across the population strata.
# Stratified sampling is a well known technique to reduce the standard error (compared to random sampling approach)

In [232]:
import joblib
import pandas as pd, numpy as np

# show reduction in std error for 'trip GB'

In [264]:
# calculate ground truth using full sample
data = pd.read_csv("data/gb_by_city_id_feb_1_feb_7_2022.csv")
mu = data.gross_bookings_usd.sum()/data.num_rows.sum()
mu

5.070192107672136

In [265]:
sample_size = 20
n_trials = 10000

In [269]:
# estimation standard error with random sampling
cities = data.city_id.tolist()

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = set(np.random.choice(cities, sample_size))
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.gross_bookings_usd.sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
std_error

1.9686478331678763

In [236]:
# estimate standard error with stratified sampling

# load cluster data
region = "global"
clusters = joblib.load("clusters/{}.pkl".format(region))
total_cluster_gb = sum([c['cluster_gb'] for c in clusters])

# sample_size= 10
def MIN_CITIES_PER_CLUSTER(gb):
    return 1
#     return 1 if gb > 5 else 1


sample_size_by_cluster = [int(10*c['cluster_gb']/total_cluster_gb) + MIN_CITIES_PER_CLUSTER(c['cluster_gb']) for c in clusters]
print(sample_size_by_cluster, sum(sample_size_by_cluster))

def stratified_sample():
    sampled_cities = set()
    for i, c in enumerate(clusters):
        cities = [city['data.city_id'] for city in c['cities']]
        sampled_cities = sampled_cities | set(np.random.choice(cities, sample_size_by_cluster[i]))
    return sampled_cities

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = stratified_sample()
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.gross_bookings_usd.sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
std_error    

[3, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 20


1.3435206920933553

# show reduction in std error for 'driver cancel rate'

In [237]:
# calculate ground truth using full sample
data = pd.read_csv("data/driver_cancel_rate_by_city_id_march_1_2022.csv")
mu = data.apply(lambda r: r['driver_cancel_rate']*r['num_rows'], axis=1).sum()/data.num_rows.sum()
mu

0.7422799907176918

In [238]:
sample_size = 20
n_trials = 10000

In [239]:
# estimation standard error with random sampling
cities = data.city_id.tolist()

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = set(np.random.choice(cities, sample_size))
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.apply(lambda r: r['driver_cancel_rate']*r['num_rows'], axis=1).sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
std_error

0.12838041932606514

In [240]:
# estimate standard error with stratified sampling

# load cluster data
region = "global"
clusters = joblib.load("clusters/{}.pkl".format(region))
total_cluster_gb = sum([c['cluster_gb'] for c in clusters])

# sample_size= 10
def MIN_CITIES_PER_CLUSTER(gb):
    return 1
#     return 1 if gb > 5 else 1

sample_size_by_cluster = [int(10*c['cluster_gb']/total_cluster_gb) + MIN_CITIES_PER_CLUSTER(c['cluster_gb']) for c in clusters]
print(sample_size_by_cluster, sum(sample_size_by_cluster))

def stratified_sample():
    sampled_cities = set()
    for i, c in enumerate(clusters):
        cities = [city['data.city_id'] for city in c['cities']]
        sampled_cities = sampled_cities | set(np.random.choice(cities, sample_size_by_cluster[i]))
    return sampled_cities

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = stratified_sample()
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.apply(lambda r: r['driver_cancel_rate']*r['num_rows'], axis=1).sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
std_error    

[3, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 20


0.07409033163124304

# results by mega-region

In [241]:
mega_region_to_city_id = pd.read_csv("data/city_level_metrics_1_23_2022_to_1_29_2022_w_metadata_excluding_outliers_add_lat_lng.csv") #https://michelangelo-studio.uberinternal.com/file/d55c6275-b239-4d8b-81d5-ac2962a70d3f
mega_region_to_city_id['city.mega_region'] = mega_region_to_city_id['city.mega_region'].map({'APAC' : 'apac', 'EMEA' : 'emea', 'LatAm' : 'latam', 'US & Canada' : 'usc'})
mega_region_to_city_id = mega_region_to_city_id.groupby('city.mega_region')['data.city_id'].apply(set).to_dict()

In [249]:
region = "usc"

## trip GB

In [250]:
# calculate ground truth using full sample
data = pd.read_csv("data/gb_by_city_id_jan_23_jan_29_2022.csv")
data = data[data.city_id.isin(mega_region_to_city_id[region])]
mu = data.gross_bookings_usd.sum()/data.num_rows.sum()
print(mu)
# estimation standard error with random sampling
cities = data.city_id.tolist()

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = set(np.random.choice(cities, sample_size))
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.gross_bookings_usd.sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
print(std_error)

# estimate standard error with stratified sampling

# load cluster data
clusters = joblib.load("clusters/{}.pkl".format(region))
total_cluster_gb = sum([c['cluster_gb'] for c in clusters])

# sample_size= 10
def MIN_CITIES_PER_CLUSTER(gb):
    return 1
#     return 1 if gb > 5 else 1


target = {'apac': 15, 'emea': 14, 'latam': 15, 'usc': 14}
sample_size_by_cluster = [int(target[region]*c['cluster_gb']/total_cluster_gb) + MIN_CITIES_PER_CLUSTER(c['cluster_gb']) for c in clusters]
print(sample_size_by_cluster, sum(sample_size_by_cluster))

def stratified_sample():
    sampled_cities = set()
    for i, c in enumerate(clusters):
        cities = [city['data.city_id'] for city in c['cities']]
        sampled_cities = sampled_cities | set(np.random.choice(cities, sample_size_by_cluster[i]))
    return sampled_cities

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = stratified_sample()
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.gross_bookings_usd.sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
std_error    

6.614406572907649
1.2362409345080334
[9, 2, 2, 1, 1, 1, 1, 1, 1, 1] 20


0.9561136122460295

## driver cancel rate

In [244]:
# calculate ground truth using full sample
data = pd.read_csv("data/driver_cancel_rate_by_city_id_march_1_2022.csv")
data = data[data.city_id.isin(mega_region_to_city_id[region])]
mu = data.apply(lambda r: r['driver_cancel_rate']*r['num_rows'], axis=1).sum()/data.num_rows.sum()
print(mu)

# estimation standard error with random sampling
cities = data.city_id.tolist()

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = set(np.random.choice(cities, sample_size))
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.apply(lambda r: r['driver_cancel_rate']*r['num_rows'], axis=1).sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
print(std_error)

clusters = joblib.load("clusters/{}.pkl".format(region))
total_cluster_gb = sum([c['cluster_gb'] for c in clusters])

# sample_size= 10
def MIN_CITIES_PER_CLUSTER(gb):
    return 1
#     return 1 if gb > 5 else 1

target = {'apac': 15, 'emea': 14, 'latam': 15, 'usc': 14}
sample_size_by_cluster = [int(target[region]*c['cluster_gb']/total_cluster_gb) + MIN_CITIES_PER_CLUSTER(c['cluster_gb']) for c in clusters]


def stratified_sample():
    sampled_cities = set()
    for i, c in enumerate(clusters):
        cities = [city['data.city_id'] for city in c['cities']]
        sampled_cities = sampled_cities | set(np.random.choice(cities, sample_size_by_cluster[i]))
    return sampled_cities

mu_estimates = []
for trial in range(n_trials):
    sampled_cities = stratified_sample()
    sample_data = data[data.city_id.isin(sampled_cities)]
    mu_estimate = sample_data.apply(lambda r: r['driver_cancel_rate']*r['num_rows'], axis=1).sum()/sample_data.num_rows.sum()
    mu_estimates.append(mu_estimate)
    
std_error = np.sqrt(sum([(mu_estimate-mu)**2 for mu_estimate in mu_estimates])/n_trials)
print(std_error)

0.7936261198368971
0.055376341856574765
0.01698764041156972
