In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Bootstrap is an advanced empirical method of Varience estimation. 
Widely used in A/B testing for establishing Empirical Confidence Intervals. 


# Experiment input:
The analytics team in UBER was asked to Design an experiment of the New Product Feature. It reduces the Waiting Time for a user using new dispatch algoritm. First, the team should choose the metric of the test. Using Average Waiting time as a metric may not be the best option. New feature may descrease an average time for half of the users but increase for the second half at the same time. One of the analyst suggested to use 99% percentile as a metric which is going to represent an idea that for 99% of orders a user waits no longer than N minutes. Experiment will last for 1 week. Although most of such experiments are conducted using Switchback technique, we are going to assume there is no network effect in experiment. Randomization unit == an order. Also, we are going to use Bootrstrap since it enables to take percentile statistic as a metric

In [2]:
# Let's look at our pre-experimental data
df = pd.read_csv('data.csv')
"""
user_id:: unique client id
waiting_time:: the time a passenger waited for the taxi from the moment he placed an order. In seconds
date:: date of an order
"""
df.head(5)

Unnamed: 0,user_id,trip_id,waiting_time,date
0,1,57a9de67-3f61-487a-ac95-5535baee9468,434,2022-12-05
1,1,2ed5e70f-20c7-4b49-9f23-ac778754a5e9,900,2022-12-09
2,1,61521f36-8bc2-4ada-a0f7-9dbb50943975,473,2022-12-10
3,2,89a3b986-24c7-4556-9e35-4b8eda15e333,500,2022-12-07
4,2,edd8ec9e-dedb-42ea-9a11-e2cae30f45f1,879,2022-12-07


In [3]:
print(f'Number of users = {df.user_id.nunique()}')
print(df.waiting_time.describe())

Number of users = 10000
count    30159.000000
mean       477.978713
std        242.818242
min         60.000000
25%        267.000000
50%        479.000000
75%        687.000000
max        900.000000
Name: waiting_time, dtype: float64


In [4]:
np.percentile(df.waiting_time, 99)

892.0

## Now it is time to calculate Test Design:
### Unfortunately we can not estimate sample size using formula 
Since we use percentile as statistic, there is no sample size formula for percentile differences. Howerer we know, that our design should control First Type and Second Type Error probabilty. Let's use 500 orders for each experiment and validate design with synthetic A/A and A/B tests

In [5]:
# A/A test of bootstrap
alpha = 0.05
beta = 0.1
sample_size = 500
trips_list = df.trip_id.unique()

is_difference = [] # To store results of our bootstrap procedures. 0: zero is not in CI, 1: zero is in CI
for i in tqdm(range(5000)): # number of a/a tests
    np.random.shuffle(trips_list) # shuffle our sample each a/a test
    a, b = trips_list[:sample_size], trips_list[len(trips_list) - sample_size:] # simulate that we took a sample=sample_size
    waiting_time_a, waiting_time_b = df[df['trip_id'].isin(a)].waiting_time, df[df['trip_id'].isin(b)].waiting_time
    statistics = []
    for _ in range(5000): # Bootstrap procedure
ж        # take boot samples of our orders with replacement
        a, b = np.random.choice(waiting_time_a, size=sample_size, replace=True), np.random.choice(waiting_time_b, size=sample_size, replace=True)
        delta = np.percentile(b, 99) - np.percentile(a, 99) # calculate our statistic == 99% percentile diff
        statistics.append(delta) # add statistic to list to calculate CI
    # Calculate CI using percentile method
    left_bound, right_bound = np.percentile(statistics, alpha/2*100), np.percentile(statistics, 100-(alpha/2*100))
    if left_bound <= 0 <= right_bound: # Add result to list (pvalues_aa analogue)
        is_difference.append(0)
    else:
        is_difference.append(1)
# Calculate FPR
print(np.mean(is_difference))

100%|███████████████████████████████████████| 5000/5000 [33:59<00:00,  2.45it/s]

0.0324





### Seems like we do control our FPR on Alpha level given Sample size
Let's check power of our test given MDE == 20 seconds, and Test Power = 1-Beta (0.9)

In [6]:
# A/B test of bootstrap
mde = 20

is_difference = [] # To store results of our bootstrap procedures. 0: zero is not in CI, 1: zero is in CI
for i in tqdm(range(5000)): # number of a/a tests
    np.random.shuffle(trips_list) # shuffle our sample each a/a test
    a, b = trips_list[:sample_size], trips_list[len(trips_list) - sample_size:] # simulate that we took a sample=sample_size
    waiting_time_a, waiting_time_b = df[df['trip_id'].isin(a)].waiting_time, df[df['trip_id'].isin(b)].waiting_time
    statistics = []
    for _ in range(5000): # Bootstrap procedure
        # take boot samples of our orders with replacement
        a, b = np.random.choice(waiting_time_a, size=sample_size, replace=True), np.random.choice(waiting_time_b, size=sample_size, replace=True)
        delta = np.percentile(b, 99)-mde - np.percentile(a, 99) # calculate our statistic == 99% percentile diff
        statistics.append(delta) # add statistic to list to calculate CI
    # Calculate CI using percentile method
    left_bound, right_bound = np.percentile(statistics, alpha/2*100), np.percentile(statistics, 100-(alpha/2*100))
    if left_bound <= 0 <= right_bound: # Add result to list (pvalues_aa analogue)
        is_difference.append(0)
    else:
        is_difference.append(1)
# Calculate FPR
print(np.mean(is_difference))

100%|███████████████████████████████████████| 5000/5000 [38:29<00:00,  2.17it/s]

0.9252





# False Positive rate is on Alpha level and The Power of test is on (1-Beta) level
 Which means we are able to conduct the A/B test given MDE, Alpha, Beta and Sample size to test 99th percentile difference
