In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, mannwhitneyu

from tqdm.auto import tqdm

In [2]:
def get_bootstrap(
    data_column_1,
    data_column_2,
    boot_it = 1000,
    statistic = np.mean,
    bootstrap_conf_level = 0.95,
    random_state = None
):
    """Bootstrap test of whether `statistic` of the sample differences is zero.

    On every iteration both samples are resampled with replacement to the
    length of the longer one, the resampled values are subtracted element-wise
    and `statistic` is applied to that vector of differences. A normal
    distribution is then fitted to the collected statistics and a two-sided
    p-value for "the statistic equals 0" is read off it.

    Note: the statistic is applied to the differences of resampled values.
    For the mean this equals the difference of the means; for non-linear
    statistics such as the median it is the statistic of the differences,
    which is not the difference of the statistics.

    Parameters
    ----------
    data_column_1, data_column_2 : pandas.Series
        Samples to compare; they must provide `len()` and `.sample(...)`.
    boot_it : int
        Number of bootstrap iterations (at least 1).
    statistic : callable
        Maps a 1-D array of differences to a scalar.
    bootstrap_conf_level : float
        Confidence level, strictly between 0 and 1, of the percentile
        interval returned under the "ci" key.
    random_state : None, int, numpy RandomState or Generator
        Source of randomness. `None` keeps pandas' default behaviour; an
        int makes the whole run reproducible.

    Returns
    -------
    dict
        "p_value": two-sided p-value from the normal approximation.
        "ci": (lower, upper) percentile interval of the bootstrap statistics.

    Raises
    ------
    ValueError
        If a sample is empty, `boot_it` < 1, or `bootstrap_conf_level`
        is not strictly between 0 and 1.

    The name `tqdm` must be in scope (used for the progress bar).
    """
    if len(data_column_1) == 0 or len(data_column_2) == 0:
        raise ValueError("both samples must contain at least one observation")
    if boot_it < 1:
        raise ValueError("boot_it must be a positive integer")
    if not 0 < bootstrap_conf_level < 1:
        raise ValueError("bootstrap_conf_level must be strictly between 0 and 1")

    # An int seed is turned into ONE RandomState shared by all draws: passing
    # the bare int to every .sample() call would restart the stream each time
    # and give both samples identical index draws on every iteration.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # Both resamples need equal length so they can be subtracted element-wise.
    boot_len = max(len(data_column_1), len(data_column_2))
    boot_data = []
    for _ in tqdm(range(boot_it)):
        samples_1 = data_column_1.sample(
            boot_len,
            replace = True,
            random_state = rng
        ).values

        samples_2 = data_column_2.sample(
            boot_len,
            replace = True,
            random_state = rng
        ).values

        boot_data.append(statistic(samples_1 - samples_2))

    boot_arr = np.asarray(boot_data, dtype = float)

    if boot_arr.min() == boot_arr.max():
        # Degenerate bootstrap distribution: a zero scale would make norm.cdf
        # return NaN, so decide directly from the single observed value.
        p_value = 1.0 if boot_arr[0] == 0 else 0.0
    else:
        # min(cdf(0; m, s), cdf(0; -m, s)) is the tail on the far side of 0,
        # i.e. cdf(0; |m|, s); doubling it gives the two-sided p-value.
        p_value = float(
            2 * norm.cdf(
                x = 0,
                loc = abs(boot_arr.mean()),
                scale = boot_arr.std()
            )
        )

    # Percentile interval of the bootstrap statistics at the requested level.
    alpha = 1 - bootstrap_conf_level
    ci_lower, ci_upper = np.quantile(boot_arr, [alpha / 2, 1 - alpha / 2])

    return {"p_value": p_value, "ci": (float(ci_lower), float(ci_upper))}

In [3]:
# The file is semicolon-separated and uses a comma as the decimal mark.
csv_path = 'C:/Users/I.Gromenko/hw_bootstrap (1).csv'
df = pd.read_csv(csv_path, sep=";", decimal=",")

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,value,experimentVariant
0,1,10.380495,Control
1,2,9.546867,Control
2,3,11.088215,Control
3,4,10.147274,Control
4,5,9.789808,Control


In [5]:
# Split the metric by experiment arm with a single .loc selection
# (row mask + column label) instead of chained indexing.
x = df.loc[df['experimentVariant'] == 'Control', 'value']
y = df.loc[df['experimentVariant'] == 'Treatment', 'value']

MannW = mannwhitneyu(x, y, alternative = 'two-sided')

# get_bootstrap resamples through Series.sample without an explicit
# random_state, so the draws come from NumPy's global generator; seeding it
# here makes the bootstrap p-values reproducible on Restart & Run All.
np.random.seed(42)

boots_mean = get_bootstrap(x, y)
boots_median = get_bootstrap(x, y, statistic = np.median)

print(f'Bootstrap mean: {boots_mean["p_value"]}, Bootstrap median: {boots_median["p_value"]}, MannWhitney: {MannW.pvalue}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Bootstrap mean: 0.038408860833676235, Bootstrap median: 0.8847619239546752, MannWhitney: 0.8592148582485579


In [None]:
# The bootstrap p-value for the comparison of means lets us reject the null hypothesis.
# The bootstrap p-value for the medians, however, is close to the Mann-Whitney p-value: in general, Mann-Whitney
# compares the ranks of the two samples, and these differ as little as the medians do in the bootstrap.
# In this case it is better to use the bootstrap, as it keeps the sampling variance, which Mann-Whitney does not take into account.