In [1]:
import numpy as np
import pandas as pd
import scipy.stats as sps
import plotly.graph_objs as go

# Доп функции

In [2]:
def get_qq_plot(p_values):
    """Рисует распределение p-value"""
    p_values = np.array(p_values)
    probs = []
    x = [0.01 * i for i in range(101)]
    for i in range(101):
        alpha_step = 0.01 * i
        probs.append(p_values[p_values < alpha_step].shape[0] / p_values.shape[0])
    fig = go.Figure([go.Scatter(x=x, y=probs, mode="markers", name="p_value"),
                 go.Scatter(x=x, y=x, mode="lines", name="uniform")])
    fig.update_layout(height=600, width=600, title="Q-Q plot") 
    return fig

In [3]:
def get_power(p_values, alpha=0.05):
    """Оценка мощности критерия, при условии, что значения p_value взяты при наличии 
    различий в сравниваемых выборках 
    """
    p_values = np.array(p_values)
    return p_values[p_values < alpha].shape[0] / p_values.shape[0] * 100

# Длительность

In [4]:
def duration(k, delta_effect, sigma_1, sigma_2, alpha=0.05, beta=0.2):
    z = sps.norm.ppf(1 - alpha/2) + sps.norm.ppf(1-beta)
    n = (k+1) * z ** 2 * (sigma_1 ** 2 + sigma_2 **2 / k) / (delta_effect ** 2)
    return n

In [5]:
duration(k=1, delta_effect=10, sigma_1=140, sigma_2=140, alpha=0.1, beta=0.2)

4847.124869903497

##### Случай для конверсии

In [6]:
p = 0.005

In [7]:
duration(k=1, delta_effect=0.0001, sigma_1=p*(1-p), sigma_2=p*(1-p), alpha=0.05)

77705.87158998956

In [22]:
alpha = 0.05
beta = 0.5
z = sps.norm.ppf(1 - alpha) + sps.norm.ppf(1 - beta)

x_crit = sps.norm.ppf(1 - alpha)
x_start = np.linspace(-4, x_crit, 200)
x_end = np.linspace(x_crit, 4 + z, 200)
y_start = sps.norm.pdf(x_start)
y_end = sps.norm.pdf(x_end)

y_er2 = sps.norm.pdf(x_start, loc=z)
y_alt = sps.norm.pdf(x_end, loc=z)

fig = go.Figure([go.Scatter(x=x_start, y=y_start, line={"color": 'red'}),
                 go.Scatter(x=x_end, y=y_end, fill='tozeroy', line={"color": 'red'}), 
                 go.Scatter(x=x_start, y=y_er2, fill='tozeroy', line={"color": 'magenta'}), 
                 go.Scatter(x=x_end, y=y_alt, line={"color": 'magenta'})
                ])
fig.add_vline(x=0, line_width=1, line_color='red',
              annotation={'text': f"H0=0", 'font_color': 'red', 'xshift': 5, 'yshift': -180})
fig.add_vline(x=z, line_width=1, line_color='magenta',
              annotation={'text': f"H1={z:.3}", 'font_color': 'magenta', 'xshift': 5, 'yshift': -180})
fig.add_vline(x=x_crit, line_width=1, line_color='red',
              annotation={'text': f"{x_crit:.3}", 'font_color': 'red', 'xshift': -35})

fig.add_annotation(x=3.3, y=0.03, text=f'{alpha=:.3}', font_color='red', xshift=0, showarrow=False)
fig.add_annotation(x=-0.5, y=0.03, text=f'{beta=:.3}', font_color='magenta', xshift=0, showarrow=False)
fig.update_layout(
    height=600, 
    width=800, 
    title="Длительность",
    xaxis={'title': 'Значение'},
    yaxis={'title': 'Плотность вероятности'},
    font_size=14,
    title_font_size=26,
    showlegend=False
    )

# Ratio-метрика + Линеаризация

In [34]:
x = sps.expon.rvs(loc=100, scale=100, size=1000)
go.Figure([go.Histogram(x=x, nbinsx=100)])

Дальше приведены 2 блока кода: в пером в генерируемых выборках нет изменений, во втором - изменения есть

##### Изменений нет

In [35]:
%%time
n_exp = 1000
p_values = []
p_values_lin = []
for _ in range(n_exp):
    records = []
    for i in range(100):
        n_views = int(sps.expon.rvs(loc=100, scale=100))
        clicks = sps.bernoulli.rvs(p=0.05, size=n_views)
        records.append([n_views, np.sum(clicks), np.sum(clicks)/ n_views, "A"])
    for i in range(100):
        n_views = int(sps.expon.rvs(loc=100, scale=100))
        clicks = sps.bernoulli.rvs(p=0.05, size=n_views)
        records.append([n_views, np.sum(clicks), np.sum(clicks)/ n_views, "B"])
    df_data = pd.DataFrame(records, columns=["views", "clicks", "cr", "group"])
    
    cr_A = df_data[df_data["group"] == "A"]["clicks"].sum() / df_data[df_data["group"] == "A"]["views"].sum()
    df_data["cr_lin"] = df_data["clicks"] - cr_A * df_data["views"]

    x_a = df_data[df_data["group"] == "A"]["cr"]
    x_b = df_data[df_data["group"] == "B"]["cr"]
    p_value = sps.ttest_ind(x_a, x_b).pvalue
    p_values.append(p_value)
    
    x_a_lin = df_data[df_data["group"] == "A"]["cr_lin"]
    x_b_lin = df_data[df_data["group"] == "B"]["cr_lin"]
    p_value_lin = sps.ttest_ind(x_a_lin, x_b_lin).pvalue
    p_values_lin.append(p_value_lin)

CPU times: user 25.5 s, sys: 3.15 ms, total: 25.6 s
Wall time: 25.6 s


##### Оценка корректности

In [36]:
get_qq_plot(p_values)

In [37]:
get_qq_plot(p_values_lin)

##### Изменения есть

In [38]:
%%time
n_exp = 1000
p_values = []
p_values_lin = []
for _ in range(n_exp):
    records = []
    for i in range(100):
        n_views = int(sps.expon.rvs(loc=100, scale=100))
        clicks = sps.bernoulli.rvs(p=0.05, size=n_views)
        records.append([n_views, np.sum(clicks), np.sum(clicks)/ n_views, "A"])
    for i in range(100):
        n_views = int(sps.expon.rvs(loc=100, scale=100))
        clicks = sps.bernoulli.rvs(p=0.055, size=n_views)
        records.append([n_views, np.sum(clicks), np.sum(clicks)/ n_views, "B"])
    df_data = pd.DataFrame(records, columns=["views", "clicks", "cr", "group"])
    cr_A = df_data[df_data["group"] == "A"]["clicks"].sum() / df_data[df_data["group"] == "A"]["views"].sum()
    df_data["cr_lin"] = df_data["clicks"] - cr_A * df_data["views"]

    x_a = df_data[df_data["group"] == "A"]["cr"]
    x_b = df_data[df_data["group"] == "B"]["cr"]
    p_value = sps.ttest_ind(x_a, x_b).pvalue
    p_values.append(p_value)
    
    x_a_lin = df_data[df_data["group"] == "A"]["cr_lin"]
    x_b_lin = df_data[df_data["group"] == "B"]["cr_lin"]
    p_value_lin = sps.ttest_ind(x_a_lin, x_b_lin).pvalue
    p_values_lin.append(p_value_lin)

CPU times: user 26 s, sys: 448 µs, total: 26 s
Wall time: 26 s


##### Оценка мощности

In [39]:
get_power(p_values)

51.4

In [40]:
get_power(p_values_lin)

59.599999999999994

## Бакетизация

In [41]:
%%time
n_exp = 1000
p_values = []
p_values_lin = []
for _ in range(n_exp):
    records = []
    for i in range(2000):
        n_views = int(sps.expon.rvs(loc=100, scale=100))
        clicks = sps.bernoulli.rvs(p=0.05, size=n_views)
        records.append([n_views, np.sum(clicks), np.sum(clicks)/ n_views, "A"])
    for i in range(2000):
        n_views = int(sps.expon.rvs(loc=100, scale=100))
        clicks = sps.bernoulli.rvs(p=0.05, size=n_views)
        records.append([n_views, np.sum(clicks), np.sum(clicks)/ n_views, "B"])
    df_data = pd.DataFrame(records, columns=["views", "clicks", "cr", "group"])

    # Бакетизация
    mask_A = df_data['group'] == 'A'
    df_data.loc[mask_A, 'bucket'] = np.arange(1, 101).repeat(2000//100)
    mask_B = df_data['group'] == 'B'
    df_data.loc[mask_B, 'bucket'] = np.arange(101, 201).repeat(2000//100)

    df_bucket = df_data.groupby(['group', 'bucket'])[['views', 'clicks']].sum().reset_index()
    df_bucket['cr_bucket'] = df_bucket['clicks'] / df_bucket['views']

    x_a_bucket = df_bucket[df_bucket["group"] == "A"]["cr_bucket"]
    x_b_bucket = df_bucket[df_bucket["group"] == "B"]["cr_bucket"]
    p_value = sps.ttest_ind(x_a_bucket, x_b_bucket).pvalue
    p_values.append(p_value)

CPU times: user 7min 2s, sys: 20 ms, total: 7min 2s
Wall time: 7min 3s


In [42]:
get_qq_plot(p_values)

# Стратификация

In [48]:
def ttest_strat(df_A, df_B):
    """Функция для оценки стратифицированного среднего"""
    T = df_A["payments"].mean() - df_B["payments"].mean()
    D_A = (df_A.groupby("city")["payments"].std(ddof=1) ** 2 * df_A.groupby("city")["payments"].count() / (df_A.shape[0]**2)).sum()
    D_B = (df_B.groupby("city")["payments"].std(ddof=1) ** 2 * df_B.groupby("city")["payments"].count() / (df_B.shape[0]**2)).sum()
    
    T = T / np.sqrt(D_A + D_B)

    T = np.abs(T)
    p_value = 2 * (1 - sps.norm.cdf(np.abs(T)))
    return p_value

In [46]:
(df_A.groupby("city")["payments"].std(ddof=1) * df_A.groupby("city")["payments"].count() / (df_A.shape[0])).sum()

17.269488176577127

In [47]:
df_A["payments"].mean()

93.99979866291935

##### Различий между выборками нет

In [45]:
%%time
n_exp = 1000
p_values = []
p_values_strat = []
size = 1000
for _ in range(n_exp):
    df_A = pd.DataFrame()
    df_A["user"] = [f"A_{x:5}" for x in range(size)]
    df_A["city"] = pd.Series([1]*int(size*0.3) + [0]*int(size*0.7)).map({0: "Moscow", 1: "Tula"})
    mask_Moscow = df_A["city"] == "Moscow"
    df_A.loc[mask_Moscow, "payments"] = sps.norm.rvs(loc=100, scale=20, size=mask_Moscow.sum())
    mask_Tula = df_A["city"] == "Tula"
    df_A.loc[mask_Tula, "payments"] = sps.norm.rvs(loc=80, scale=10, size=mask_Tula.sum()) 
    
    df_B = pd.DataFrame()
    df_B["user"] = [f"B_{x:5}" for x in range(size)]
    df_B["city"] = pd.Series([1]*int(size*0.3) + [0]*int(size*0.7)).map({0: "Moscow", 1: "Tula"})
#pd.Series(sps.bernoulli.rvs(p=0.3, size=size)).map({0: "Moscow", 1: "Tula"})
    mask_Moscow = df_B["city"] == "Moscow"
    df_B.loc[mask_Moscow, "payments"] = sps.norm.rvs(loc=100, scale=20, size=mask_Moscow.sum())
    mask_Tula = df_B["city"] == "Tula"
    df_B.loc[mask_Tula, "payments"] = sps.norm.rvs(loc=80, scale=10, size=mask_Tula.sum())
    
    p_values_strat.append(ttest_strat(df_A, df_B))
    p_values.append(sps.ttest_ind(df_A["payments"], df_B["payments"]).pvalue)
    
    

CPU times: user 11.6 s, sys: 12 µs, total: 11.6 s
Wall time: 11.6 s


#### Оценка корректности

In [49]:
get_qq_plot(p_values)

In [50]:
get_qq_plot(p_values_strat)

##### Есть различия в выборках

In [51]:
%%time
n_exp = 1000
p_values = []
p_values_strat = []
size = 1000
for _ in range(n_exp):
    df_A = pd.DataFrame()
    df_A["user"] = [f"A_{x:5}" for x in range(size)]
    df_A["city"] = pd.Series([1]*int(size*0.3) + [0]*int(size*0.7)).map({0: "Moscow", 1: "Tula"})
    mask_Moscow = df_A["city"] == "Moscow"
    df_A.loc[mask_Moscow, "payments"] = sps.norm.rvs(loc=100, scale=20, size=mask_Moscow.sum())
    mask_Tula = df_A["city"] == "Tula"
    df_A.loc[mask_Tula, "payments"] = sps.norm.rvs(loc=80, scale=10, size=mask_Tula.sum()) 
    
    df_B = pd.DataFrame()
    df_B["user"] = [f"B_{x:5}" for x in range(size)]
    df_B["city"] = pd.Series([1]*int(size*0.3) + [0]*int(size*0.7)).map({0: "Moscow", 1: "Tula"})
#pd.Series(sps.bernoulli.rvs(p=0.3, size=size)).map({0: "Moscow", 1: "Tula"})
    mask_Moscow = df_B["city"] == "Moscow"
    df_B.loc[mask_Moscow, "payments"] = sps.norm.rvs(loc=103, scale=20, size=mask_Moscow.sum())
    mask_Tula = df_B["city"] == "Tula"
    df_B.loc[mask_Tula, "payments"] = sps.norm.rvs(loc=83, scale=10, size=mask_Tula.sum())
    
    p_values_strat.append(ttest_strat(df_A, df_B))
    p_values.append(sps.ttest_ind(df_A["payments"], df_B["payments"]).pvalue)
    
    

CPU times: user 11.1 s, sys: 19 µs, total: 11.1 s
Wall time: 11.1 s


##### Оценка мощности

In [52]:
get_power(p_values)

93.89999999999999

In [53]:
get_power(p_values_strat)

96.6

# CUPED

Можно зафиксировать значение на начальных данных, тогда результаты будут корректнее. Работает лучше, если есть корреляция, иначе почти такой же результат.

##### Нет различий в выборках

In [54]:
%%time
n_exp = 1000
p_values = []
p_values_cuped = []
size = 1000

# Pre_experiment
pre_exp = sps.norm.rvs(loc=100, scale=20, size=size)
for _ in range(n_exp):
    
    
    
    df_A = pd.DataFrame()
    df_A["user"] = [f"A_{x:5}" for x in range(size)]
    df_A["pre_exp"] = pre_exp
    df_A["payments"] = sps.expon.rvs(loc=100, scale=100, size=size)
    
    df_B = pd.DataFrame()
    df_B["pre_exp"] = pre_exp
    df_B["user"] = [f"B_{x:5}" for x in range(size)]
    df_B["payments"] = sps.expon.rvs(loc=100, scale=100, size=size)
    
    p_values.append(sps.ttest_ind(df_A["payments"], df_B["payments"]).pvalue)
    
    x_a = df_A["pre_exp"]
    x_b = df_B["pre_exp"]
    y_a = df_A["payments"]
    y_b = df_B["payments"]
    theta = np.cov(x_a, y_a)[0,1] / np.std(x_a)**2
    
    df_A["payments_cuped"] = df_A["payments"] - theta * df_A["pre_exp"]
    df_B["payments_cuped"] = df_B["payments"] - theta * df_B["pre_exp"]
    
    p_values_cuped.append(sps.ttest_ind(df_A["payments_cuped"], df_B["payments_cuped"]).pvalue)
    
    

CPU times: user 5.69 s, sys: 0 ns, total: 5.69 s
Wall time: 5.69 s


##### Оценка корректности

In [55]:
get_qq_plot(p_values_cuped)

##### Есть независимые изменения в выборках

In [59]:
%%time
n_exp = 1000
p_values = []
p_values_cuped = []
size = 1000

# Pre_experiment
pre_exp = sps.norm.rvs(loc=100, scale=20, size=size)
for _ in range(n_exp):
    
    
    
    df_A = pd.DataFrame()
    df_A["user"] = [f"A_{x:5}" for x in range(size)]
    df_A["pre_exp"] = pre_exp
    df_A["payments"] = sps.expon.rvs(loc=100, scale=100, size=size)
    
    df_B = pd.DataFrame()
    df_B["pre_exp"] = pre_exp
    df_B["user"] = [f"B_{x:5}" for x in range(size)]
    df_B["payments"] = sps.expon.rvs(loc=110, scale=100, size=size)
    
    p_values.append(sps.ttest_ind(df_A["payments"], df_B["payments"]).pvalue)
    
    x_a = df_A["pre_exp"]
    x_b = df_B["pre_exp"]
    y_a = df_A["payments"]
    y_b = df_B["payments"]
    theta = np.cov(x_a, y_a)[0,1] / np.std(x_a)**2
    
    df_A["payments_cuped"] = df_A["payments"] - theta * df_A["pre_exp"]
    df_B["payments_cuped"] = df_B["payments"] - theta * df_B["pre_exp"]
    
    p_values_cuped.append(sps.ttest_ind(df_A["payments_cuped"], df_B["payments_cuped"]).pvalue)
    
    

CPU times: user 6.16 s, sys: 3.98 ms, total: 6.16 s
Wall time: 6.21 s


In [60]:
get_power(p_values)

61.5

In [61]:
get_power(p_values_cuped)

61.5

##### Есть коррелирующие изменения в выборках

In [62]:
%%time
n_exp = 1000
p_values = []
p_values_cuped = []
size = 1000

# Pre_experiment
pre_exp = sps.norm.rvs(loc=100, scale=20, size=size)
for _ in range(n_exp):
    
    
    
    df_A = pd.DataFrame()
    df_A["user"] = [f"A_{x:5}" for x in range(size)]
    df_A["pre_exp"] = pre_exp
    df_A["payments"] =  sps.norm.rvs(loc=1, scale=0.1, size=size) * df_A["pre_exp"]
    
    df_B = pd.DataFrame()
    df_B["pre_exp"] = pre_exp
    df_B["user"] = [f"B_{x:5}" for x in range(size)]
    df_B["payments"] = sps.norm.rvs(loc=1.01, scale=0.1, size=size) * df_B["pre_exp"]
    
    p_values.append(sps.ttest_ind(df_A["payments"], df_B["payments"]).pvalue)
    
    x_a = df_A["pre_exp"]
    x_b = df_B["pre_exp"]
    y_a = df_A["payments"]
    y_b = df_B["payments"]
    theta = np.cov(x_a, y_a)[0,1] / np.std(x_a)**2
    
    df_A["payments_cuped"] = df_A["payments"] - theta * df_A["pre_exp"]
    df_B["payments_cuped"] = df_B["payments"] - theta * df_B["pre_exp"]
    
    p_values_cuped.append(sps.ttest_ind(df_A["payments_cuped"], df_B["payments_cuped"]).pvalue)
    
    

CPU times: user 6.83 s, sys: 8.02 ms, total: 6.84 s
Wall time: 6.85 s


In [63]:
get_power(p_values)

2.5

In [64]:
get_power(p_values_cuped)

59.199999999999996

# Выбросы

In [65]:
n_exp = 1000
p_values = []
p_values_log = []
size = 1000

for _ in range(n_exp):
    x_a = np.append(sps.expon.rvs(loc=100, scale=100, size=size), np.array([1000, 2000]))
    x_b = np.append(sps.expon.rvs(loc=100, scale=100, size=size), np.array([1000, 2000]))
    x_a_log = np.log(x_a)
    x_b_log = np.log(x_b)
    
    p_values.append(sps.ttest_ind(x_a, x_b).pvalue)
    p_values_log.append(sps.ttest_ind(x_a_log, x_b_log).pvalue)

In [66]:
get_qq_plot(p_values)

In [67]:
get_qq_plot(p_values_log)

##### Есть изменения в выборках

In [68]:
n_exp = 1000
p_values = []
p_values_log = []
size = 1000

for _ in range(n_exp):
    x_a = np.append(sps.expon.rvs(loc=100, scale=100, size=size), np.array([1000, 2000]))
    x_b = np.append(sps.expon.rvs(loc=100, scale=110, size=size), np.array([1000, 2000]))
    x_a_log = np.log(x_a)
    x_b_log = np.log(x_b)
    
    p_values.append(sps.ttest_ind(x_a, x_b).pvalue)
    p_values_log.append(sps.ttest_ind(x_a_log, x_b_log).pvalue)

In [69]:
get_power(p_values)

41.8

In [70]:
get_power(p_values_log)

50.1

### Особый пример для логарифмирования

In [71]:
# Изначальные соотношения
sample_control = [3] * 30 + [10] * 30 + [200] * 10 + [1200]
sample_test    = [8] * 30 + [20] * 30 + [100] * 10 + [1000]
# Добавим немного случайности, чтобы сохранить соотношения
sample_control = np.array(sample_control) + sps.norm().rvs(len(sample_control))
sample_test    = np.array(sample_test) + sps.norm().rvs(len(sample_test))
 

In [72]:
# Средние
sample_control.mean(), sample_test.mean()
# Output: (50.607336258084835, 39.952182594778115)

(50.43720260072933, 40.262545123529904)

In [73]:
np.log(sample_control + 1).mean(), np.log(sample_test + 1).mean()
# Output: (2.448935051567075, 2.953888940122415)
 

(2.384831128209303, 2.9666351806865916)

In [74]:
# Проверка гипотез
sps.ttest_ind(sample_control, sample_test, alternative='less', equal_var=False)
# Output: Ttest_indResult(statistic=0.45984895048111585, pvalue=0.676809082675744)

Ttest_indResult(statistic=0.4392414118566786, pvalue=0.6693970304270289)

In [75]:
sps.ttest_ind(np.log(sample_control + 1), np.log(sample_test + 1), alternative='less', equal_var=False)
# Output: Ttest_indResult(statistic=-2.5261569510816813, pvalue=0.006404244890592501)

Ttest_indResult(statistic=-2.8297413404932863, pvalue=0.002732661781553772)