In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Load the raw rows and collapse them to one record per user_id by summing within each user.
df1 = pd.read_csv('synthetic_gmv_data_1.1.csv').groupby('user_id').sum()

# Sample mean and unbiased (ddof=1) variance of the per-user 'gmv' totals.
mean, var = df1['gmv'].mean(), df1['gmv'].var(ddof=1)

print(f'{mean:.3f} {var:.3f}')

2750.847 3646228.409


In [2]:
e_proc = 0.01  # effect to detect, as a fraction of the mean (absolute effect = e_proc * mean)
alpha = 0.05   # significance level; enters through the 1 - alpha/2 normal quantile
beta = 0.2     # type II error rate; enters through the 1 - beta normal quantile
k = 3  # k = m/n, m - control, n - test (m = k*n)

# Standard normal distribution used for the quantiles
z = stats.norm(loc=0, scale=1)
z1 = z.ppf(1 - alpha / 2)
z2 = z.ppf(1 - beta)

# With m = k*n the variance of the difference of means is
#   var/n + var/m = var * (1 + k) / m,
# so the control size is m = (1 + k) * var * (z1 + z2)^2 / effect^2 and the test size is n = m / k.
# (The allocation ratio used to be hard-coded as `4 * var` and `m/3`, which ignored `k`.)
m = ((1 + k) * var) * ((z1 + z2) ** 2) / ((e_proc * mean) ** 2)
n = m / k
print(math.ceil(m + n))

201706


In [3]:
# One row per user: summed gmv and the first group_name seen for that user.
df2 = (
    pd.read_csv('synthetic_gmv_data_1.2.csv')
    .groupby('user_id')
    .agg({'gmv': 'sum', 'group_name': 'first'})
)
test_group = df2.loc[df2['group_name'] == 'test', 'gmv']
control_group = df2.loc[df2['group_name'] == 'control', 'gmv']

# Two-sample t-test without the equal-variance assumption (equal_var=False).
t_stat, p_value = stats.ttest_ind(test_group, control_group, equal_var=False)
print(f'{t_stat:.3f} {p_value:.3f}')

2.360 0.018


In [4]:
# Load the third dataset; the bare `df3` as the last expression makes the notebook display the frame.
df3 = pd.read_csv('synthetic_gmv_data_1.3.csv')
df3

Unnamed: 0,gmv_hist,gmv_exp,group_name
0,200.78,123.19,test
1,363.80,134.49,control
2,39.93,116.72,control
3,150.99,177.67,control
4,208.93,65.30,test
...,...,...,...
249995,221.87,65.14,test
249996,307.51,183.22,test
249997,283.07,309.94,test
249998,76.17,121.67,test


In [11]:
# Reload the data so the derived column below is always added to a fresh frame.
df3 = pd.read_csv('synthetic_gmv_data_1.3.csv')

# X is the covariate column, Y is the metric column; both span all rows (test and control together).
X, Y = df3['gmv_hist'], df3['gmv_exp']
mean_X = X.mean()
theta = X.cov(Y) / X.var()  # theta = cov(X, Y) / var(X)

# Adjusted metric: Y shifted by theta times the deviation of X from its mean.
df3['gmv_cuped'] = Y - theta * X + theta * mean_X

# CUPED: t-test (unequal variances) on the adjusted metric.
_, p_value1 = stats.ttest_ind(
    df3.loc[df3['group_name'] == 'test', 'gmv_cuped'],
    df3.loc[df3['group_name'] == 'control', 'gmv_cuped'],
    equal_var=False,
)

# std: the same test on the unadjusted metric.
test_group = df3.loc[df3['group_name'] == 'test', 'gmv_exp']
control_group = df3.loc[df3['group_name'] == 'control', 'gmv_exp']
_, p_value2 = stats.ttest_ind(test_group, control_group, equal_var=False)

print(f'{p_value2:.3f} {p_value1:.3f}')

0.233 0.047


In [24]:
# One row per user: number of rows ('count'), summed gmv, and the first group_name.
df2 = (
    pd.read_csv('synthetic_gmv_data_1.2.csv')
    .groupby('user_id')
    .agg({'user_id': 'count', 'gmv': 'sum', 'group_name': 'first'})
    .rename(columns={'user_id': 'count'})
    .reset_index()
)

# Means over all users (both groups together).
X_mean, Y_mean = df2['gmv'].mean(), df2['count'].mean()

# Per-user linearised value of the ratio X_mean / Y_mean.
df2['lin'] = X_mean / Y_mean + df2['gmv'] / Y_mean - X_mean * df2['count'] / Y_mean ** 2

# LIN: t-test (unequal variances) on the linearised metric.
test_group = df2.loc[df2['group_name'] == 'test', 'lin']
control_group = df2.loc[df2['group_name'] == 'control', 'lin']
t_stat, p_value = stats.ttest_ind(test_group, control_group, equal_var=False)
print(f'{t_stat:.3f} {p_value:.3f}')


2.344 0.019


In [35]:
from scipy.stats import t

def safe_divide(x, y):
    """
    Divide x by y, returning np.nan when y is a scalar zero.

    Plain Python numbers raise ZeroDivisionError on division by zero, while NumPy
    scalars do not raise at all: they return inf/nan and emit a RuntimeWarning.
    Both cases are mapped to np.nan here. Non-scalar denominators are divided
    element-wise without any special handling.
    """
    # NumPy scalars never raise ZeroDivisionError, so the zero check must be explicit.
    if np.ndim(y) == 0 and y == 0:
        return np.nan
    try:
        return x / y
    except ZeroDivisionError:
        # Kept as a fallback for operands that still raise on their own.
        return np.nan
    
def delta_var(numerator, denominator):
    """
    Variance of the ratio mean(numerator) / mean(denominator) computed with the delta method.

    numerator   -- vector of numerator values
    denominator -- vector of denominator values
    Returns (var_x / mu_y^2 - 2 * cov_xy * mu_x / mu_y^3 + var_y * mu_x^2 / mu_y^4) / n,
    where n is len(numerator) and variances/covariance use ddof=1.
    """
    size = len(numerator)
    mean_num, mean_den = np.mean(numerator), np.mean(denominator)
    var_num = np.var(numerator, ddof=1)
    var_den = np.var(denominator, ddof=1)
    cov_nd = np.cov(numerator, denominator, ddof=1)[0][1]

    # The three terms of the first-order expansion, combined in the same order as written.
    num_term = safe_divide(var_num, mean_den**2)
    cross_term = 2*cov_nd*safe_divide(mean_num, mean_den**3)
    den_term = var_den*safe_divide(mean_num**2, mean_den**4)
    return safe_divide(num_term - cross_term + den_term, size)

# One row per user: Y = number of rows for the user, X = summed gmv, plus the first group_name.
df2 = (
    pd.read_csv('synthetic_gmv_data_1.2.csv')
    .groupby('user_id')
    .agg({'user_id': 'count', 'gmv': 'sum', 'group_name': 'first'})
    .rename(columns={'user_id': 'Y', 'gmv': 'X'})
)
test_group = df2.loc[df2['group_name'] == 'test', ['X', 'Y']]
control_group = df2.loc[df2['group_name'] == 'control', ['X', 'Y']]

def my_mean(df):
    """Return the mean of column 'X' divided by the mean of column 'Y' of `df`."""
    return df['X'].mean() / df['Y'].mean()

# Per-group means of the numerator (X) and denominator (Y).
test_x_mean = test_group['X'].mean()
test_y_mean = test_group['Y'].mean()
control_x_mean = control_group['X'].mean()
control_y_mean = control_group['Y'].mean()

# Difference of the two ratio-of-means metrics, divided by its standard error
# (sum of the two delta-method variances, one per group).
t_stat = (
    (test_x_mean/test_y_mean) - (control_x_mean/control_y_mean)
) / np.sqrt(
    delta_var(test_group['X'], test_group['Y'])
    + delta_var(control_group['X'], control_group['Y'])
)

dfree = len(df2) - 2
# Two-sided p-value. The survival function is used instead of 1 - cdf because the
# subtraction loses precision and underflows to exactly 0 for large |t_stat|.
p_val = 2 * t.sf(abs(t_stat), df=dfree)

print(f'{t_stat:.3f} {p_val:.3f}')

2.344 0.019


In [49]:
from scipy.stats import t

# Builds four two-sided confidence intervals for the test-vs-control comparison:
#   1) absolute difference of mean X, 2) relative (%) difference of mean X,
#   3) absolute difference of mean(X)/mean(Y), 4) relative (%) difference of mean(X)/mean(Y).
# X is the summed gmv per user and Y is the number of rows per user (see the agg/rename below).
df2 = pd.read_csv('synthetic_gmv_data_1.2.csv')
df2 = df2.groupby('user_id').agg({'user_id' : 'count', 'gmv': 'sum', 'group_name': 'first'}).rename(columns={'user_id': 'Y', 'gmv': 'X'})
test_group = df2[df2['group_name'] == 'test'][['X', 'Y']]
control_group = df2[df2['group_name'] == 'control'][['X', 'Y']]

# Significance level; the intervals below use the 1 - alpha/2 quantile.
alpha = 0.05

X_t_mean = test_group['X'].mean() # mean of X in the test group
X_c_mean = control_group['X'].mean() # mean of X in the control group

Y_t_mean = test_group['Y'].mean()
Y_c_mean = control_group['Y'].mean()

# Sample standard deviations (square root of the ddof=1 variance) per group.
sigma_X_t = np.sqrt(test_group['X'].var(ddof=1))
sigma_X_c = np.sqrt(control_group['X'].var(ddof=1))

sigma_Y_t = np.sqrt(test_group['Y'].var(ddof=1))
sigma_Y_c = np.sqrt(control_group['Y'].var(ddof=1))

# Sample covariance of X and Y within each group (off-diagonal entry of the 2x2 matrix).
cov_xy_t = np.cov(test_group['X'], test_group['Y'], ddof=1)[0][1]
cov_xy_c = np.cov(control_group['X'], control_group['Y'], ddof=1)[0][1]

# Group sizes: n - test, m - control.
n = len(test_group)
m = len(control_group)

# Degrees of freedom: total number of users minus 2.
dfree = len(df2) - 2

# Critical value of Student's t shared by all four intervals.
t_crit = t.ppf(1 - alpha / 2, df=dfree)
print(t_crit)

# 1, delta gmv: X_t_mean - X_c_mean with standard error sqrt(var_t/n + var_c/m)
test_var = test_group['X'].var(ddof=1)
control_var = control_group['X'].var(ddof=1)
sigma1 = np.sqrt(test_var / n + control_var / m)
delta1 = X_t_mean - X_c_mean
left1 = delta1 - t_crit * sigma1
right1 = delta1 + t_crit * sigma1

print(sigma1)
print(delta1)
print(f'[{left1:.3f}, {right1:.3f}]')
print('\n\n\n')

# 2, % gmv: relative difference in percent.
# sigma2 is the standard error of the ratio X_t_mean / X_c_mean as a fraction
# (first-order approximation with no cross-group covariance term), hence the factor 100 in the bounds.
delta2 = 100 * (X_t_mean - X_c_mean) / X_c_mean
sigma2 = np.sqrt((1/X_c_mean**2) * (sigma_X_t**2/n + X_t_mean**2 * sigma_X_c**2 / (m * X_c_mean**2)))
left2 = delta2 - 100 * t_crit * sigma2
right2 = delta2 + 100 * t_crit * sigma2

print(sigma2)
print(delta2)
print(f'[{left2:.3f}, {right2:.3f}]')
print('\n\n\n')

# 3, delta aov: difference of the ratio-of-means X_mean / Y_mean between the groups.
# var_t / var_c are the per-group variances of that ratio:
#   (1/size) * (1/Y^2) * (sigma_X^2 - 2*cov*X/Y + (X/Y)^2 * sigma_Y^2)
delta3 = X_t_mean / Y_t_mean - X_c_mean / Y_c_mean
var_t = (1/n) * (1/Y_t_mean**2) * (sigma_X_t**2 - 2 * cov_xy_t * X_t_mean / Y_t_mean + X_t_mean**2 / Y_t_mean**2 * sigma_Y_t**2)
var_c = (1/m) * (1/Y_c_mean**2) * (sigma_X_c**2 - 2 * cov_xy_c * X_c_mean / Y_c_mean + X_c_mean**2 / Y_c_mean**2 * sigma_Y_c**2)
sigma3 = np.sqrt(var_t + var_c)
left3 = delta3 - t_crit * sigma3
right3 = delta3 + t_crit * sigma3
print(sigma3)
print(delta3)
print(f'[{left3:.3f}, {right3:.3f}]')
print('\n\n\n')

# 4, % aov: relative difference (in percent) of the ratio-of-means.
# R_t and R_c are the per-group ratios; sigma4 reuses var_t and var_c from step 3
# and is a fraction, hence the factor 100 in the bounds.
delta4 = 100 * (X_t_mean/Y_t_mean - X_c_mean/Y_c_mean) / (X_c_mean/Y_c_mean)
R_t = X_t_mean / Y_t_mean
R_c = X_c_mean / Y_c_mean
sigma4 = np.sqrt(var_t/R_c**2 + R_t**2/R_c**4 * var_c)
left4 = delta4 - 100 * t_crit * sigma4
right4 = delta4 + 100 * t_crit * sigma4
print(sigma4)
print(delta4)
print(f'[{left4:.3f}, {right4:.3f}]')
print('\n\n\n')

# Summary line with all four intervals in order 1-4.
print(f'[{left1:.3f}, {right1:.3f}] [{left2:.3f}, {right2:.3f}] [{left3:.3f}, {right3:.3f}] [{left4:.3f}, {right4:.3f}]')

1.9599760640638682
9.927658474437857
23.433367596802327
[3.975, 42.891]




0.003493864194626297
0.823027103484432
[0.138, 1.508]




1.6991013067148881
3.9824757316729347
[0.652, 7.313]




0.002429934490873236
0.5687435358170613
[0.092, 1.045]




[3.975, 42.891] [0.138, 1.508] [0.652, 7.313] [0.092, 1.045]
