In [51]:
import os
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from scipy import stats

In [83]:
# Global matplotlib styling for every figure in this notebook.
# NOTE(review): 'font.serif' only matters if 'font.family' is set to 'serif',
# which this cell does not do — confirm whether that was intended.
plt.rcParams.update({
    # legend layout
    'legend.markerscale': 1.5,
    'legend.handletextpad': 0.5,
    'legend.labelspacing': 0.4,
    'legend.borderpad': 0.5,
    'legend.fontsize': 22,
    # fonts
    'font.size': 12,
    'font.serif': 'Times New Roman',
    # axes and ticks
    'axes.labelsize': 22,
    'axes.titlesize': 24,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    # default canvas size (inches)
    'figure.figsize': (10, 6),
})

### Task 1. AB-test with CUPED

- Метрика — средняя выручка с клиента.
- Ковариата — выручка пользователя за 4 недели до эксперимента.
- Даты эксперимента — с 2022-04-25 по 2022-05-02 (7 дней; 2022-05-02 не включается).

In [84]:
# Experiment assignment table and the raw sales log.
users_csv = '../data/experiment_users_cuped.csv'
sales_csv = '../data/2022-05-03T12_df_sales.csv'
df_users, df_sales = pd.read_csv(users_csv), pd.read_csv(sales_csv)

df_users.head()

Unnamed: 0,user_id,pilot
0,a9a6e8,0
1,23420a,0
2,cbc468,0
3,583c90,0
4,19ce47,0


In [85]:
# Parse the sale timestamp so it can be compared with datetime window bounds.
df_sales = df_sales.assign(date=pd.to_datetime(df_sales['date']))
df_sales.head(10)

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
0,1000001,2022-02-04 10:00:24,1,0,720,1c1543
1,1000002,2022-02-04 10:02:28,1,1,930,a9a6e8
2,1000003,2022-02-04 10:02:35,3,1,1980,23420a
3,1000004,2022-02-04 10:03:06,1,1,750,3e8ed5
4,1000005,2022-02-04 10:03:23,1,1,870,cbc468
5,1000006,2022-02-04 10:04:55,3,2,2400,583c90
6,1000007,2022-02-04 10:05:14,1,0,720,e3876e
7,1000008,2022-02-04 10:05:28,1,0,540,ee3fcb
8,1000009,2022-02-04 10:10:29,2,0,1560,67ee0e
9,1000010,2022-02-04 10:10:33,4,1,2730,19ce47


In [86]:
# Experiment window bounds; the end is one week after the start.
start_exp_dt = datetime(year=2022, month=4, day=25)
end_exp_dt = start_exp_dt + timedelta(weeks=1)
print(end_exp_dt)

2022-05-02 00:00:00


In [87]:
# Revenue per user over the experiment window [start_exp_dt, end_exp_dt).
is_exp_sale = (df_sales['date'] >= start_exp_dt) & (df_sales['date'] < end_exp_dt)
df_metrics = (
    df_sales.loc[is_exp_sale]
    .groupby('user_id')
    .agg(metric=('price', 'sum'))
)

In [88]:
# Preview the per-user experiment metric (indexed by user_id).
df_metrics.head()

Unnamed: 0_level_0,metric
user_id,Unnamed: 1_level_1
0000e4,840
000112,1380
0001ff,720
00045f,720
000470,2280


In [89]:
# The covariate window starts four weeks before the experiment start.
start_cov_dt = start_exp_dt - timedelta(weeks=4)
print(start_cov_dt)

2022-03-28 00:00:00


In [90]:
# Revenue per user over the pre-experiment window [start_cov_dt, start_exp_dt).
is_cov_sale = (df_sales['date'] >= start_cov_dt) & (df_sales['date'] < start_exp_dt)
df_cov = (
    df_sales.loc[is_cov_sale]
    .groupby('user_id')
    .agg(cov_4week=('price', 'sum'))
)
print('Size:', len(df_cov))
df_cov.head()

Size: 79808


Unnamed: 0_level_0,cov_4week
user_id,Unnamed: 1_level_1
0000d4,720
0000de,1320
0000e7,3840
000152,780
0001ff,720


In [91]:
# Left joins keep every experiment user, including those with no sales in a
# window; such users get NaN in the joined column.
df_data = df_users.merge(df_metrics.reset_index(), how='left', on='user_id')
df_exp = df_data.merge(df_cov.reset_index(), how='left', on='user_id')
print('Size:', len(df_exp))
df_exp.head()

Size: 109367


Unnamed: 0,user_id,pilot,metric,cov_4week
0,a9a6e8,0,930.0,900.0
1,23420a,0,,
2,cbc468,0,,
3,583c90,0,2490.0,7350.0
4,19ce47,0,,


In [92]:
# Share of missing values in each column of the joined table.
df_exp.isnull().mean()

user_id      0.000000
pilot        0.000000
metric       0.744448
cov_4week    0.270273
dtype: float64

In [93]:
# Replace every missing value with 0 (rebinding instead of mutating in place).
df_exp = df_exp.fillna(0)

In [94]:
# Correlation matrix between the experiment metric and the 4-week covariate.
df_exp[['metric', 'cov_4week']].corr()

Unnamed: 0,metric,cov_4week
metric,1.0,0.17654
cov_4week,0.17654,1.0


In [95]:
# Number of rows for each value of the pilot flag.
df_exp['pilot'].value_counts()

pilot
0    54736
1    54631
Name: count, dtype: int64

In [96]:
def calculate_theta(y_control, y_pilot, x_control, x_pilot):
    """Estimate the CUPED coefficient theta on the pooled data of both groups.

    theta = cov(x, y) / var(x), where both moments are taken from the same
    covariance matrix so they share one normalisation (ddof). Mixing
    ``np.cov`` (ddof=1) with ``np.var`` (ddof=0) would inflate theta by
    n / (n - 1).

    :param y_control: metric values during the experiment, control group.
    :param y_pilot: metric values during the experiment, pilot group.
    :param x_control: covariate values, control group.
    :param x_pilot: covariate values, pilot group.
    :return: theta; 0.0 when there are fewer than two observations or the
        covariate has zero variance (no adjustment is possible).
    """
    y = np.hstack([y_control, y_pilot])
    x = np.hstack([x_control, x_pilot])

    # A covariance needs at least two observations.
    if x.size < 2:
        return 0.0

    # 2x2 matrix: [0, 0] is var(x), [0, 1] is cov(x, y), same ddof for both.
    cov_matrix = np.cov(x, y)
    var_x = cov_matrix[0, 0]

    # Constant (or NaN-variance) covariate: dividing would give nan/inf.
    if not var_x > 0:
        return 0.0

    return cov_matrix[0, 1] / var_x

In [97]:
def check_cuped_test(df_control, df_pilot, covariate_column):
    """Run a two-sample t-test on CUPED-adjusted metrics and return the p-value.

    :param df_control: control-group rows with a 'metric' column and the
        covariate column.
    :param df_pilot: pilot-group rows with the same columns.
    :param covariate_column: name of the covariate column used for adjustment.
    :return: p-value of ``stats.ttest_ind`` on the adjusted metrics.
    """
    control_y, pilot_y = df_control['metric'], df_pilot['metric']
    control_x, pilot_x = df_control[covariate_column], df_pilot[covariate_column]

    # One theta estimated on both groups together, applied to each group.
    theta = calculate_theta(control_y, pilot_y, control_x, pilot_x)
    cuped_control = control_y - theta * control_x
    cuped_pilot = pilot_y - theta * pilot_x

    return stats.ttest_ind(cuped_control, cuped_pilot).pvalue

In [105]:
# Split users by the pilot flag and run the CUPED-adjusted t-test.
df_pilot = df_exp.loc[df_exp['pilot'].eq(1)]
df_control = df_exp.loc[df_exp['pilot'].eq(0)]

pvalue = check_cuped_test(df_control, df_pilot, covariate_column='cov_4week')
print(f"Result of test: {pvalue:.4f}")

Result of test: 0.0539


In [99]:
# df_check = pd.read_csv('../data/df_metrics_1000.csv')

# check = pd.merge(
#     df_check,
#     df_exp,
#     on='user_id',
#     how='left'
# )

# check['diff'] = check['cov'] - check['cov_4week']
# check[check['diff']>0]

### Task 2. Функция вычисления CUPED-метрики

In [135]:
import numpy as np
import pandas as pd


def calculate_cuped_metric(df_metric, df_cov):
    """Compute the CUPED-adjusted metric for every user.

    :param df_metric (pd.DataFrame): metric values during the experiment,
        columns ['user_id', 'metric'].
    :param df_cov (pd.DataFrame): covariate values, columns ['user_id', 'cov'].
    :return df: CUPED metric values, columns ['user_id', 'metric']. Only users
        present in both inputs are returned (inner join on 'user_id').
    """
    merged = df_metric.merge(df_cov, on='user_id')
    cov_values = merged['cov']

    # NOTE(review): the covariance below uses pandas' default ddof=1 while the
    # variance uses ddof=0. Kept as is because the reference answer for the
    # worked example in this notebook matches this exact formula — confirm
    # before harmonising.
    covariance = merged[['metric', 'cov']].cov().loc['metric', 'cov']
    theta = covariance / cov_values.var(ddof=0)

    # Centre the covariate so the adjustment leaves the metric mean unchanged.
    adjusted = merged['metric'] - theta * (cov_values - cov_values.mean())

    return merged.assign(metric=adjusted)[['user_id', 'metric']]

In [136]:
# Worked example for calculate_cuped_metric, checked against the reference answer.
# The inputs use their own names so this cell does not overwrite the Task 1
# `df_cov` table (which would break a later re-run of the merge cell).
example_metric = pd.DataFrame({'user_id': [1, 2, 3], 'metric': [2000, 2500, 3000]})
example_cov = pd.DataFrame({'user_id': [1, 2, 3], 'cov': [1100, 1500, 0]})
df = calculate_cuped_metric(example_metric, example_cov)

# Reference answer, given to two decimals — hence the absolute tolerance.
expected = pd.DataFrame({'user_id': [1, 2, 3], 'metric': [2159.53, 2933.01, 2407.46]})
np.testing.assert_allclose(df['metric'], expected['metric'], rtol=0, atol=0.01)
df

Unnamed: 0,user_id,metric
0,1,2159.530387
1,2,2933.01105
2,3,2407.458564
