In [1]:
import pandas as pd
import numpy as np

# 1. Генерация данных (как в вашем примере)
users_num = 10000

df = pd.DataFrame()
df['user'] = range(users_num)
df['group'] = np.random.rand(users_num) < 0.5

df['user_mean'] = np.random.lognormal(mean=np.log(1000), sigma=0.5, size=users_num)
df['cost_before'] = np.abs(
    df['user_mean'] + np.random.normal(0, 100, size=users_num)
)
df['cost'] = np.abs(
    df['user_mean'] + np.random.normal(50, 100, size=users_num)
)


In [2]:
# 2. Расчет параметра theta для CUPED
covariance = df['cost_before'].cov(df['cost'])
variance_before = df['cost_before'].var()
theta = covariance / variance_before

In [3]:
df['cost_cuped'] = df['cost'] - theta * df['cost_before']

In [4]:
variance_cost = df['cost'].var()
variance_cuped = df['cost_cuped'].var()
reduction_factor = variance_cost / variance_cuped

In [5]:
reduction_factor

18.890808402708796

In [6]:
import hashlib

# 1. Генерация данных (как в вашем примере)
np.random.seed(6)

users_num = 10000
mean_user_ctr = 0.2
beta = 20
alpha = mean_user_ctr * beta / (1 - mean_user_ctr)

df = pd.DataFrame()
df['user'] = range(users_num)
df['group'] = np.random.rand(users_num) < 0.5

df['base_user_ctr'] = np.random.beta(alpha, beta, size=users_num)
df['views'] = np.random.lognormal(mean=1, sigma=1, size=users_num).astype(int) + 1
df['clicks'] = np.random.binomial(df['views'], df['base_user_ctr'])

In [7]:
buckets_num = 100
salt = 'my_salt'

def get_bucket(user_id, salt, buckets_num):
    user_str = str(user_id) + salt
    hash_object = hashlib.md5(user_str.encode())
    hex_dig = hash_object.hexdigest()
    bucket = int(hex_dig, 16) % buckets_num
    return bucket

In [8]:
df['bucket'] = df['user'].apply(lambda user_id: get_bucket(user_id, salt, buckets_num))

In [9]:
# Групповой CTR (агрегированный по бакету)
bucket_data = df.groupby('bucket').agg({'clicks': 'sum', 'views': 'sum'})
bucket_data['group_ctr'] = bucket_data['clicks'] / bucket_data['views']

In [10]:
# Обычный средний CTR пользователей в бакете
user_ctr = df.groupby('user').agg({'clicks': 'sum', 'views': 'sum'})
user_ctr['user_ctr'] = user_ctr['clicks'] / user_ctr['views']
df = df.merge(user_ctr['user_ctr'], left_on='user', right_index=True)

In [11]:
bucket_user_ctr = df.groupby('bucket')['user_ctr'].mean()
bucket_data['avg_user_ctr'] = bucket_user_ctr

In [12]:
# Фильтрация по группе
bucket_data_group = bucket_data[df.groupby('bucket')['group'].first() == True]

In [13]:
std_group_ctr = bucket_data_group['group_ctr'].std()
std_avg_user_ctr = bucket_data_group['avg_user_ctr'].std()

In [14]:
std_group_ctr, std_avg_user_ctr

(0.01636165410394264, 0.025414323475555528)