In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandahouse
import pandas as pd
import seaborn as sns
from scipy import stats
%matplotlib inline

In [4]:
# ClickHouse connection settings.
# Values are read from environment variables so that real credentials never
# have to be stored in the notebook itself; when a variable is not set, the
# original placeholder '*' is used, which keeps the previous default behaviour.
connection = {
    'host': os.environ.get('CLICKHOUSE_HOST', '*'),
    'database': os.environ.get('CLICKHOUSE_DATABASE', '*'),
    'user': os.environ.get('CLICKHOUSE_USER', '*'),
    'password': os.environ.get('CLICKHOUSE_PASSWORD', '*'),
}
# Significance level the t-test p-values are compared against.
ALFA = 0.05

## Линеаризованные лайки для групп 0 (контроль) и 3 (тест)

In [12]:
# Per-user likes, views and CTR (likes / views) for experiment groups 0 and 3
# over the dates 2024-01-26 .. 2024-02-01.
q = """
SELECT exp_group, 
    user_id,
    sum(action = 'like') as likes,
    sum(action = 'view') as views,
    likes/views as ctr
FROM simulator_20240220.feed_actions 
WHERE toDate(time) between '2024-01-26' and '2024-02-01'
    and exp_group in (0,3)
GROUP BY exp_group, user_id
"""

df1 = pandahouse.read_clickhouse(q, connection=connection)

# Non-null value counts of every column, split by experiment group.
counts_by_group = df1.groupby('exp_group').count()
counts_by_group

Unnamed: 0_level_0,user_id,likes,views,ctr
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9920,9920,9920,9920
3,10002,10002,10002,10002


In [13]:
# Overall CTR of the control group (0): total likes divided by total views.
control_rows = df1[df1['exp_group'] == 0]
ctr_control = control_rows['likes'].sum() / control_rows['views'].sum()

# Linearized likes per user: likes - CTR_control * views.
df1['linearized_likes'] = df1['likes'] - df1['views'] * ctr_control
df1

Unnamed: 0,exp_group,user_id,likes,views,ctr,linearized_likes
0,3,115383,12,44,0.272727,2.767128
1,3,123580,2,11,0.181818,-0.308218
2,0,4944,8,41,0.195122,-0.603358
3,0,4504,5,15,0.333333,1.852430
4,0,121508,6,25,0.240000,0.754050
...,...,...,...,...,...,...
19917,3,13781,10,27,0.370370,4.334374
19918,3,130651,18,98,0.183673,-2.564123
19919,0,3681,7,55,0.127273,-4.541090
19920,3,122008,5,41,0.121951,-3.603358


In [14]:
# Plain per-user CTR: t-test with unequal variances, group 0 vs group 3.
control_sample = df1.loc[df1['exp_group'] == 0, 'ctr']
test_sample = df1.loc[df1['exp_group'] == 3, 'ctr']
_, p_value_tt = stats.ttest_ind(control_sample, test_sample, equal_var=False)
print(f'Т-тест {p_value_tt}. Отличие {"не " if p_value_tt > ALFA else ""}прокрасилось')

Т-тест 6.216047483062228e-44. Отличие прокрасилось


In [15]:
# Linearized likes: t-test with unequal variances, group 0 vs group 3.
control_sample = df1.loc[df1['exp_group'] == 0, 'linearized_likes']
test_sample = df1.loc[df1['exp_group'] == 3, 'linearized_likes']
_, p_value_tt = stats.ttest_ind(control_sample, test_sample, equal_var=False)
print(f'Т-тест {p_value_tt}. Отличие {"не " if p_value_tt > ALFA else ""}прокрасилось')

Т-тест 1.4918137745326139e-58. Отличие прокрасилось


## На линеаризованных лайках p-value стало на 14 порядков меньше, чем на обычном CTR (≈1.5e-58 против ≈6.2e-44)

## Линеаризованные лайки для групп 1 (контроль) и 2 (тест)

In [16]:
# Per-user likes, views and CTR (likes / views) for experiment groups 1 and 2
# over the dates 2024-01-26 .. 2024-02-01.
q = """
SELECT exp_group, 
    user_id,
    sum(action = 'like') as likes,
    sum(action = 'view') as views,
    likes/views as ctr
FROM simulator_20240220.feed_actions 
WHERE toDate(time) between '2024-01-26' and '2024-02-01'
    and exp_group in (1,2)
GROUP BY exp_group, user_id
"""

df2 = pandahouse.read_clickhouse(q, connection=connection)

# Non-null value counts of every column, split by experiment group.
counts_by_group = df2.groupby('exp_group').count()
counts_by_group

Unnamed: 0_level_0,user_id,likes,views,ctr
exp_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,10020,10020,10020,10020
2,9877,9877,9877,9877


In [17]:
# Overall CTR of the control group (1): total likes divided by total views.
control_rows = df2[df2['exp_group'] == 1]
ctr_control = control_rows['likes'].sum() / control_rows['views'].sum()

# Linearized likes per user: likes - CTR_control * views.
df2['linearized_likes'] = df2['likes'] - df2['views'] * ctr_control
df2

Unnamed: 0,exp_group,user_id,likes,views,ctr,linearized_likes
0,1,109963,3,15,0.200000,-0.144062
1,1,26117,32,141,0.226950,2.445813
2,1,138232,18,73,0.246575,2.698896
3,1,18392,7,32,0.218750,0.292667
4,1,26295,39,141,0.276596,9.445813
...,...,...,...,...,...,...
19892,2,17146,0,10,0.000000,-2.096042
19893,1,30872,11,59,0.186441,-1.366646
19894,2,25383,42,127,0.330709,15.380271
19895,2,139020,17,41,0.414634,8.406229


In [18]:
# Plain per-user CTR: t-test with unequal variances, group 1 vs group 2.
control_sample = df2.loc[df2['exp_group'] == 1, 'ctr']
test_sample = df2.loc[df2['exp_group'] == 2, 'ctr']
_, p_value_tt = stats.ttest_ind(control_sample, test_sample, equal_var=False)
print(f'Т-тест {p_value_tt}. Отличие {"не " if p_value_tt > ALFA else ""}прокрасилось')

Т-тест 0.685373331140751. Отличие не прокрасилось


In [19]:
# Linearized likes: t-test with unequal variances, group 1 vs group 2.
control_sample = df2.loc[df2['exp_group'] == 1, 'linearized_likes']
test_sample = df2.loc[df2['exp_group'] == 2, 'linearized_likes']
_, p_value_tt = stats.ttest_ind(control_sample, test_sample, equal_var=False)
print(f'Т-тест {p_value_tt}. Отличие {"не " if p_value_tt > ALFA else ""}прокрасилось')

Т-тест 2.9805064038667945e-09. Отличие прокрасилось


## В этом тесте линеаризованные лайки показали себя гораздо лучше, сразу прокрасив т-тест.

### Всё можно было красиво причесать и оформить классами или хотя бы функциями, но времени катастрофически не хватало :(