In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
from scipy import stats
# Silence every warning for the rest of the session (this also hides library deprecation notices).
warnings.filterwarnings('ignore')
# Use the SimHei typeface so Chinese text in matplotlib figures is rendered.
plt.rcParams['font.family'] = 'SimHei'
# Do not use the Unicode minus glyph on axes; fall back to an ASCII hyphen.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Load the raw A/B-test table from MySQL.
# Credentials are read from the environment so that no password is stored in the
# notebook: ABTEST_DB_PASSWORD is required; host, port and user fall back to the
# values this notebook was originally run with.
con = pymysql.connect(
    host=os.environ.get('ABTEST_DB_HOST', '127.0.0.1'),
    port=int(os.environ.get('ABTEST_DB_PORT', '3306')),
    user=os.environ.get('ABTEST_DB_USER', 'root'),
    passwd=os.environ['ABTEST_DB_PASSWORD'],
    db='ABTest',
    use_unicode=True,
    charset='utf8',
)
try:
    data = pd.read_sql('select * from ab_data', con=con)
finally:
    # Always release the connection, even if the query fails.
    con.close()
data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [3]:
# Add a 'date' column: the timestamp parsed and re-formatted as a '%Y-%m-%d' string.
# NOTE(review): `datetime` is imported here but not referenced in this cell.
import datetime
data['date'] = pd.to_datetime(data['timestamp']).dt.strftime('%Y-%m-%d')
data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,date
0,851104,2017-01-21 22:11:48.556739,control,old_page,0,2017-01-21
1,804228,2017-01-12 08:01:45.159739,control,old_page,0,2017-01-12
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0,2017-01-11
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0,2017-01-08
4,864975,2017-01-21 01:52:26.210827,control,old_page,1,2017-01-21


# 检验指标确定

**一类指标 人均停留时长，保证一类指标不下降或在正常波动范围内**

**二类指标 banner位点击率**

# 确定检验统计量

**一类指标统计量 人均停留时长之差**

**二类指标统计量 广告点击率 = 点击人数/曝光人数， 点击率之差**

# 埋点收集数据

**在广告位置 曝光和点击收集**

|埋点事件|Banner曝光|Banner点击|用户页面停留时长|
--|--|--|--|
常规埋点属性|事件ID|Confirm_id|
--|事件位置|Page_id|
--|--|Block_id|
--|--|Seat_id|
--|来源页面|From|
--|URL|URL|
--|用户属性|User_id|
--|--|用户设备
--|--|用户位置
特殊属性|曝光|点击|停留时长
--|--|--|开始时间
--|--|--|结束时间

# 确定$H_0,H_1$

**一类指标**

**$H_0$ : control_stime - treatment_stime >= 2*std(control_stime)**

**$H_1$ : control_stime - treatment_stime < 2*std(control_stime)**

**二类指标**

**$H_0$ : treatment_p - control_p <= 0**

**$H_1$ : treatment_p - control_p > 0**

# 确定显著性水平α

**一类错误使用默认值α=0.05**

**二类错误使用默认值β=0.2**

# 样本量计算

一类指标无数据，不计算

二类指标为单侧（右侧）检验，样本量计算公式为：

**$n = p_0(1-p_0)\left(\dfrac{z_{1-\alpha} + z_{1-\beta}\sqrt{\dfrac{p(1-p)}{p_0(1-p_0)}}}{p-p_0}\right)^2$**

**其中，treatment_p 为 $p$，control_p 为 $p_0$**

In [4]:
# Conventional defaults: type-I error rate α = 0.05 and type-II error rate β = 0.2.
alpha = 0.05
beta = 0.2

求$z_{1-α}$和$z_{1-β}$

In [24]:
# Standard-normal upper-tail quantiles: isf(q) is the point with probability q to its
# right, which gives z_{1-α} and z_{1-β} for the sample-size formula.
z_alpha, z_beta = (stats.norm.isf(tail, loc=0, scale=1) for tail in (alpha, beta))
print(z_alpha, z_beta)

1.6448536269514729 0.8416212335729142


求$p_0$

In [20]:
# Inspect the column dtypes. Per the original note, 'converted' arrives as object,
# so it is cast to int before any arithmetic is done on it.
data.converted = data.converted.astype('int')
print(data.dtypes)

user_id         object
timestamp       object
group           object
landing_page    object
converted        int32
date            object
dtype: object


In [22]:
# Click-through rate of the control group on the old page. `converted` only takes the
# values 0 and 1, so the column mean equals the proportion of converted users.
control_p = data.loc[
    (data['group'] == 'control') & (data['landing_page'] == 'old_page'),
    'converted',
].mean()
control_p

0.1203863045004612

In [23]:
# Bernoulli variance term of the control rate: p0 * (1 - p0).
control_p_1 = control_p * (1 - control_p)
control_p_1

0.10589344218918344

In [25]:
# H1 is treatment_p - control_p > 0. To compute a sample size a concrete non-zero
# difference must be chosen; the smaller its absolute value, the larger the required sample.
# Here the target is an absolute lift of 0.01: treatment_p - control_p > 0.01.
treatment_p = control_p + 0.01
treatment_p

0.1303863045004612

In [26]:
# Bernoulli variance term of the assumed treatment rate: p * (1 - p).
treatment_p_1 = treatment_p * (1- treatment_p)
treatment_p_1

0.11338571609917422

**计算样本量n**

In [28]:
# Minimum sample size for the right-tailed test, evaluated from the formula stated above:
# n = p0(1-p0) * ((z_{1-α} + z_{1-β} * sqrt(p(1-p) / (p0(1-p0)))) / (p - p0))^2
n = control_p_1 * np.power((z_alpha + z_beta * np.sqrt(treatment_p_1/control_p_1))/(treatment_p-control_p),2)
n

6701.938803160921

In [31]:
# Count rows per (group, landing_page) cell to check whether the available data
# reaches the minimum sample size computed above.
data.groupby(['group','landing_page'])['user_id'].count()

group      landing_page
control    new_page          1928
           old_page        145274
treatment  new_page        145311
           old_page          1965
Name: user_id, dtype: int64

# 假设检验

1. 计算统计量
2. 计算统计量的显著性P值
3. 用统计量的显著性P值与显著性α比较做决策

In [35]:
# Conversion rate of each (group, landing_page) cell, restricted to the single day 2017-01-18.
df = data[data.date=='2017-01-18'].groupby(['group','landing_page'], as_index=False)['converted'].mean()
df

Unnamed: 0,group,landing_page,converted
0,control,new_page,0.114583
1,control,old_page,0.124807
2,treatment,new_page,0.124792
3,treatment,old_page,0.105769


In [49]:
# Test statistic: conversion rate of treatment/new_page minus that of control/old_page.
# The two rates are looked up by their (group, landing_page) label instead of by row
# position, so the result does not depend on how the rows of `df` happen to be ordered
# or on every group/page combination being present.
rate_by_cell = df.set_index(['group', 'landing_page'])['converted']
statistic_t = rate_by_cell[('treatment', 'new_page')] - rate_by_cell[('control', 'old_page')]

求 $\sigma = \sqrt{\dfrac{p_0(1-p_0)}{n_0} + \dfrac{p(1-p)}{n}}$

In [50]:
# Sample sizes on 2017-01-18: control users on the old page (n1) and
# treatment users on the new page (n2), counted over non-null `converted` values.
n1 = data.loc[
    (data['date'] == '2017-01-18') & (data['group'] == 'control') & (data['landing_page'] == 'old_page'),
    'converted',
].count()
n2 = data.loc[
    (data['date'] == '2017-01-18') & (data['group'] == 'treatment') & (data['landing_page'] == 'new_page'),
    'converted',
].count()
display(n1, n2)

6482

6603

In [51]:
# Standard error of the difference between the two proportions (unpooled):
# sqrt(p0(1-p0)/n1 + p(1-p)/n2). In the `df` displayed above, row 1 is
# control/old_page and row 2 is treatment/new_page.
sigema = np.sqrt(df.converted[1]*(1-df.converted[1])/n1 + df.converted[2]*(1-df.converted[2])/n2)
sigema

0.005778590893820374

H0决定拒绝域：H0 为 <=，说明拒绝域在右侧，所以先用 stats.norm.cdf 求出左侧累积概率，再用 1 减去它得到右侧的显著性P值

H0: treatment_p - control_p <= 0

In [52]:
# Right-tailed p-value: probability that a Normal(0, sigema) variable exceeds the
# observed difference, computed as 1 minus the left cumulative probability.
statistic_p = 1 - stats.norm.cdf(statistic_t, 0, sigema)
statistic_p

0.5010629741266761

In [53]:
# Decision rule: when the p-value exceeds α, H0 (treatment rate <= control rate) is not
# rejected; otherwise H0 is rejected in favour of H1 (treatment rate > control rate).
if (statistic_p > alpha):
    print('显著性P > α 实验组点击率 <= 对照组点击率')
else:
    print('显著性P < α 实验组点击率 > 对照组点击率')

显著性P > α 实验组点击率 <= 对照组点击率


In [79]:
# 封装如下：
def ABTest_P(df: pd.DataFrame, group_col: str=None, value_col: str=None, alpha: float=0.05):
    '''
    Two-proportion z-test between the two groups contained in ``df``.

    The groups are taken in the order ``DataFrame.groupby`` sorts the keys of
    ``group_col``. The statistic is (rate of the first group) - (rate of the second
    group), and the left/right tails refer to that difference. The standard error
    is the unpooled one: sqrt(p_a(1-p_a)/n_a + p_b(1-p_b)/n_b).

    :param df: DataFrame to analyse
    :param group_col: name of the grouping column; defaults to the first column of df
    :param value_col: name of the 0/1 outcome column; defaults to the second column of df
    :param alpha: significance level used to label each test
    :return: DataFrame with one row per test ('左侧' left-tailed, '右侧' right-tailed,
             '双侧' two-sided) and the columns
             ['p', 'p0', '统计量', '检测', '显著性p值', '结果'], where 'p' and 'p0' hold
             the names of the first and second group and '结果' is the plain string
             '显著' or '不显著'
    :raises ValueError: if ``group_col`` does not contain exactly two groups
    '''
    # Fall back to positional column defaults.
    if not group_col:
        group_col = df.columns[0]
    if not value_col:
        value_col = df.columns[1]

    # One pass gives both the conversion rate and the sample size of every group.
    summary = df.groupby(group_col)[value_col].agg(['mean', 'count'])
    if len(summary) != 2:
        raise ValueError(
            'ABTest_P needs exactly two groups in column %r, found %d' % (group_col, len(summary))
        )

    name_a, name_b = summary.index
    rate_a, rate_b = summary['mean']
    n_a, n_b = summary['count']

    tongjiliang = rate_a - rate_b
    diff_error = np.sqrt(rate_a * (1 - rate_a) / n_a + rate_b * (1 - rate_b) / n_b)

    left_p = stats.norm.cdf(tongjiliang, 0, diff_error)
    # sf is the upper-tail probability; it avoids the cancellation in 1 - cdf.
    right_p = stats.norm.sf(tongjiliang, 0, diff_error)
    # Two-sided p-value: twice the smaller tail, capped at 1.
    two_sided_p = np.minimum(2 * np.minimum(left_p, right_p), 1.0)

    # Report all three alternatives so the caller can pick the one matching H1.
    rows = [
        [name_a, name_b, tongjiliang, tail, p_value, '显著' if p_value < alpha else '不显著']
        for tail, p_value in (('左侧', left_p), ('右侧', right_p), ('双侧', two_sided_p))
    ]

    return pd.DataFrame(rows, columns=['p', 'p0', '统计量', '检测', '显著性p值', '结果'])

In [80]:
# Rows from 2017-01-18 that belong to one of the two valid experiment cells
# (control on the old page, treatment on the new page), keeping only the
# group label and the conversion flag.
temp = data.loc[
    (data['date'] == '2017-01-18')
    & (
        ((data['group'] == 'control') & (data['landing_page'] == 'old_page'))
        | ((data['group'] == 'treatment') & (data['landing_page'] == 'new_page'))
    ),
    ['group', 'converted'],
]
temp.head()

Unnamed: 0,group,converted
10,treatment,0
45,control,0
51,control,1
83,control,0
89,control,1


In [81]:
# Run the packaged test on the filtered rows; with no column names given, the function
# uses the first column ('group') as the group and the second ('converted') as the value.
ABTest_P(temp)

Unnamed: 0,p,p0,统计量,检测,显著性p值,结果
0,control,treatment,1.5e-05,左侧,0.501063,不显著
1,control,treatment,1.5e-05,右侧,0.498937,不显著
2,control,treatment,1.5e-05,双侧,0.997874,不显著
