In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pymysql
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.family'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

In [3]:
con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='lxy021527', db='ABTest', use_unicode=True, charset='utf8')
data = pd.read_sql('select * from abtest', con=con)
data.head()

Unnamed: 0,id,组,曝光量,点击量,下单量,购买店铺数,店铺类型,riqi
0,17437864,A,105,16,15,20,A,2020-05-13
1,17402857,A,147,21,13,7,A,2020-05-13
2,17017891,A,59,10,9,4,A,2020-05-13
3,15792190,A,216,39,6,2,C,2020-05-13
4,8726788,A,51,9,6,1,A,2020-05-13


# 一类指标假设检验

In [4]:
# 抽取A类店铺一天数据
df = data.loc[(data.riqi=='2020-05-14') & (data.店铺类型 == 'A'),['组','下单量']]
df.head()

Unnamed: 0,组,下单量
62219,A,18
62220,A,15
62221,A,11
62222,A,10
62223,A,7


In [5]:
# 查看每组样本量
df.groupby('组')['下单量'].count()

组
A    10000
B     9468
C     1038
D    10532
Name: 下单量, dtype: int64

In [6]:
# C组样本悬殊，故C组不参与检验
df = df[df.组 != 'C']
df.组.unique()

array(['A', 'D', 'B'], dtype=object)

In [7]:
df.groupby('组')['下单量'].mean()

组
A    0.119500
B    0.705957
D    0.805640
Name: 下单量, dtype: float64

In [11]:
# 统计对照组（D组）的标准差
df_std = data.loc[data.店铺类型=='A', ['组','riqi','下单量']]
std = df_std[df_std.组=='D'].groupby('riqi')['下单量'].mean().std()
std
# 正态分布的经验解释是一个标准差内包含68.2%的数据，2个标准差内是95.5%的数据，3个标准差内是99.7%的数据
# 数据在3个标准差内波动为正常现象

0.09789082484362138

In [12]:
# 为让A类店铺无明显感知利润下降，我们设置A类店铺下降阈值为2个标准差
muzhicha = std*2
muzhicha

0.19578164968724276

## 求AD组

In [13]:
alpha = 0.05
# H0: μD - μA >= 0.195, H1: μD - μA < 0.195

计算统计量 $xbar_D - xbar_A$

In [14]:
dif_AD = df.下单量[df.组=='D'].mean() - df.下单量[df.组=='A'].mean()
dif_AD

0.6861399544246107

求$s^2_A/n_A + s^2_D/n_D$

In [15]:
varsum_AD = df.下单量[df.组=='A'].var()/df.下单量[df.组=='D'].count() + df.下单量[df.组=='D'].var()/df.下单量[df.组=='D'].count()
varsum_AD

0.0004805813046421241

$xbar_D - xbar_A$ ~ N(0.195, varsum_AD)

计算dif_AD在相应的分布的概率p

In [16]:
p_A = stats.norm.cdf(dif_AD, loc=muzhicha, scale=np.sqrt(varsum_AD))
p_A

1.0

## 求BD组

In [18]:
dif_BD = df.下单量[df.组=='D'].mean() - df.下单量[df.组=='B'].mean()
varsum_BD = df.下单量[df.组=='B'].var()/df.下单量[df.组=='B'].count() + df.下单量[df.组=='D'].var()/df.下单量[df.组=='D'].count()
p_B = stats.norm.cdf(dif_BD, loc=muzhicha, scale=np.sqrt(varsum_BD))
p_B

0.00014563245658988502

判断结果

In [19]:
if (p_A < alpha) & (p_B < alpha):
    if dif_A < dif_B:
        print('A策略对A类店铺影响小')
    else:
        print('B策略对A类店铺影响小')
elif p_A < alpha:
    print('A策略对A类店铺影响小于阈值' + str(muzhicha))
elif p_B < alpha:
    print('B策略对A类店铺影响小于阈值' + str(muzhicha))
else:
    print('A,B策略对A类店铺影响超过阈值' + str(muzhicha))

B策略对A类店铺影响小于阈值0.19578164968724276


# 二类指标假设检验

In [20]:
df_C = data.loc[(data.riqi=='2020-05-14') & (data.店铺类型=='C'), ['组', '下单量']]
df_C.head()

Unnamed: 0,组,下单量
62229,A,4
62258,A,2
62260,A,2
62352,A,1
62354,A,1


In [22]:
df_C.groupby('组')['下单量'].count()

组
A    10000
B    10000
C     1183
D    10000
Name: 下单量, dtype: int64

In [24]:
# 去除C组，未达到样本量
df_C = df_C[df_C['组'] != 'C']

In [25]:
df_C.groupby('组')['下单量'].mean()

组
A    0.0064
B    0.4697
D    0.1318
Name: 下单量, dtype: float64

计算提升值 $μ_A -μ_B$

In [26]:
tisheng = data[(data.组=='D') & (data.店铺类型=='C')].groupby('riqi')['下单量'].mean().mean()*0.3
tisheng

0.031065555555555555

ABTest封装函数

In [37]:
def abtest(df: pd.DataFrame, alpha=0.05, group_col: str=None, value_col: str=None):
    '''
    :param df: 被分析DataFrame对象
    :param alpha: 临界值
    :param group_col: 组列的%%latex字，默认为df的第一列
    :param value_col: 值列的名字，默认为df的第2列
    :return:best_group_name,pdf
        best_group_name: 最优解
        pdf: 最优组与其他组的差异性
    '''
    # 列名
    if not group_col:
        group_col = df.columns[0]
    if not value_col:
        value_col = df.columns[1]
        
    # 寻找最优组与最优质值
    best_group_name = df.groupby(group_col)[value_col].mean().sort_values(ascending=False).index.tolist()[0]
    best_group_values = df[df[group_col] == best_group_name][value_col]  # 最优组的values
    # 去除最优组的组名
    group_names = list(set(df[group_col].unique().tolist()) - set(best_group_name))
    # 初始化返回数据
    pdf = pd.DataFrame(columns=[group_col, 'mean', 'pvalue', 'ptype'])
    # 计算差异性
    for group_name in group_names:
        group_values = df[df[group_col] == group_name][value_col]
        
        dif = best_group_values.mean() - group_values.mean()
        var = best_group_values.var()/best_group_values.count() + group_values.var()/group_values.count()
        
        # Z检验
        pvalue = 1 - stats.norm.cdf(dif, loc=tisheng, scale=np.sqrt(var))
        
        if pvalue >= alpha:
            ptype = '无显著差异'
        else:
            ptype = '有显著差异'
        # 添加数据
        pdf.loc[pdf.shape[0]] = {group_col:group_name, 'mean':group_values.mean(), 'pvalue':pvalue, 'ptype':ptype}
        
    return best_group_name, best_group_values.mean(), pdf

In [38]:
abtest(df_C)

('B',
 0.4697,
    组    mean  pvalue  ptype
 0  D  0.1318     0.0  有显著差异
 1  A  0.0064     0.0  有显著差异)