# read data

In [1]:
import pandas as pd

In [2]:
from scipy.stats import norm

In [3]:
df = pd.read_csv("AB_test_data.csv")

In [4]:
df.head()

Unnamed: 0,Variant,purchase_TF,date,id
0,A,False,2019-12-26,0x6f9421
1,A,False,2019-08-16,0x59d442
2,A,True,2019-03-18,0x6db8f8
3,A,False,2019-02-13,0x68245d
4,A,False,2019-09-28,0x28566e


# Significance level (alpha) and power

In [5]:
alpha = 0.05
power = 0.8
norm.ppf(1-alpha)

1.6448536269514722

In [6]:
norm.ppf(1-alpha/2)

1.959963984540054

In [7]:
norm.ppf(1-power)

-0.8416212335729143

In [8]:
df.purchase_TF.value_counts()

False    46416
True      8584
Name: purchase_TF, dtype: int64

# Overview data

In [9]:
df[df.Variant == 'A'].purchase_TF.count()

50000

In [10]:
df[df.Variant == 'B'].purchase_TF.count()

5000

In [11]:
df[df.Variant == 'A'].purchase_TF.value_counts()

False    42397
True      7603
Name: purchase_TF, dtype: int64

In [12]:
df[df.Variant == 'B'].purchase_TF.value_counts()

False    4019
True      981
Name: purchase_TF, dtype: int64

# Calculating the optimal sample size

In [13]:
from numpy import sqrt

In [14]:
# Design parameters for a two-sided test at level alpha with the given power.
alpha = 0.05
power = 0.8

# Standard-normal critical values. t_beta is computed from `power` rather than
# typed in as a rounded literal, so it stays in sync if `power` is changed.
t_alpha_d2 = norm.ppf(1 - alpha / 2)
t_beta = norm.ppf(power)

# Observed purchase rates per variant. The mean of the boolean purchase_TF
# column is the share of True rows, so no group size has to be hard-coded.
p0 = df[df.Variant == 'A'].purchase_TF.mean()
p1 = df[df.Variant == 'B'].purchase_TF.mean()

# Average of the two rates and the difference between them, both used by the
# sample-size formula in the following cell.
p_bar = (p0 + p1) / 2
delta = p1 - p0

In [15]:
t_alpha_d2

1.959963984540054

In [16]:
# Sample size per group for detecting `delta` at the chosen alpha and power.
# The first term uses the averaged rate p_bar, the second the two separate rates.
averaged_rate_term = t_alpha_d2 * sqrt(2 * p_bar * (1 - p_bar))
separate_rates_term = t_beta * sqrt((p0 * (1 - p0)) + (p1 * (1 - p1)))
(averaged_rate_term + separate_rates_term) ** 2 / delta / delta

1157.485452132069

# z-score of the full Variant B sample against the Variant A rate

In [17]:
df_A= df[df.Variant=='A']

In [18]:
df_B = df[df.Variant=='B']

In [19]:
# One-sample z statistic: Variant B's purchase rate (p1) against Variant A's
# rate (p0) treated as the reference value, using all Variant B rows.
# Reuses p0, p1 and len(df_B) instead of re-typing their values as literals,
# so the statistic tracks the data if the input file changes.
z = (p1 - p0) / sqrt(p0 * (1 - p0) / len(df_B))
z

8.692151285198767

# Calculating the z-score for 10 random subsamples of Variant B

In [20]:
def z_score(data, number, p=0.15206, random_state=None):
    """Return the one-sample z statistic for a random subsample's purchase rate.

    Draws ``number`` rows from ``data`` without replacement and compares the
    share of rows with a truthy ``purchase_TF`` against the reference rate ``p``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to sample from; must contain a ``purchase_TF`` column.
    number : int
        Subsample size. Must be positive; pandas raises if it exceeds
        ``len(data)``.
    p : float, default 0.15206
        Reference purchase rate, strictly between 0 and 1.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed seed to make the draw
        reproducible; ``None`` draws a fresh random subsample on every call.

    Returns
    -------
    float
        ``(p_sample - p) / sqrt(p * (1 - p) / number)``.

    Raises
    ------
    ValueError
        If ``number`` is not positive or ``p`` is not strictly inside (0, 1).
    """
    if number <= 0:
        raise ValueError("number must be a positive sample size")
    if not 0 < p < 1:
        raise ValueError("p must be strictly between 0 and 1")

    sampled = data.sample(n=number, random_state=random_state)

    # Observed purchase rate in the subsample.
    p_sample = sampled.purchase_TF.sum() / number

    # Standard error under the reference rate p.
    standard_error = sqrt(p * (1 - p) / number)

    return (p_sample - p) / standard_error

In [21]:
z_score(df_B,1157,p=0.15206)

5.572847978502808

In [22]:
z_score(df_B,1157,p=0.15206)

2.625402591558127

In [23]:
z_score(df_B,1157,p=0.15206)

4.508492699883895

In [24]:
z_score(df_B,1157,p=0.15206)

5.245354046620067

In [25]:
z_score(df_B,1157,p=0.15206)

1.970414727792642

In [26]:
z_score(df_B,1157,p=0.15206)

4.4266192169132115

In [27]:
z_score(df_B,1157,p=0.15206)

4.099125285030468

In [28]:
z_score(df_B,1157,p=0.15206)

5.654721461473494

In [29]:
z_score(df_B,1157,p=0.15206)

5.16348056364938

In [30]:
z_score(df_B,1157,p=0.15206)

3.935378319089097

# Sequential probability ratio test (SPRT)

In [31]:
"""
p(xi=1)= 0.15206 under H0
p(xi=1)= 0.1962 under H1
a=.05
b=.2
"""

'\np(xi=1)= 0.15206 under H0\np(xi=1)= 0.1962 under H1\na=.05\nb=.2\n'

In [32]:
import numpy as np
np.log(.2)

-1.6094379124341003

In [33]:
def test(data,number,p_ho,p_h1):
    
    variant_B = data
    
    variant_B_sampled = variant_B.sample(n = number)

    total_sum=0
    i=0
    while -1.6 < total_sum <  2.99:     
        if variant_B_sampled.purchase_TF.iloc[i] == True:
            log = np.log(p_h1/p_ho)
        elif variant_B_sampled.purchase_TF.iloc[i] == False:
            log = np.log((1-p_h1)/(1-p_ho))
        total_sum = total_sum + log
        i = i+1
    return i, total_sum

In [34]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(438, 3.100198002544559)

In [35]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(275, 3.181154252416746)

In [36]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(59, -1.6125105774009008)

In [37]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(485, 3.054156475485399)

In [38]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(248, 3.082964140006827)

In [39]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(392, 3.092780133047136)

In [40]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(88, -1.6212392581041568)

In [41]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(155, 3.1215877975685626)

In [42]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(346, 3.0853622635497095)

In [43]:
test(df_B,1158,p_ho=0.15206,p_h1=0.1962)

(116, 3.0482729160625786)

In [45]:
import numpy as np
# Step-by-step SPRT trace over Variant B in its original row order.
p_ho = 0.15206  # purchase probability under H0
p_h1 = 0.1962   # purchase probability under H1

# Log-likelihood-ratio increment for a purchase / non-purchase; these depend
# only on the hypotheses, so they are computed once outside the loop.
log_lr_purchase = np.log(p_h1 / p_ho)
log_lr_no_purchase = np.log((1 - p_h1) / (1 - p_ho))

purchases = df_B.purchase_TF
i = 0
total_sum = 0
# Continue until the cumulative log-likelihood ratio leaves (-1.6, 2.99).
# The length guard stops the loop cleanly if the data runs out first, instead
# of raising IndexError.
while -1.6 < total_sum < 2.99 and i < len(purchases):
    log = log_lr_purchase if purchases.iloc[i] else log_lr_no_purchase
    total_sum = total_sum + log
    print(i, total_sum)
    i = i + 1

0 -0.05345939655658821
1 -0.10691879311317642
2 -0.16037818966976464
3 -0.21383758622635285
4 0.04102178110461929
5 0.29588114843559143
6 0.24242175187900322
7 0.188962355322415
8 0.1355029587658268
9 0.08204356220923859
10 0.028584165652650373
11 0.2834435329836225
12 0.2299841364270343
13 0.1765247398704461
14 0.12306534331385788
15 0.06960594675726967
16 0.016146550200681453
17 -0.03731284635590676
18 -0.09077224291249497
19 -0.14423163946908318
20 -0.1976910360256714
21 -0.2511504325822596
22 -0.3046098291388478
23 -0.35806922569543603
24 -0.41152862225202425
25 -0.1566692549210521
26 -0.21012865147764032
27 -0.26358804803422853
28 -0.31704744459081674
29 -0.37050684114740495
30 -0.42396623770399317
31 -0.4774256342605814
32 -0.5308850308171695
33 -0.5843444273737577
34 -0.6378038239303458
35 -0.691263220486934
36 -0.7447226170435222
37 -0.7981820136001103
38 -0.8516414101566985
39 -0.9051008067132866
40 -0.9585602032698748
41 -1.012019599826463
42 -1.065478996383051
43 -1.11893839