In [2]:
# import libraries
import pandas as pd
import numpy as np

In [3]:
# read in the data
df = pd.read_csv("AB_test_data.csv")

In [4]:
# inspect data
df.head()

Unnamed: 0,Variant,purchase_TF,date,id
0,A,False,2019-12-26,0x6f9421
1,A,False,2019-08-16,0x59d442
2,A,True,2019-03-18,0x6db8f8
3,A,False,2019-02-13,0x68245d
4,A,False,2019-09-28,0x28566e


In [5]:
df['purchase_TF'].value_counts()

False    46416
True      8584
Name: purchase_TF, dtype: int64

In [6]:
df['Variant'].value_counts()

A    50000
B     5000
Name: Variant, dtype: int64

Hypothesis setup:
    - Null Hypothesis: Variant B and Variant A had the same conversion rates
    - Alternative hypothesis: Variant B had a higher conversion rate than Variant A

##### Conducting the test

In [7]:
# calculate our z score
p_treatment = df[df["Variant"]=="B"]['purchase_TF'].sum()/len(df[df["Variant"]=="B"]['purchase_TF'])
p_varA = df[df["Variant"]=="A"]['purchase_TF'].sum()/len(df[df["Variant"]=="A"]['purchase_TF'])
n = len(df[df["Variant"]=="B"]['purchase_TF'])
z = (p_treatment-p_varA)/(((p_varA*(1-p_varA))/n)**0.5)
if z > 1.64:
    print("We reject the null. The conversion rate of variant B is significantly higher than that of variant A.")
    print("Our Z score is {}.".format(z))
else:
    print("Test failed. The old version is not that different from the new in terms of conversion rate.")

We reject the null. The conversion rate of variant B is significantly higher than that of variant A.
Our Z score is 8.692151285198767.


With 95% confidence level, Z0.05 = 1.64. Reject null if z > 1.64. Since z is 8.7, **we reject the null hypothesis and conclude that at 95% confidence level, variant B generates more conversion than variant A.**

##### Optimal Sample Size

In [8]:
# Calculate optumal sample size
t_alpha = 1.96
t_beta = 0.842
p0 = p_varA
p1 = p_treatment
delta = p1-p0
p_bar = (p0+p1)/2

# plug into the formula
n_star = ((t_alpha*((2*p_bar*(1-p_bar))**.5)+(t_beta*((p0*(1-p0)+p1*(1-p1))**.5)))**2)*(delta**-2)
print("The optimal sample size for each segment is {}".format(n_star))

The optimal sample size for each segment is 1157.8288770933054


In [9]:
# seperate treatment and control groups
A = df[df["Variant"]=="A"]
B = df[df["Variant"]=="B"]

In [17]:
# test using 1-sample 

log = []
for i in range(10):
    n = 1158
    sample_A = A.sample(n=n)
    sample_B = B.sample(n=n)

    convA = sample_A['purchase_TF'].sum()/n
    convB = sample_B['purchase_TF'].sum()/n

    z_sample = (convB-p_varA)/(((p_varA*(1-p_varA))/n)**0.5)
    if z_sample >= 1.64:
#         print("We reject the null hypothesis. Challenger wins!")
        log.append(1)
    else:
#         print("Old version wins!")
        log.append(0)
        
        
print("The challenger wins {}% of the time.".format(sum(log)/len(log)*100))

The challenger wins 100.0% of the time.


In [16]:
# test using 2-sample

log = []
for i in range(100):
    n = 1158
    sample_A = A.sample(n=n)
    sample_B = B.sample(n=n)

    convA = sample_A['purchase_TF'].sum()/n
    convB = sample_B['purchase_TF'].sum()/n
    p_bar_test = (convA+convB)/2

    z_sample = (convB-convA)/(((p_bar_test*(1-p_bar_test))*(2/n)**0.5))
    
    if z_sample >= 1.64:
#         print("We reject the null hypothesis. Challenger wins!")
        log.append(1)
    else:
#         print("Old version wins!")
        log.append(0)
        
        
print("The challenger wins {}% of the time.".format(sum(log)/len(log)*100))

The challenger wins 98.0% of the time.


##### Sequential Testing

Assume P(Xi=1) under H0 = p-varA and P(Xi=1) under H1 = p-treatment.

Set desired type 1 error = 5% and type 2 error = 20%.

In [100]:
# generate samples
n = 1158
n_trials = 10
alpha = .05
beta = .2
min_diff = p_treatment-p_varA
upper_bound = np.log(1/alpha)
lower_bound = np.log(beta)

# test each observation in the sample:
list_of_trials = []
number_of_success = 0
for j in range(n_trials):
    sample_B = B.sample(n=n)
    log_lambda_n = 0
    for i in range(len(sample_B)):          
        if sample_B['purchase_TF'].iloc[i] == True:
            log_lambda_xi = np.log(p_treatment/p_varA)
#             print("positive: {}".format(log_lambda_xi))                  
        else:
            log_lambda_xi = np.log((1-p_treatment)/(1-p_varA))
#             print("negative: {}".format(log_lambda_xi))

        log_lambda_n += log_lambda_xi
#         print(log_lambda_n)

        if log_lambda_n > upper_bound:
#             print("Accept H1 after {} trials.".format(i+1))
            number_of_success += 1
            break 
        elif log_lambda_n < lower_bound:
#             print("Accept Null after {} trials.".format(i+1))
            break
#         else:
#             print(i,"We have reached the end of the trial")
            
    list_of_trials.append(i+1)
    
print("Success rate is {}%".format(number_of_success/n_trials*100))
    

Success rate is 80.0%


In [101]:
np.mean(list_of_trials)

239.1

In [102]:
list_of_trials

[522, 330, 194, 71, 226, 290, 273, 145, 201, 139]