In [1]:
#load packages:
import pandas as pd
import numpy as np
import scipy.stats as ss
import math as mt
import itertools

In [2]:
#let's have a look at our dataset:
file_path = r"C:\Users\jiao_\Desktop\ab_data.csv"
data = pd.read_csv(file_path)
df = data.copy()
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [3]:
df.info()
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB
(294478, 5)


Data Cleaning

In [4]:
#analyse data      unique user's conversion rates given their unique landing page
df['user_id'].nunique()

290584

In [5]:
#Locate where treatment does not match with new_page or control does not match with old_page, and drop these rows
i = df[((df['group']=='treatment') ==(df['landing_page']=='new_page')) == False].index
df2 = df.drop(i)

In [6]:
df2.shape[0]

290585

In [7]:
#If the number of unique rows is 1 greater than the number of unique users, then we have a duplicate user somewhere. We'll find the duplicate row first:
df2[df2.duplicated(['user_id'], keep=False)]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [8]:
#drop duplicate:
df2.drop_duplicates(subset ='user_id',keep ='first',inplace = True)

Probabilities

In [9]:
#total/pooled probability of conversion:
P_pool = (df2.query('converted == 1').converted.count())/df2.shape[0]
P_pool

0.11959708724499628

In [10]:
#probability of conversion given a user was in the control group:
control_df = df2.query('group =="control"')
P_old = control_df['converted'].mean()
P_old

0.1203863045004612

In [11]:
#probability of conversion given a user was in the treatment group:
treatment_df = df2.query('group =="treatment"')
P_new = treatment_df['converted'].mean()
P_new

0.11880806551510564

We can quickly observe that our new page isn't doing too hot on conversion improvement. In fact, it's doing slightly worse than the control group!

Hypothetically we could end the test here, but there are a few checks and measures we must do before and after an A/B test to ensure our experiment as run properly.

In [12]:
#proportion of users seeing the new vs old page:
N_new = df2.query('landing_page == "new_page"').landing_page.count()
N_old = df2.query('landing_page == "old_page"').landing_page.count()
proportion = (N_old/df2.shape[0],N_new/df2.shape[0])
proportion

(0.4999380557773312, 0.5000619442226688)

In [13]:
#function for getting z-scores for alpha. For our experiemnt where alpha = 5%, keep in mind we want to input 1-alpha/2 for Confidence Intervals.
def get_z_score(alpha):
    return ss.norm.ppf(alpha)

In [14]:
#Guardrail Check on differences in proportions:
sd = round(mt.sqrt((0.5*(1-0.5))/df2.shape[0]),4)
CI = (0.5 - sd*get_z_score(1-0.05/2), 0.5 + sd*get_z_score(1-0.05/2))
print('Does the control group proportion ' + str(N_old/df2.shape[0]) + ' lie within ' + str(CI) + '?')

Does the control group proportion 0.4999380557773312 lie within (0.49823603241391395, 0.5017639675860861)?


This means that we've passed our Guardrail Metric that the number of unique users is equal for each group.

In [19]:
import statsmodels.stats.api as sms
from math import ceil

effect_size = sms.proportion_effectsize(0.12, 0.15)    # Calculating effect size based on our expected rates

required_n = sms.NormalIndPower().solve_power(
    effect_size, 
    power=0.9,  #power of a test (defualt is 0.8)
    alpha=0.05, #p-value
    ratio=1
    )                                                  # Calculating sample size needed

required_n = ceil(required_n)                          # Rounding up to next whole number                          

print(required_n)

2719


In [15]:
#1. Using statistical rule of thumb to calculate minimum sample size per variation:
16*(0.12*(1-0.12))/pow(0.0035,2)

137926.53061224488

In [16]:
#2.calculating the minimum sample size for the ab test:
def get_sampSize(sds,alpha,beta,d):
    n=pow((get_z_score(1-alpha/2)*sds[0]+get_z_score(1-beta)*sds[1]),2)/pow(d,2)
    return n

In [17]:
#baseline + expected change standard deviation calculations
def get_sds(p,d):
    sd1=mt.sqrt(2*p*(1-p))
    sd2=mt.sqrt(p*(1-p)+(p+d)*(1-(p+d)))
    sds=[sd1,sd2]
    return sds

In [18]:
#2. Using Evan Miller's Calculator but deriving the values ourselves:
round(get_sampSize(get_sds(0.12, 0.0035),0.05,0.2,0.0035))

135830

If using Evan Miller's calculator, the minimum sample size per group = 135,830 users/variation. Given we have 2 groups (treatment and control): the total minimum sample size = 271,660 users. Since we have a total sample size of 290,584, our a/b test will have enough statistical power and significance.

In [19]:
CI_old = (P_old - get_z_score(1-0.025/2)*mt.sqrt(P_old*(1-P_old)/N_old),P_old + get_z_score(1-0.025/2)*mt.sqrt(P_old*(1-P_old)/N_old))
CI_new = (P_new - get_z_score(1-0.025/2)*mt.sqrt(P_new*(1-P_new)/N_new),P_new + get_z_score(1-0.025/2)*mt.sqrt(P_new*(1-P_new)/N_new))
print('Do ' + str(CI_old) + ' and ' + str(CI_new) + ' overlap?') 

Do (0.11847266343679363, 0.12229994556412876) and (0.11690554055275011, 0.12071059047746117) overlap?


Both CI intervals overlap plenty as  CInew is completely contained within  CIold, which means we do not reject the Null Hypothesis that  Pnew=Pold.

In [20]:
#2. z-test
import statsmodels.api as sm
#returning the total number of conversions for each group:
convert_old = df2.query("landing_page == 'old_page' and converted == 1").shape[0]
convert_new = df2.query("landing_page == 'new_page' and converted == 1").shape[0]

In [21]:
#calculating the z-score + p-value using the z-test (one-sided):
z_score, p_value = sm.stats.proportions_ztest([convert_old, convert_new], [N_old, N_new], alternative='smaller')
z_score, p_value

(1.3109241984234394, 0.9050583127590245)

Given our p-value ≈ 0.9 > 0.05, we do not reject the Null Hypothesis.

This means that the new page is not better than the old page.

In [22]:
SE_new = mt.sqrt(P_new*(1-P_new)/N_new)
SE_old = mt.sqrt(P_old*(1-P_old)/N_old)

In [23]:
p = 1 - ss.f.cdf(pow(SE_new,2)/pow(SE_old,2), N_new - 1, N_old - 1)
p

0.986811599380792