In [2]:
# Dataset: https://www.kaggle.com/datasets/zhangluyuan/ab-testing

In [3]:
import math

import pandas as pd
import numpy  as np

from statsmodels.stats import api as sms

# 1. Load Data

In [4]:
# Read the A/B test CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df_raw = pd.read_csv('data/ab_data.csv')
df_raw.head(5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


## 2.2. Experiment Parameters

In [5]:
# Confidence level and its complement, the significance level (alpha).
conf_level = 0.95
sig_level = 1 - conf_level

# The two proportions compared by the effect-size calculation
# (presumably the baseline and the target conversion rate - confirm).
p1 = 0.13
p2 = 0.15

# Effect size for a test comparing the two proportions above.
eff_size = sms.proportion_effectsize(prop1=p1, prop2=p2)

# Desired statistical power of the test.
power = 0.8

In [6]:
# Sample size per group.
# `nobs1` is not passed, so `solve_power` solves for the number of
# observations given the effect size, power and significance level.
sample_n = sms.NormalIndPower().solve_power(
    effect_size=eff_size,
    power=power,
    alpha=sig_level
)

# Round up: a fractional observation cannot be collected, and rounding down
# would fall short of the requested power.
sample_n = math.ceil( sample_n )

# The same size is used for both groups, hence the factor of 2 for the total.
print( 'The sample size for the control and treatment group is {}.'.format(sample_n) )
print( 'The total sample size is {}.'.format(2*sample_n) )

The sample size for the control and tratment group is 4720.
The total sample size is 9440.


# 3. Data Preparation

## 3.1. Descriptive Analysis

### 3.1.1. Data Dimensions

In [7]:
# Report the size of the raw frame: row count first, then column count.
print('Number of rows: {}'.format(len(df_raw)))
print('Number of columns: {}'.format(len(df_raw.columns)))

Number of rows: 294478
Number of columns: 5


### 3.1.2. Check NA

In [8]:
# Count of missing values in each column of the raw data.
df_raw.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

### 3.1.3. Check Flags

In [9]:
# Count of user_id entries for every group / landing_page combination.
(
    df_raw
    .groupby(['group', 'landing_page'])['user_id']
    .count()
    .reset_index()
)

Unnamed: 0,group,landing_page,user_id
0,control,new_page,1928
1,control,old_page,145274
2,treatment,new_page,145311
3,treatment,old_page,1965


In [10]:
# Users that occur in more than one row.
# After the count, the 'group' column holds the number of rows per user_id.
(
    df_raw[['user_id', 'group']]
    .groupby('user_id')
    .count()
    .reset_index()
    .loc[lambda counts: counts['group'] > 1]
)

Unnamed: 0,user_id,group
43,630052,2
111,630126,2
122,630137,2
281,630320,2
420,630471,2
...,...,...
290244,945627,2
290259,945645,2
290315,945703,2
290399,945797,2


In [11]:
# Collect the user_ids that occur in more than one row
# (after the count, 'group' holds the number of rows per user_id).
df_user_delete = (
    df_raw[['user_id', 'group']]
    .groupby('user_id')
    .count()
    .reset_index()
    .loc[lambda counts: counts['group'] > 1]['user_id']
)

# Drop every row of those users; no occurrence of a duplicated user is kept.
df1 = df_raw.loc[~df_raw['user_id'].isin(df_user_delete)]
df1.shape

(286690, 5)

In [12]:
# Number of users by group/landing_page
df1[['user_id', 'group', 'landing_page']].groupby( ['group', 'landing_page'] ).count().reset_index()

Unnamed: 0,group,landing_page,user_id
0,control,old_page,143293
1,treatment,new_page,143397


## 3.2. Random Sampling

In [13]:
# Draw sample_n rows from the control group with a fixed seed.
df_control_sample = df1.loc[df1['group'].eq('control')].sample(n=sample_n, random_state=42)
print('Size of Control Group: {}'.format(len(df_control_sample)))

# Draw sample_n rows from the treatment group with the same seed.
df_treatment_sample = df1.loc[df1['group'].eq('treatment')].sample(n=sample_n, random_state=42)
print('Size of Treatment Group: {}'.format(len(df_treatment_sample)))

# Stack both samples (control first) under a fresh 0..N-1 index.
df_ab = pd.concat([df_control_sample, df_treatment_sample], ignore_index=True)

Size of Control Group: 4720
Size of Treatment Group: 4720


## 3.3. Metric of interest

In [16]:
# Control group conversion rate: rows flagged converted == 1 over all sampled rows.
sales = (df_control_sample['converted'] == 1).sum()
visit = len(df_control_sample)

conversion_rate_control = sales / visit
print('Conversion rate - Control Group: {:.4f}'.format(conversion_rate_control))

Conversion rate - Control Group: 0.1155


In [18]:
# Treatment group conversion rate: rows flagged converted == 1 over all sampled rows.
sales = (df_treatment_sample['converted'] == 1).sum()
visit = len(df_treatment_sample)

conversion_rate_treatment = sales / visit
print('Conversion rate - Treatment Group: {:.4f}'.format(conversion_rate_treatment))

Conversion rate - Treatment Group: 0.1290
