In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppresses every warning for the rest of the session (no category or module
# filter is given), so deprecation and numerical warnings will not be shown.
# NOTE(review): consider narrowing this to the specific warning being silenced.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw A/B-test export into a DataFrame and preview the first rows
csv_path = 'data_abtest(1).csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,0,851104,2021-10-21 22:11:48.556739,control,old_page,0
1,1,804228,2021-10-12 08:01:45.159739,control,old_page,0
2,2,661590,2021-10-11 16:55:06.154213,treatment,new_page,0
3,3,853541,2021-10-08 18:28:03.143765,treatment,new_page,0
4,4,864975,2021-10-21 01:52:26.210827,control,old_page,1


# Data cleaning

In [3]:
# (rows, columns) of the raw frame, before any cleaning is applied
df.shape

(294478, 6)

In [4]:
# Distinct users in the raw data; fewer than the row count means repeated ids
df['user_id'].nunique()

290584

In [5]:
# Rows whose user_id occurs more than once, ordered so repeats sit together
repeated_ids = df['user_id'].duplicated(keep=False)
df.loc[repeated_ids].sort_values('user_id').head(10)

Unnamed: 0.1,Unnamed: 0,user_id,timestamp,group,landing_page,converted
230259,230259,630052,2021-10-17 01:16:05.208766,treatment,new_page,0
213114,213114,630052,2021-10-07 12:25:54.089486,treatment,old_page,1
22513,22513,630126,2021-10-14 13:35:54.778695,treatment,old_page,0
251762,251762,630126,2021-10-19 17:16:00.280440,treatment,new_page,0
183371,183371,630137,2021-10-20 02:08:49.893878,control,old_page,0
11792,11792,630137,2021-10-22 14:59:22.051308,control,new_page,0
207211,207211,630320,2021-10-07 18:02:43.626318,control,old_page,0
255753,255753,630320,2021-10-12 05:27:37.181803,treatment,old_page,0
96929,96929,630471,2021-10-07 02:14:17.405726,control,new_page,0
110634,110634,630471,2021-10-23 01:42:51.501851,control,old_page,0


In [6]:
# Expected pairing: treatment <-> new_page, control <-> old_page.
# A row is flagged when "is in the treatment group" and "saw the new page"
# do not coincide.
is_treatment = df['group'].eq('treatment')
saw_new_page = df['landing_page'].eq('new_page')
mismatch = is_treatment ^ saw_new_page
print('Mismatch：', mismatch.sum())

Mismatch： 3893


In [7]:
# Keep only rows whose group and landing page agree; copy so that later
# changes do not touch the raw frame
matched_df = df.loc[~mismatch].copy()

In [8]:
# Row count versus distinct users after removing mismatched rows
print('Number of dataset', len(matched_df))
print('Number of User ID', matched_df['user_id'].nunique())

Number of dataset 290585
Number of User ID 290584


In [9]:
# Every row belonging to a user_id that still appears more than once
matched_df.loc[matched_df['user_id'].duplicated(keep=False)]

Unnamed: 0.1,Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,1899,773192,2021-10-09 05:37:58.781806,treatment,new_page,0
2893,2893,773192,2021-10-14 02:55:59.590927,treatment,new_page,0


In [10]:
# Keep one row per user; among repeated rows the one appearing last is retained
matched_df = matched_df.drop_duplicates(subset='user_id', keep='last')

In [11]:
# Number of missing entries in each column
matched_df.isna().sum()

Unnamed: 0      0
user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [13]:
# Share of the cleaned rows that were served the new page
new_page_rows = matched_df[matched_df['landing_page'] == 'new_page']
len(new_page_rows) / len(matched_df)

0.5000619442226688

# Hypothesis test

In [14]:
import statsmodels.stats.proportion as sp

In [16]:
# Sample size of each experiment arm
n_old = len(matched_df[matched_df['group'] == 'control'])
n_new = len(matched_df[matched_df['group'] == 'treatment'])

In [56]:
# Number of converted users (converted == 1) in each arm
did_convert = matched_df['converted'] == 1
convert_old = len(matched_df[(matched_df['group'] == 'control') & did_convert])
convert_new = len(matched_df[(matched_df['group'] == 'treatment') & did_convert])


In [54]:
# Conversion rate of the control (old) and treatment (new) arms
p_old, p_new = convert_old / n_old, convert_new / n_new
print(p_old, p_new, sep='\n')

0.1203863045004612
0.12299222352212512


In [19]:
import statsmodels.stats.proportion as sp

# One-sided (left-tailed) two-sample z-test on the conversion proportions.
#   H0: p_old >= p_new   (the new page does not convert better)
#   H1: p_old <  p_new   (the new page converts better)
# The control counts are passed first, so alternative='smaller' tests whether
# the first proportion (control) is smaller than the second (treatment).
z_score, p_value = sp.proportions_ztest([convert_old, convert_new], [n_old, n_new], alternative='smaller')

In [20]:
# Report the test statistic and its p-value
for label, value in (('z-score: ', z_score), ('p-value: ', p_value)):
    print(label, value)

z-score:  -2.1484056695589
p-value:  0.015840771394875417


In [24]:
# Critical z-value for a left-tailed test at the 0.05 significance level:
# the point below which 5% of the standard normal distribution lies.

from scipy.stats import norm
z_alpha = norm.ppf(0.05)
z_alpha

-1.6448536269514729

In [25]:
# In a left-tailed test the null hypothesis is rejected when z < z_alpha.
# Here z = -2.148 is less than z_alpha = -1.645,
# and equivalently the p-value (0.0158) is below 0.05, so the null hypothesis is rejected.

# Effect size

In [52]:
# NOTE(review): mean, stdev, var and sqrt are not used in this cell.
from statistics import mean, stdev
from numpy import var
from math import sqrt

# Cohen's d = difference in conversion rates / pooled standard deviation.
# The per-arm standard deviations are selected by `group`, the same key used
# for n_old / n_new and p_old / p_new, so every term in the formula describes
# the same set of users.
std_old = matched_df.loc[matched_df['group'] == 'control', 'converted'].std()
std_new = matched_df.loc[matched_df['group'] == 'treatment', 'converted'].std()

# Pooled standard deviation; Series.std() defaults to ddof=1, which matches
# the (n - 1) weights used here.
s = np.sqrt(((n_old - 1) * std_old**2 + (n_new - 1) * std_new**2) / (n_old + n_new - 2))

# Standardized difference, control minus treatment: a negative d means the
# treatment group converted at a higher rate.
d = (p_old - p_new) / s

print('Cohen\'s d:', d)

Cohen's d: -0.007970992391336005


### Conclusion: the null hypothesis is rejected, but Cohen's effect size is very small