In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import seaborn as sns
from math import ceil

In [3]:
# Read the raw A/B test export (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='ab_data.csv')
df.head(n=5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
# Rows per user_id; value_counts() sorts most-frequent-first by default.
session_counts = df['user_id'].value_counts()

# Number of distinct users that occur in more than one row.
has_repeat_sessions = session_counts > 1
multi_users = session_counts[has_repeat_sessions].count()

In [5]:
# Remove every user that appears more than once in the data, keeping only
# users with exactly one recorded row.
users_to_drop = session_counts.loc[session_counts > 1].index

is_repeat_user = df['user_id'].isin(users_to_drop)
df = df.loc[~is_repeat_user]

In [13]:
# Sampling parameters, named once so both groups are guaranteed to use the
# same size and seed.
# NOTE(review): 5000 rows per group is hard-coded; confirm it matches the
# sample size the experiment design calls for.
SAMPLE_SIZE = 5000
RANDOM_STATE = 12


def std_dev(x):
    """Return the population standard deviation (ddof=0) of ``x``."""
    return np.std(x, ddof=0)


def std_error(x):
    """Return the standard error of the mean of ``x``, using ddof=0."""
    return stats.sem(x, ddof=0)


# Draw an equally sized, reproducible random sample from each group.
control_sample = df[df['group'] == 'control'].sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
treatment_sample = df[df['group'] == 'treatment'].sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)

# Stack control rows first, then treatment rows; ignore_index=True gives the
# combined frame a fresh 0..n-1 index without mutating it after creation.
ab_test = pd.concat([control_sample, treatment_sample], axis=0, ignore_index=True)

# Per-group conversion rate (mean of the 0/1 `converted` flag) together with
# its spread; columns are renamed positionally to match the aggregation order.
conversion_rate = ab_test.groupby('group')['converted'].agg(['mean', std_dev, std_error])
conversion_rate.columns = ['conversion_rate', 'std_deviation', 'std_error']
conversion_rate

Unnamed: 0_level_0,conversion_rate,std_deviation,std_error
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,0.1144,0.318296,0.004501
treatment,0.1218,0.327055,0.004625


In [7]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [8]:
# Split the sampled `converted` flags by experiment group.
in_control = ab_test['group'] == 'control'
in_treatment = ab_test['group'] == 'treatment'

control_results = ab_test.loc[in_control, 'converted']
treatment_results = ab_test.loc[in_treatment, 'converted']

In [9]:
# Observations per group; count() tallies the non-null entries.
num_control, num_treatment = control_results.count(), treatment_results.count()
nobs = [num_control, num_treatment]

# Conversions per group, in the same (control, treatment) order as `nobs`.
successes = [results.sum() for results in (control_results, treatment_results)]

In [10]:
# z-test comparing the two groups' conversion proportions.
z_stat, pval = proportions_ztest(count=successes, nobs=nobs)

# proportion_confint returns the lower bounds and the upper bounds as two
# separate sequences, each ordered like `successes` (control, treatment).
ci_lower, ci_upper = proportion_confint(count=successes, nobs=nobs, alpha=0.05)
lower_con, lower_treat = ci_lower
upper_con, upper_treat = ci_upper

In [11]:
# Emit the test statistic, p-value and both confidence intervals, one per line.
print(
    f'Z Statistic - {z_stat:.2f}',
    f'P-Value - {pval:.3f}',
    f'CI 95% for control group - [{lower_con:.3f}, {upper_con:.3f}]',
    f'CI 95% for treatment group - [{lower_treat:.3f}, {upper_treat:.3f}]',
    sep='\n',
)

Z Statistic - -1.15
P-Value - 0.252
CI 95% for control group - [0.106, 0.123]
CI 95% for treatment group - [0.113, 0.131]
