In [1]:
# A/B testing is the process of running a controlled experiment and testing a hypothesis
# about the difference between a control group and a treatment group.
# Steps involved: designing the experiment, choosing the sample size and collecting data,
# EDA, testing the hypothesis, and interpreting the test results.

# Example: we have designed a new product page and want to find out whether the new page
# or the old page is more effective at bringing leads to the business.
# We formulate a hypothesis and will conduct a two-tailed hypothesis test comparing the
# lead generation rates of the new page and the old page.
# Po represents old page and Pn represents new page
# Null - Po=Pn
# Alt - Po!=Pn
# Confidence Level - 95% (0.05 Alpha)

# Choosing Variables - 2 Groups are needed
# Control group - They will be shown only Old page
# Treatment group(Experimental Group) - They will be shown only New Page
# Dependent variable - Lead_Generated - Yes/No

# Choosing the Right Sample Size - using a method called Power Analysis
# Power of Test (1-β) - probability of finding a statistically significant difference between
# the groups when a difference is actually present. Usually it is set to 0.80.
# ⍺ = 0.05 (significance level)
# Effect Size - how big a difference is expected between the groups' lead generation rates


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import io
# Switch the working directory so the relative CSV path used when loading the data resolves.
# NOTE(review): this is a hard-coded absolute path into one user's Desktop, so the cell only
# works on that machine; point it at your own dataset directory before running.
%cd '/Users/rajeshprabhakarkaila/Desktop/Datasets'

/Users/rajeshprabhakarkaila/Desktop/Datasets


In [4]:
# Load the raw A/B test log from the current working directory.
abdf = pd.read_csv(filepath_or_buffer="ab_data.csv")

In [5]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
abdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [12]:
# Required sample size per group, obtained by power analysis for a two-sample z-test.
from statsmodels.stats.api import NormalIndPower, proportion_effectsize

# Effect size for a change in conversion rate from 13% to 15%.
effect_size = proportion_effectsize(prop1=0.13, prop2=0.15)

# nobs1 is the only solve_power argument left unset, so it is the quantity solved for;
# ratio=1 requests equally sized groups.
required_sample = NormalIndPower().solve_power(
    effect_size=effect_size,
    alpha=0.05,
    power=0.80,
    ratio=1,
)

In [16]:
# Round the fractional sample size up to the next whole observation.
# np.ceil returns a NumPy float, not an int; the sampling cells further down convert it
# with .astype(int), so it must stay a NumPy scalar here.
required_sample=np.ceil(required_sample)

In [17]:
# Number of rows (sessions) recorded per user_id.
sess_counts = abdf["user_id"].value_counts(ascending=False)

In [18]:
# How many distinct users appear in more than one row.
duplicate = sess_counts.loc[sess_counts.gt(1)].count()

In [19]:
# Display the number of users that have more than one row.
duplicate

3894

In [20]:
# user_ids that occur more than once (the index of value_counts holds the user_ids).
duplicate_users = sess_counts.loc[sess_counts.gt(1)].index

In [21]:
# Drop every row belonging to a user seen more than once, keeping only single-session users.
abdf = abdf.loc[~abdf["user_id"].isin(duplicate_users)]

In [22]:
# Re-check the frame after removing multi-session users (row count should have dropped).
abdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 286690 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       286690 non-null  int64 
 1   timestamp     286690 non-null  object
 2   group         286690 non-null  object
 3   landing_page  286690 non-null  object
 4   converted     286690 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 13.1+ MB


In [24]:
# Draw the control arm: a seeded random sample of the required size from the control rows.
control_sample = (
    abdf.loc[abdf["group"] == "control"]
    .sample(n=required_sample.astype(int), random_state=42)
)

In [25]:
# Draw the treatment arm: a seeded random sample of the required size from the treatment rows.
experimental_sample = (
    abdf.loc[abdf["group"] == "treatment"]
    .sample(n=required_sample.astype(int), random_state=42)
)

In [32]:
# Stack the two sampled arms row-wise into one experiment frame (control rows first).
ab_test = pd.concat(objs=[control_sample, experimental_sample], axis="index")

In [33]:
# Replace the index inherited from the source frame with a fresh 0..n-1 range.
ab_test = ab_test.reset_index(drop=True)

In [34]:
# Sanity check: both arms should contain the same number of rows.
ab_test["group"].value_counts()

group
control      4720
treatment    4720
Name: count, dtype: int64

In [36]:
# Conversion flags (the `converted` column) for the control arm.
control_results = ab_test.loc[ab_test["group"] == "control", "converted"]

In [38]:
# Conversion flags (the `converted` column) for the treatment arm.
experimental_results = ab_test.loc[ab_test["group"] == "treatment", "converted"]

In [39]:
from statsmodels.stats.proportion import proportions_ztest

In [40]:
# Number of conversions per arm, ordered [control, treatment].
success = [arm.sum() for arm in (control_results, experimental_results)]

In [41]:
# Number of observations per arm, in the same [control, treatment] order as `success`.
nobs = [arm.count() for arm in (control_results, experimental_results)]

In [48]:
# Two-sample z-test for proportions; value=0 tests the null of zero difference in rates.
z_stat, pval = proportions_ztest(count=success, nobs=nobs, value=0)

In [49]:
# Display the z statistic. Counts were passed as [control, treatment], so a negative value
# means the sampled control conversion rate is below the treatment rate.
z_stat

-2.0109005932696107

In [50]:
# Display the p-value of the z-test.
# Since the p-value is less than 0.05 (alpha), reject the null hypothesis Po = Pn.
# NOTE(review): this conclusion rests on one seeded random subsample per group
# (random_state=42) and the p-value is close to the 0.05 threshold - confirm it holds
# for other seeds or on the full de-duplicated data before acting on it.
pval

0.044335957690484505