In [None]:
#In this project we run an A/B test on website data to evaluate newly implemented features
#which aim to increase the number of page clicks on the website

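#We have the users' IP addresses and the users' actions in the different versions of the website
#We also have two servers: the 1st one with the old version of the website (control group)
#and the 2nd one with the newly implemented features (test group)
#NOTE(review): the code below labels ServerID 1 as 'Test' and ServerID 2 as 'Control' - confirm which assignment is correct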

#The objective is to determine whether the users' experience differs between these two versions of the website and, if so,
#whether this difference is statistically significant

In [1]:
import numpy as np 
import pandas as pd
import matplotlib as plt
import seaborn as sns
from scipy.stats import shapiro,mannwhitneyu

In [11]:
# Load the raw A/B-test records from a CSV file in the working directory.
df = pd.read_csv("abtest_data.csv")
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,RecordID,IP Address,LoggedInFlag,ServerID,VisitPageFlag
0,1,39.13.114.2,1,2,0
1,2,13.3.25.8,1,1,0
2,3,247.8.211.8,1,1,0
3,4,60.10.192.7,0,2,0
4,5,195.12.126.2,1,1,0


In [12]:
# (rows, columns) of the raw data
df.shape

(123054, 5)

In [13]:
#Number of distinct IP addresses; comparing it with the row count shows whether addresses repeat
df['IP Address'].nunique()

66334

In [14]:
#As the values in IP Address are duplicated (one user could enter the website several times),
#I will group the data by IP Address (together with LoggedInFlag and ServerID)
#and set the visit value to 1 for users with at least one visit to the target page

In [15]:
# Collapse the raw records to one row per (IP Address, LoggedInFlag, ServerID)
# combination, summing VisitPageFlag into a new VisitPageSum column.
# NOTE(review): the key is the full triple, not the IP address alone, so an IP that
# occurs with more than one LoggedInFlag/ServerID value yields several rows -
# confirm this is intended.
df = (
    df.groupby(["IP Address", "LoggedInFlag", "ServerID"], as_index=False)
      .agg(VisitPageSum=("VisitPageFlag", "sum"))
)
df.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageSum
0,0.0.108.2,0,1,0
1,0.0.109.6,1,1,0
2,0.0.160.9,1,2,0
3,0.0.163.1,0,2,0
4,0.0.169.1,1,1,0


In [16]:
# Distribution of the summed visit flag per grouped row
df["VisitPageSum"].value_counts()

0    59589
1     6368
2      517
3       12
4        1
Name: VisitPageSum, dtype: int64

In [17]:
# Binary indicator: 1 when the summed flag is non-zero, 0 otherwise.
df["VisitPage"] = df["VisitPageSum"].ne(0).astype("int64")
df.head()

Unnamed: 0,IP Address,LoggedInFlag,ServerID,VisitPageSum,VisitPage
0,0.0.108.2,0,1,0,0
1,0.0.109.6,1,1,0,0
2,0.0.160.9,1,2,0,0
3,0.0.163.1,0,2,0,0
4,0.0.169.1,1,1,0,0


In [18]:
# Number of grouped rows with (1) and without (0) a visit to the target page
df["VisitPage"].value_counts()

0    59589
1     6898
Name: VisitPage, dtype: int64

In [19]:
#Label every row with its experiment group: ServerID 1 -> 'Test', ServerID 2 -> 'Control'
#(any other ServerID value would become NaN), then drop the columns no longer needed

df = (
    df.assign(Group=df['ServerID'].map({1: 'Test', 2: 'Control'}))
      .drop(columns=['ServerID', 'VisitPageSum'])
)
df.head()

Unnamed: 0,IP Address,LoggedInFlag,VisitPage,Group
0,0.0.108.2,0,0,Test
1,0.0.109.6,1,0,Test
2,0.0.160.9,1,0,Control
3,0.0.163.1,0,0,Control
4,0.0.169.1,1,0,Test


In [20]:
# Rows labelled 'Control', re-indexed from 0 as an independent frame.
df_control = df.loc[df['Group'].eq('Control')].reset_index(drop=True)
df_control

Unnamed: 0,IP Address,LoggedInFlag,VisitPage,Group
0,0.0.160.9,1,0,Control
1,0.0.163.1,0,0,Control
2,0.0.178.9,1,0,Control
3,0.0.228.7,0,0,Control
4,0.0.46.2,0,0,Control
...,...,...,...,...
33179,99.9.3.3,1,0,Control
33180,99.9.45.1,1,0,Control
33181,99.9.53.7,1,0,Control
33182,99.9.65.2,0,0,Control


In [21]:
# Rows labelled 'Test', re-indexed from 0 as an independent frame.
df_test = df.loc[df['Group'].eq('Test')].reset_index(drop=True)
df_test

Unnamed: 0,IP Address,LoggedInFlag,VisitPage,Group
0,0.0.108.2,0,0,Test
1,0.0.109.6,1,0,Test
2,0.0.169.1,1,0,Test
3,0.0.181.9,0,1,Test
4,0.0.195.5,1,0,Test
...,...,...,...,...
33298,99.9.206.2,0,0,Test
33299,99.9.241.9,1,0,Test
33300,99.9.5.3,1,0,Test
33301,99.9.86.3,0,1,Test


In [22]:
# Summary statistics of the numeric columns in the control group, one row per column.
df_control.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LoggedInFlag,33184.0,0.509251,0.499922,0.0,0.0,1.0,1.0,1.0
VisitPage,33184.0,0.091942,0.288948,0.0,0.0,0.0,0.0,1.0


In [23]:
# Summary statistics of the numeric columns in the test group, one row per column.
df_test.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LoggedInFlag,33303.0,0.503258,0.499997,0.0,0.0,1.0,1.0,1.0
VisitPage,33303.0,0.115515,0.319647,0.0,0.0,0.0,0.0,1.0


In [24]:
#Control group: the number of rows (non-null VisitPage values) and how many of
#them reached the target page (VisitPage == 1); their ratio is computed next

control_sum_visit = df_control['VisitPage'].count()
control_visit_1 = df_control['VisitPage'].eq(1).sum()
print("Total visits for control group: ", control_sum_visit)
print("Visit Page target = 1 : ", control_visit_1)

Total visits for control group:  33184
Visit Page target = 1 :  3051


In [25]:
# Share of control-group rows that reached the target page, rounded to 4 decimals.
control_ratio_visit = np.round(control_visit_1 / control_sum_visit, 4)
control_ratio_visit

0.0919

In [27]:
#Test group: the number of rows (non-null VisitPage values) and how many of
#them reached the target page (VisitPage == 1)

test_sum_visit = df_test['VisitPage'].count()
test_visit_1 = df_test['VisitPage'].eq(1).sum()
print("Sum visit for test group: ", test_sum_visit)
print("Visit Page target = 1 : ", test_visit_1)

Sum visit for test group:  33303
Visit Page target = 1 :  3847


In [28]:
# Share of test-group rows that reached the target page, rounded to 4 decimals.
test_ratio_visit = np.round(test_visit_1 / test_sum_visit, 4)
test_ratio_visit

0.1155

In [None]:
#It seems that the new features applied to the test group are getting more clicks
#To verify this statistically we will run an A/B test

In [29]:
#As we are working with binary data, it cannot follow a normal distribution
#However, let us check this with the Shapiro-Wilk test

#In this test, if the p-value is less than the significance level, we reject the null hypothesis
#and conclude that the samples do not come from a normal distribution

#H0 : The samples come from a normal distribution
#H1 : The samples do not come from a normal distribution

In [32]:
# Shapiro-Wilk normality test on the control group's binary VisitPage column.
# NOTE(review): this group has far more than 5000 rows; SciPy documents that the
# Shapiro-Wilk p-value may not be accurate for N > 5000, so treat it as indicative only.
test_stat, pvalue = shapiro(df_control["VisitPage"])
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.3260, p-value = 0.0000


In [33]:
# Shapiro-Wilk normality test on the test group's binary VisitPage column.
# NOTE(review): this group has far more than 5000 rows; SciPy documents that the
# Shapiro-Wilk p-value may not be accurate for N > 5000, so treat it as indicative only.
test_stat, pvalue = shapiro(df_test["VisitPage"])
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = 0.3711, p-value = 0.0000


In [None]:
#As the assumption of normality is not satisfied, we will use the Mann-Whitney U test

#H0: the two populations are equal
#H1: the two populations are not equal

In [34]:
# Mann-Whitney U test comparing VisitPage between the control and test groups.
# `alternative` is passed explicitly so the test is two-sided, matching
# H1 ("the two populations are not equal"), regardless of the default used by
# the installed SciPy version.
test_stat, pvalue = mannwhitneyu(df_control["VisitPage"],
                                 df_test["VisitPage"],
                                 alternative="two-sided")
# Scientific notation keeps a very small p-value readable instead of rounding it to 0.00.
print('Test Stat = %.2f, p-value = %.3e' % (test_stat, pvalue))

Test Stat = 539537678.50, p-value = 0.00


In [None]:
#As the p-value is well below the significance level, we reject the null hypothesis
#and conclude that there is a significant difference between these two groups

In [35]:
# Number of rows for every (Group, VisitPage) combination, as a flat table.
group_count = df.groupby(['Group', 'VisitPage'])['Group'].count().reset_index(name='Count')
# Pivot the counts into a Group x VisitPage table with 'All' row/column totals.
# The aggregation is named by the string "sum": passing the callable np.sum
# raises a FutureWarning in recent pandas versions, while the string form
# keeps the same result.
groupped = pd.crosstab(
    group_count['Group'],
    group_count['VisitPage'],
    values=group_count['Count'],
    aggfunc="sum",
    margins=True,
)

In [36]:
#Row-wise percentages: every cell divided by its row total ('All'), times 100
groupped.div(groupped['All'], axis=0).mul(100)

VisitPage,0,1,All
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Control,90.80581,9.19419,100.0
Test,88.448488,11.551512,100.0
All,89.625039,10.374961,100.0


In [None]:
#The rate of clicking on the page was 9.19% in the control group, while this rate increased to 11.55% in the test group
#This increase is unlikely to be due to chance: the difference is statistically significant