In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime

In [2]:
!pip install altair





[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Pre-test Metrics Calculation

#  User Activity

In [3]:
data = pd.read_csv("Activity_pretest.csv")
data.head()

Unnamed: 0,userid,dt,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,0


In [4]:
data.query('activity_level > 0').head(10)

Unnamed: 0,userid,dt,activity_level
909125,428070b0-083e-4c0e-8444-47bf91e99fff,2021-10-01,1
909126,93370f9c-56ef-437f-99ff-cb7c092d08a7,2021-10-01,1
909127,0fb7120a-53cf-4a51-8b52-bf07b8659bd6,2021-10-01,1
909128,ce64a9d8-07d9-4dca-908d-5e1e4568003d,2021-10-01,1
909129,e08332f0-3a5c-4ed2-b957-87e464e89b97,2021-10-01,1
909130,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,1
909131,420a60e9-6394-4324-b02c-ab372609968e,2021-10-01,1
909132,6f6b36ef-bd93-4399-a2f4-996c96d3e0a7,2021-10-01,1
909133,7dfbbc2e-6e71-4128-848d-be83df79b921,2021-10-01,1
909134,57fff942-6f0f-4de5-91c7-e7a42d518649,2021-10-01,1


In [5]:
data.activity_level.value_counts().sort_values(ascending=False)

activity_level
0     909125
5      49227
2      49074
18     48982
10     48943
16     48934
12     48911
6      48901
19     48901
11     48832
9      48820
1      48732
3      48659
14     48620
15     48599
4      48556
13     48534
8      48396
17     48395
7      48339
20     24520
Name: count, dtype: int64

In [6]:
activity = data.query('activity_level > 0').groupby(['dt', 'activity_level'])["userid"].nunique().reset_index().rename(columns={"userid":"number_of_active_users"})

activity.head(10)

Unnamed: 0,dt,activity_level,number_of_active_users
0,2021-10-01,1,1602
1,2021-10-01,2,1507
2,2021-10-01,3,1587
3,2021-10-01,4,1551
4,2021-10-01,5,1586
5,2021-10-01,6,1640
6,2021-10-01,7,1508
7,2021-10-01,8,1571
8,2021-10-01,9,1576
9,2021-10-01,10,1593


In [7]:
import altair as alt

alt.Chart(activity).mark_line(size=1).encode(
    x=alt.X('dt:T', title='Date'),
    y=alt.Y('number_of_active_users:Q', title='Number of Users'),
    color=alt.Color('activity_level:N', title='Activity Level'),
    tooltip=[
        alt.Tooltip('dt:T', title='Date'),
        alt.Tooltip('number_of_active_users:Q', title='Users'),
        alt.Tooltip('activity_level:N', title='Activity Level')
    ]
).properties(
    width=600,
    height=400,
    title='Active Users Over Time by Activity Level'
)


# Calculating Daily Active Users

In [8]:
activity = data.query('activity_level > 0').groupby(['dt'])["userid"].nunique().reset_index().rename(columns={"userid":"number_of_active_users"})
activity.head()

Unnamed: 0,dt,number_of_active_users
0,2021-10-01,30634
1,2021-10-02,30775
2,2021-10-03,30785
3,2021-10-04,30599
4,2021-10-05,30588


In [9]:
activity.describe().round(0)

Unnamed: 0,number_of_active_users
count,31.0
mean,30673.0
std,91.0
min,30489.0
25%,30608.0
50%,30661.0
75%,30728.0
max,30902.0


## We have 31 Day's Data and the Mean Daily Active Users (DAU) is 30,673 with a Stanard Deviation of 91

In [10]:
alt.Chart(activity).mark_line(size=4).encode(
    alt.X('dt:T', axis=alt.Axis(title = 'date')),
    alt.Y('number_of_active_users:Q', axis=alt.Axis(title = 'Number of Users'))
).properties(
    width=600,
    height=400,
    title='Daily Active Users'
)

# Click-through rate

In [11]:
data1 = pd.read_csv("Ctr_pretest.csv")

In [12]:
data1.head()

Unnamed: 0,userid,dt,ctr
0,4b328144-df4b-47b1-a804-09834942dce0,2021-10-01,34.28
1,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,34.67
2,8028cccf-19c3-4c0e-b5b2-e707e15d2d83,2021-10-01,34.77
3,652b3c9c-5e29-4bf0-9373-924687b1567e,2021-10-01,35.42
4,45b57434-4666-4b57-9798-35489dc1092a,2021-10-01,35.04


In [13]:
data1.describe().round(2)

Unnamed: 0,ctr
count,950875.0
mean,33.0
std,1.73
min,30.0
25%,31.5
50%,33.0
75%,34.5
max,36.0


# Sample Size Determination and Power Calculation

In [14]:
from scipy import stats

In [15]:
def binomial_sample_size(metric, mde, alpha, beta):
   
    snd = stats.norm(0, 1)

    Z_beta = snd.ppf(1-beta)
    print(Z_beta)

    Z_alpha = snd.ppf(1-alpha/2)
    print(Z_alpha)


    p = (metric + metric + mde) / 2
    print(p)

    N = (2 * p * (1 - p) * (Z_alpha + Z_beta)**2 / mde**2)

    return N

In [16]:
binomial_sample_size(metric=0.33, mde=0.02, alpha=0.05, beta=0.2)

0.8416212335729143
1.959963984540054
0.34


8806.443061939677

So, at least 8,807 users need to be exposed to the test

# For Continuous Distribution [Daily Active Users (DAU)]

In [17]:
def continuos_sample_size(metric, mde, sd, alpha, beta):
    # standard normal distribution to determine z-values
    snd = stats.norm(0, 1)

    Z_beta = snd.ppf(1-beta)
    print(Z_beta)

    Z_alpha = snd.ppf(1-alpha/2)
    print(Z_alpha)

    N = (2 * sd**2 * (Z_beta + Z_alpha)**2 / mde**2)

    return N

In [18]:
continuos_sample_size(metric=30673, mde=300, sd=91, alpha=0.05, beta=0.2)

0.8416212335729143
1.959963984540054


1.4443682906698845

### So, the test should run for at least 1.44 ~ 2 days


# A/B Testing Process
### We first need to randomly assign the test to 8,807 Users

In [19]:
data = pd.read_csv("Assignments.csv")
data.head()

Unnamed: 0,userid,ts,groupid
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0


In [20]:
print(datetime.strptime(data.head(1)['ts'][0], '%Y-%m-%dT%H:%M:%SZ').strftime("%Y-%m-%d"))

2021-11-02


In [21]:
data['dt'] = data['ts'].map(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ').strftime("%Y-%m-%d"))
data.head()

Unnamed: 0,userid,ts,groupid,dt
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0,2021-11-02
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0,2021-11-13
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0,2021-11-20
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0,2021-11-20
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0,2021-11-04


In [22]:
data.describe().round(4)

Unnamed: 0,groupid
count,60000.0
mean,0.5008
std,0.5
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [23]:
data.groupby(['groupid'])["userid"].nunique().reset_index().rename(columns={"userid":"number_of_users"})

Unnamed: 0,groupid,number_of_users
0,0,29951
1,1,30049


In [24]:
data_count = data.groupby(['groupid','dt'])["userid"].nunique().reset_index().rename(columns={"userid":"number_of_users"})
data_count.head()

Unnamed: 0,groupid,dt,number_of_users
0,0,2021-11-01,1497
1,0,2021-11-02,1467
2,0,2021-11-03,1532
3,0,2021-11-04,1509
4,0,2021-11-05,1503


In [25]:
alt.Chart(data_count).mark_line(size=3).encode(
    alt.X('dt:T'),
    alt.Y('number_of_users:Q'),
    color='groupid:O',
    tooltip=['number_of_users']
).properties(
    width=600,
    height=400
)

# Comparing Activity (activeness and activity_level) between the Groups:

In [26]:
data_act = pd.read_csv("Activity_all.csv")
data_act.head()

Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0


In [27]:
data_act.groupby(['groupid','dt']).describe().round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
groupid,dt,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,2021-10-01,29951.0,5.24,6.52,0.0,0.0,1.0,10.0,20.0
0,2021-10-02,29951.0,5.26,6.51,0.0,0.0,1.0,10.0,20.0
0,2021-10-03,29951.0,5.27,6.51,0.0,0.0,1.0,10.0,20.0
0,2021-10-04,29951.0,5.21,6.51,0.0,0.0,1.0,10.0,20.0
0,2021-10-05,29951.0,5.18,6.51,0.0,0.0,1.0,10.0,20.0
...,...,...,...,...,...,...,...,...,...
1,2021-11-26,30049.0,10.03,5.77,0.0,5.0,10.0,15.0,20.0
1,2021-11-27,30049.0,10.03,5.77,0.0,5.0,10.0,15.0,20.0
1,2021-11-28,30049.0,9.98,5.79,0.0,5.0,10.0,15.0,20.0
1,2021-11-29,30049.0,9.97,5.80,0.0,5.0,10.0,15.0,20.0


#### We can clearly observe that the Mean and Median Activity Level of the Test Group (who were exposed to the new ads) is way higher than the Control Group (who were not exposed to the new ads)

In [28]:
data_act.query('activity_level > 0').groupby(['dt', 'groupid'])['userid'].nunique().reset_index().rename(columns={"userid":"number_of_active_users"}).head()

Unnamed: 0,dt,groupid,number_of_active_users
0,2021-10-01,0,15337
1,2021-10-01,1,15297
2,2021-10-02,0,15354
3,2021-10-02,1,15421
4,2021-10-03,0,15423


In [29]:
alt.Chart(data_act.query('activity_level > 0').groupby(['dt', 'groupid'])['userid'].nunique().reset_index().rename(columns={"userid":"number_of_active_users"})).mark_line(size=3).encode(
    alt.X('dt'),
    alt.Y('number_of_active_users'),
    color='groupid:O',
    tooltip=['number_of_active_users']
).properties(
    width=900,
    height=600
)

##### So, we can see that after the test starts (1st of November, 2021) the test group (groupid = 1) has way more Number of Active Users. This is good for the business. The new adds aren't driving away our active users.

### Control Group's Active User's Statistics after the Test Starts:

In [30]:
(
    data_act.query('activity_level > 0 and groupid == 0 and dt >= "2021-11-01"')
    .groupby(['dt','groupid']).count().reset_index()[['groupid','activity_level']].describe().round(2)
)

Unnamed: 0,groupid,activity_level
count,30.0,30.0
mean,0.0,15782.0
std,0.0,371.08
min,0.0,15163.0
25%,0.0,15335.0
50%,0.0,15990.5
75%,0.0,16045.0
max,0.0,16147.0


In [31]:
data_act.query('dt >= "2021-11-01"').groupby(['groupid']).describe().round(2)

Unnamed: 0_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,898530.0,5.4,6.56,0.0,0.0,1.0,11.0,20.0
1,901470.0,10.0,5.79,0.0,5.0,10.0,15.0,20.0


In [32]:
data_act.query('dt < "2021-11-01"').groupby('groupid').describe().round(2)

Unnamed: 0_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,928481.0,5.25,6.52,0.0,0.0,1.0,10.0,20.0
1,931519.0,5.24,6.52,0.0,0.0,1.0,10.0,20.0


# The Tests
## By the Activity Level (Guardrail Metric)

In [33]:
from scipy.stats import ttest_ind

In [34]:
data_act.query('groupid == 0')['activity_level'].head()

0    0
1    0
3    0
6    0
7    0
Name: activity_level, dtype: int64

In [35]:
data_act.query('groupid == 0')['activity_level'].to_numpy()

array([ 0,  0,  0, ..., 20, 20, 20], dtype=int64)

In [36]:
res = ttest_ind(data_act.query('groupid == 0 and dt >= "2021-11-01"')['activity_level'].to_numpy(),
                data_act.query('groupid == 1 and dt >= "2021-11-01"')['activity_level'].to_numpy()).pvalue

print(res)

0.0


### The p-value is very small (<0.05) and hence we can reject the null hypothesis that the Mean Activity Level between the Test and Control Group is Equal at 5% of significance.

# By the Number of Active Users (Guardrail Metric)

In [37]:
data_act_count = data_act.query('activity_level > 0').groupby(['dt','groupid'])["userid"].nunique().reset_index().rename(columns={"userid":"number_of_active_users"})
data_act_count.head()

Unnamed: 0,dt,groupid,number_of_active_users
0,2021-10-01,0,15337
1,2021-10-01,1,15297
2,2021-10-02,0,15354
3,2021-10-02,1,15421
4,2021-10-03,0,15423


In [38]:
before = data_act_count.query('dt < "2021-11-01"')

In [39]:
before.head()

Unnamed: 0,dt,groupid,number_of_active_users
0,2021-10-01,0,15337
1,2021-10-01,1,15297
2,2021-10-02,0,15354
3,2021-10-02,1,15421
4,2021-10-03,0,15423


In [40]:
after = data_act_count.query('dt >= "2021-11-01"')

In [41]:
after.head()

Unnamed: 0,dt,groupid,number_of_active_users
62,2021-11-01,0,15989
63,2021-11-01,1,29318
64,2021-11-02,0,16024
65,2021-11-02,1,29289
66,2021-11-03,0,16049


#### Checking for the Pre-test Bias on Activity:

In [42]:
np.mean(before.query('groupid == 0')['number_of_active_users'].to_numpy())

15320.870967741936

In [43]:
np.mean(before.query('groupid == 1')['number_of_active_users'].to_numpy())

15352.516129032258

#### So, before the test started the Mean Daily Active Users between the groups were similar. So, no pre-test Bias Exists.

In [47]:
res = ttest_ind(before.query('groupid == 0')['number_of_active_users'].to_numpy(), before.query('groupid == 1')['number_of_active_users']
                .to_numpy()).pvalue

print(res)

0.16308423538280842


#### The p-value (>0.05) also suggests that the Mean DAU weren't significantly different at 5% Level of Significance before the test. Hence, no pre-test bias existed

## after the test starts:

In [48]:
np.mean(after.query('groupid==0')['number_of_active_users'].to_numpy())

15782.0

In [49]:
np.mean(after.query('groupid == 1')['number_of_active_users'].to_numpy())

29302.433333333334

### A clear difference between the groups. But, performing  hypothesis test will help to find the  inference about the population:

In [50]:
res = ttest_ind(after.query('groupid == 0')['number_of_active_users'].to_numpy(), after.query('groupid == 1')['number_of_active_users']
                .to_numpy()).pvalue

print(res)

6.590603584107244e-84


In [51]:
"{:.100f}".format(res)

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000065906035841072442'

##### The p-value is very small (<0.05) and hence we can reject the null hypothesis that the Mean Daily Active Users (DAU) between the Test and Control Group is Equal at 5% of significance after the test starts.

## Click through rate (CTR) [Success Metric]

In [52]:
data_ctr = pd.read_csv("Ctr_all.csv")
data_ctr.head()

Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95


In [53]:
data_ctr_avg = data_ctr.groupby(['groupid','dt'])["ctr"].mean().reset_index().rename(columns={"ctr":"avg_daily_ctr"})
data_ctr_avg

Unnamed: 0,groupid,dt,avg_daily_ctr
0,0,2021-10-01,32.980627
1,0,2021-10-02,33.004056
2,0,2021-10-03,33.002006
3,0,2021-10-04,32.990363
4,0,2021-10-05,33.014167
...,...,...,...
117,1,2021-11-26,37.997834
118,1,2021-11-27,37.978912
119,1,2021-11-28,37.992709
120,1,2021-11-29,37.987909


In [54]:
alt.Chart(data_ctr_avg).mark_line(size=5).encode(
    alt.X('dt'),
    alt.Y('avg_daily_ctr'),
    color='groupid:O',
    tooltip=['avg_daily_ctr']
).properties(
    width=600,
    height=400
)

#### A Clear Increament between the groups after the test started.

In [55]:
before = data_ctr.query('dt < "2021-11-01"')[['groupid', 'ctr']]
before

Unnamed: 0,groupid,ctr
808703,0,34.28
808704,0,34.67
808705,0,34.77
808706,0,35.42
808707,0,35.04
...,...,...
1759573,1,32.33
1759574,1,30.09
1759575,1,35.71
1759576,1,34.76


In [56]:
after = data_ctr.query('dt >= "2021-11-01"')[['groupid', 'ctr']]
after

Unnamed: 0,groupid,ctr
0,0,31.81
1,0,30.46
2,0,34.25
3,0,34.92
4,0,34.95
...,...,...
2303403,1,37.27
2303404,1,39.14
2303405,1,40.05
2303406,1,38.14


In [57]:
before.query('groupid == 0')['ctr'].to_numpy().mean()

33.00091277553074

In [58]:
before.query('groupid == 1')['ctr'].to_numpy().mean()

32.99957172093258

##### Before the test the mean CTR was similar.

In [59]:
after.query('groupid == 0')['ctr'].to_numpy().mean()

32.996977569382835

In [60]:
after.query('groupid == 1')['ctr'].to_numpy().mean()

37.99695912626142

### But, after the test started there was a clear improvement in the mean CTR.

In [61]:
before.query('groupid == 0')['ctr'].to_numpy().std()

1.7336979501682888

In [62]:
before.query('groupid == 1')['ctr'].to_numpy().std()

1.7296548367391134

In [63]:
after.query('groupid == 0')['ctr'].to_numpy().std()

1.7331985918552912

In [None]:
after.query('groupid == 1')['ctr'].to_numpy().std()

### performing hypothesis tests:

In [64]:
res = ttest_ind(before.query('groupid == 0')['ctr'].to_numpy(), before.query('groupid == 1')['ctr']
                .to_numpy()).pvalue

print(res)

0.705741417344299


#### High p-value (p > 0.05); Hence, failed to reject the null hypothesis (mean ctr between the groups is equal) at 5% level of significance.

##### After the Test:

In [65]:
res = ttest_ind(after.query('groupid == 0')['ctr'].to_numpy(), after.query('groupid == 1')['ctr']
                .to_numpy()).pvalue
print(res)

0.0


In [66]:
"{:.100f}".format(res)

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'

#### Very Low p-value (p < 0.05); Hence, rejecting the null hypothesis (mean ctr between the groups is equal) at 5% level of significance.

Hence, we can conclude that the new ad policy is a hit interms of our success metric (CTR) as well as our Guardrail Metrics (Daily Active Users and Daily Activity Level).