In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from typing import List

# import custom package
from ab_testing_kit.data_splitting.sample_size_calculation import sample_size_calc, pretest_bias

# Show every row / column and never truncate cell text when a frame is displayed.
# Use the public `pd.set_option` entry point consistently for all three options.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


# Global scikit-learn setting: request pandas output from transformers.
from sklearn import set_config

set_config(transform_output="pandas")

# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas chained-assignment warnings,
# which are relevant to the slicing done later in this notebook - consider narrowing.
import warnings

warnings.simplefilter("ignore")

# Import Data

In [2]:
# Absolute location of the aggregated marketing-campaign CSV read in the next cell.
# NOTE(review): machine-specific path; adjust before running elsewhere.
FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "marketing_campaign_aggregated.csv"
)

In [3]:
# Load the aggregated campaign data and preview the first four rows.
df = pd.read_csv(FILE_PATH)

df.head(4)

Unnamed: 0,user_id,date,ctr,revenue,daily_users_num
0,1,2024-06-23,0.0,0.0,32891
1,1,2024-07-02,0.0,0.0,1186
2,4,2024-06-23,0.0,0.0,32891
3,4,2024-07-09,0.0,0.0,1202


In [4]:
# Cut-off used below: rows dated before this are "pre-test", rows on/after it are "after test".
test_date = "2024-06-24"

# Split into pre-test and post-test periods

In [5]:
# Split the raw rows into the period before the test started and the period from
# the start date onwards. `date` is compared as text (read_csv did not parse it),
# which orders correctly for the ISO 'YYYY-MM-DD' values in this file.
# .copy() gives each period its own frame so that the column edits made in later
# cells never act on a filtered selection of `df`.
before_test = df[df['date'] < test_date].copy()

after_test = df[df['date'] >= test_date].copy()

# Row counts of the two periods.
len(before_test), len(after_test)

(58636, 41905)

In [6]:
# Spot-check: pre-test rows for user 4.
before_test.loc[before_test['user_id'].eq(4)]

Unnamed: 0,user_id,date,ctr,revenue,daily_users_num
2,4,2024-06-23,0.0,0.0,32891


In [7]:
# Spot-check: rows for user 4 on or after the test start date.
after_test.loc[after_test['user_id'].eq(4)]

Unnamed: 0,user_id,date,ctr,revenue,daily_users_num
3,4,2024-07-09,0.0,0.0,1202
4,4,2024-07-24,0.0,0.0,1152


## Separate daily users

In [8]:
# One row per distinct (date, daily_users_num) pair in each period.
dau_columns = ['date', 'daily_users_num']

dau_before_agg = before_test.loc[:, dau_columns].drop_duplicates()

dau_after_agg = after_test.loc[:, dau_columns].drop_duplicates()

In [9]:
# The daily-user counts now live in the dau_* tables, so remove the column from
# the per-user frames. Re-binding (instead of inplace=True) avoids mutating a
# filtered selection, and errors='ignore' keeps the cell safe to run twice.
before_test = before_test.drop(columns='daily_users_num', errors='ignore')

after_test = after_test.drop(columns='daily_users_num', errors='ignore')

## Calculate sample size

In [10]:
# Baseline CTR statistics from the pre-test period; inputs to the sample-size calculation.
pretest_ctr = before_test['ctr']
mean = pretest_ctr.mean()
std = pretest_ctr.std()  # pandas default: sample standard deviation (ddof=1)

In [11]:
# Required sample size for the continuous CTR metric.
# Positional arguments are kept as in the package call; judging by the returned
# z_alpha / z_beta they are presumably: baseline mean, minimum detectable effect,
# alpha = 0.05 and beta = 0.2 - confirm against the sample_size_calc signature.
sample_size_calc(
    mean,
    5,
    0.05,
    0.2,
    tail_type=1,
    std_metric=std,
    metric_type='continuous',
)

{'sample_size': 898.0,
 'z_beta': 0.8416212335729143,
 'z_alpha': 1.6448536269514722}

## Select Users Randomly

In [12]:
# Presumably the same minimum detectable effect passed to sample_size_calc above - confirm units.
mde = 5

In [13]:
# Size taken from the sample_size_calc result shown above ('sample_size': 898.0);
# keep it in sync if that calculation is re-run with different inputs.
# presumably a per-group size - confirm against sample_size_calc.
test_sample_size = 898
# Both arms get the same number of users. The earlier `test_sample_size + mde`
# added an effect size to a user count, which mixes unrelated quantities.
control_sample_size = test_sample_size

# get a unique list of all users seen in the pre-test period
unique_users = before_test['user_id'].unique().tolist()
len(unique_users)

32891

In [14]:
# select random users without replacement
# A seeded generator makes the draw (and every CSV saved below) reproducible
# under Restart Kernel -> Run All.
rng = np.random.default_rng(42)
all_groups = rng.choice(unique_users, size=control_sample_size + test_sample_size, replace=False)

In [15]:
# split into test and control
# Slice by the declared sizes rather than a hard-coded 898, so each arm really
# contains the number of users its *_sample_size variable says it should.
control_group = all_groups[:control_sample_size]

test_group = all_groups[control_sample_size:]

assert len(control_group) == control_sample_size, "control arm has an unexpected size"
assert len(test_group) == test_sample_size, "test arm has an unexpected size"

## Aggregate Data per User
Get each user's average ctr and revenue, and their last activity date, within each period.

In [16]:
# One row per user for the pre-test period: latest activity date, mean ctr, mean revenue.
before_test_agg = (
    before_test
    .groupby('user_id', as_index=False)
    .agg(
        date=('date', 'max'),
        ctr=('ctr', 'mean'),
        revenue=('revenue', 'mean'),
    )
)

In [17]:
# One row per user from the test start date onwards: latest activity date, mean ctr, mean revenue.
after_test_agg = (
    after_test
    .groupby('user_id', as_index=False)
    .agg(
        date=('date', 'max'),
        ctr=('ctr', 'mean'),
        revenue=('revenue', 'mean'),
    )
)

### Extract Test and Control Group

In [18]:
# filter users concerned with the test
# .copy() makes each result an independent frame, so the `is_test_group` column
# added in the following cells is not written onto a filtered selection.
before_test_agg_all = before_test_agg[before_test_agg['user_id'].isin(all_groups)].copy()

after_test_agg_all = after_test_agg[after_test_agg['user_id'].isin(all_groups)].copy()

In [19]:
# identify test and control
in_test = after_test_agg_all['user_id'].isin(test_group)
in_control = after_test_agg_all['user_id'].isin(control_group)

# Every retained user must belong to one arm. The previous fallback silently wrote
# the raw user_id into what is meant to be a 0/1 flag.
assert (in_test | in_control).all(), "found users that are in neither the test nor the control group"

# 1 = test group, 0 = control group. assign() returns a new frame, so the cell is
# safe to re-run and never writes onto a filtered selection.
after_test_agg_all = after_test_agg_all.assign(is_test_group=in_test.astype('int64'))

In [20]:
# Peek at the first flagged row.
after_test_agg_all.head(1)

Unnamed: 0,user_id,date,ctr,revenue,is_test_group
40,115,2024-07-03,0.0,0.0,1


In [21]:
# identify test and control
in_test = before_test_agg_all['user_id'].isin(test_group)
in_control = before_test_agg_all['user_id'].isin(control_group)

# Every retained user must belong to one arm. The previous fallback silently wrote
# the raw user_id into what is meant to be a 0/1 flag.
assert (in_test | in_control).all(), "found users that are in neither the test nor the control group"

# 1 = test group, 0 = control group. assign() returns a new frame, so the cell is
# safe to re-run and never writes onto a filtered selection.
before_test_agg_all = before_test_agg_all.assign(is_test_group=in_test.astype('int64'))

## Pretest bias

Calculate the t-test for the means of two independent samples of scores.

This is a test of the null hypothesis that two independent samples have identical average (expected) values. By default, the test assumes that the populations have identical variances.


In [22]:
# Compare daily_users_num between the two DAU tables.
# NOTE(review): the inputs are the before/after *period* tables, while the printed
# message talks about test vs control groups - confirm this is the intended comparison.
pretest_bias(
    dau_before_agg,
    dau_after_agg,
    ['daily_users_num'],
)

We fail to reject the null hypothesis (H0). There is no significant difference in average daily_users_num between the test and control groups.


In [23]:
# Metrics checked for pre-existing differences between the two arms.
metrics = ['ctr', 'revenue']

# Split the pre-test per-user data into test (flag 1) and control (flag 0) groups
group_flag = before_test_agg_all['is_test_group']
test_before = before_test_agg_all.loc[group_flag.eq(1)]
control_before = before_test_agg_all.loc[group_flag.eq(0)]


pretest_bias(test_before, control_before, metrics)

We fail to reject the null hypothesis (H0). There is no significant difference in average ctr between the test and control groups.
We fail to reject the null hypothesis (H0). There is no significant difference in average revenue between the test and control groups.


### Save data

In [24]:
# Write the flagged pre-test per-user table to CSV, without the index column.
FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "before_test.csv"
)

before_test_agg_all.to_csv(FILE_PATH, index=False)

In [25]:
# Write the flagged after-test per-user table to CSV, without the index column.
FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "after_test.csv"
)

after_test_agg_all.to_csv(FILE_PATH, index=False)

In [26]:
# Stack the pre-test rows on top of the after-test rows and save the combined table.
all_data = pd.concat((before_test_agg_all, after_test_agg_all), axis=0)

FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "all_test_data.csv"
)

all_data.to_csv(FILE_PATH, index=False)

In [27]:
# Write the pre-test daily-user table to CSV, without the index column.
FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "before_daily_active_users.csv"
)

dau_before_agg.to_csv(FILE_PATH, index=False)

In [28]:
# Write the after-test daily-user table to CSV, without the index column.
FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "after_daily_active_users.csv"
)

dau_after_agg.to_csv(FILE_PATH, index=False)

In [29]:
# Stack the pre-test daily-user rows on top of the after-test rows and save the combined table.
all_data_dau = pd.concat((dau_before_agg, dau_after_agg), axis=0)

FILE_PATH = (
    "C:/Users/Oamen/OneDrive/Documents/DATA PROJECTS/AB_Testing_Kit/Datasets/"
    "all_test_data_dau.csv"
)

all_data_dau.to_csv(FILE_PATH, index=False)