In [5]:
# import libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# set random seed so every run generates the same dataset
np.random.seed(213891283)

# set number of users for each group (total will be double this number)
n_users = 5000

# start and end dates of the simulated experiment window
start_date = datetime(2025, 3, 1)
end_date = datetime(2025, 3, 31, 23, 59, 59)


def make_group(ids, group, email_version, conv_rate):
    """Build the DataFrame for one arm of the A/B test.

    Reads the module-level ``start_date`` and ``end_date`` and consumes the
    global NumPy random state: first one ``randint`` draw per user for the
    timestamp, then a single ``binomial`` call for the conversion flags.

    Parameters
    ----------
    ids : array-like
        User IDs assigned to this arm; one row is produced per ID.
    group : str
        Value stored in the ``group`` column.
    email_version : str
        Value stored in the ``email_version`` column.
    conv_rate : float
        Success probability passed to ``np.random.binomial``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``timestamp``, ``group``, ``email_version``,
        ``converted`` (in that order).
    """
    n_rows = len(ids)

    # window length in whole seconds, computed once rather than once per user
    window_seconds = int((end_date - start_date).total_seconds())

    # np.random.randint excludes its upper bound, so offsets fall in
    # [0, window_seconds) and end_date itself is never produced.
    # Offsets are drawn one at a time, exactly as before; a single vectorized
    # call might not reproduce the same seeded sequence - TODO confirm before
    # vectorizing.
    timestamps = [
        start_date + timedelta(seconds=np.random.randint(0, window_seconds))
        for _ in range(n_rows)
    ]

    # conversions are drawn after the timestamps to keep the draw order stable
    converted = np.random.binomial(1, conv_rate, n_rows)

    return pd.DataFrame({
        'user_id': ids,
        'timestamp': timestamps,
        'group': group,
        'email_version': email_version,
        'converted': converted,
    })


# generate unique user IDs
user_ids = np.arange(1, 2 * n_users + 1)

# shuffle user IDs in place so group assignment is random
np.random.shuffle(user_ids)

# assign user IDs to groups (first half control, second half treatment)
group_a_ids = user_ids[:n_users]
group_b_ids = user_ids[n_users:]

# known conversion rate for group a (control)
group_a_conv_rate = 0.08

# create group a
group_a = make_group(group_a_ids, 'control', 'old', group_a_conv_rate)

# random conversion rate for group b (treatment), drawn from [0.07, 0.12)
group_b_conv_rate = np.random.uniform(0.07, 0.12)

# comment out the next line if you don't want to know the conversion rate
print(f"Group B conversion rate: {group_b_conv_rate:.4f}")

# create group b
group_b = make_group(group_b_ids, 'treatment', 'new', group_b_conv_rate)

# combine dfs and shuffle the rows (fixed random_state keeps the order reproducible)
df = pd.concat([group_a, group_b], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)


Group B conversion rate: 0.1093


In [6]:
# preview the first 20 rows of the combined dataset
df.head(n=20)

Unnamed: 0,user_id,timestamp,group,email_version,converted
0,1908,2025-03-26 06:04:09,treatment,new,0
1,571,2025-03-28 12:43:25,control,old,0
2,412,2025-03-14 18:26:57,control,old,0
3,6637,2025-03-06 06:33:45,control,old,0
4,5839,2025-03-07 07:34:41,control,old,0
5,4022,2025-03-24 16:46:14,treatment,new,0
6,2469,2025-03-17 04:32:32,control,old,0
7,4936,2025-03-11 11:59:26,treatment,new,0
8,5270,2025-03-11 20:09:13,treatment,new,0
9,8883,2025-03-10 19:01:42,control,old,0


In [7]:
# write the dataset to a CSV file, leaving out the DataFrame index
df.to_csv(path_or_buf="ab_test_data.csv", index=False)