In [3]:
import numpy as np
import pandas as pd

# --- Settings ---
# Seed NumPy's legacy global RNG so every np.random.* draw below is reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

N = 60000  # number of simulated users (one row per user)
date_start = "2025-09-01"
date_end   = "2025-10-15"

channels = ["Paid Search", "Paid Social", "Email", "Affiliate", "Organic"]
# Sampling weights, positionally aligned with `channels`.
channel_probs = np.array([0.35, 0.30, 0.10, 0.10, 0.15])  # sum = 1.0

# base CR per channel ("plausible-ish")
base_cr = {
    "Paid Search": 0.045,
    "Paid Social": 0.022,
    "Email":       0.070,
    "Affiliate":   0.040,
    "Organic":     0.050,
}

# Variant B effect: +X% relative uplift to CR (e.g., 8% -> multiplier 1.08)
uplift_B = 0.08  # 8%

# --- Config validation ---
# Fail loudly on inconsistent settings before any data is generated.
if len(channel_probs) != len(channels) or not np.isclose(channel_probs.sum(), 1.0):
    raise ValueError("channel_probs must have one entry per channel and sum to 1.0")

# A channel without a base CR would map to NaN below; `rand < NaN` is always
# False, so that channel would silently end up with zero conversions.
missing_cr = [ch for ch in channels if ch not in base_cr]
if missing_cr:
    raise ValueError(f"base_cr has no entry for channel(s): {missing_cr}")

# --- Data generation ---
# NOTE: the order of the np.random.* calls below determines the generated data;
# reordering them changes the output even with the same seed.
df = pd.DataFrame({
    "user_id": np.arange(1, N + 1),
})

# Daily timestamps from date_start through date_end; each user gets one
# uniformly at random.
dates = pd.date_range(date_start, date_end, freq="D")
df["date"] = np.random.choice(dates, size=N, replace=True)

# Channel mix follows channel_probs.
df["acquisition_channel"] = np.random.choice(channels, size=N, p=channel_probs)

# A/B randomization 50/50, independent of channel and date
df["variant"] = np.random.choice(["A", "B"], size=N, p=[0.5, 0.5])

# assign p_base based on channel
df["p_base"] = df["acquisition_channel"].map(base_cr).astype(float)

# final p after applying variant uplift (A keeps the base rate, B is scaled up)
df["p"] = df["p_base"] * np.where(df["variant"].eq("B"), (1 + uplift_B), 1.0)

# safety cap (to avoid p > 1)
df["p"] = df["p"].clip(0, 0.99)

# sample conversions: one Bernoulli(p) draw per user, stored as 0/1
df["converted"] = (np.random.rand(N) < df["p"]).astype(int)

# Drop the helper probability columns: the exported table only carries what a
# real experiment log would contain. `df` itself keeps them for inspection.
df_out = df.drop(columns=["p_base", "p"])

# --- Export to CSV ---
# Write the user-level table to disk; the row index is omitted because
# user_id already identifies each row.
out_path = "ab_test_data.csv"
df_out.to_csv(path_or_buf=out_path, index=False)

# Sanity report: confirmation of the write, a preview of the first rows,
# and the overall conversion rate across both variants.
preview = df_out.head()
overall_cr = df_out["converted"].mean()
print("Saved:", out_path)
print(preview)
print(overall_cr)


Saved: ab_test_data.csv
   user_id       date acquisition_channel variant  converted
0        1 2025-10-09         Paid Search       B          0
1        2 2025-09-29         Paid Social       A          0
2        3 2025-09-15         Paid Search       A          0
3        4 2025-10-13         Paid Search       A          0
4        5 2025-09-08         Paid Social       B          0
0.04198333333333333
