# In [None]:
# ===============================================
# 📊 A/B Test Dashboard Data Preparation & Analysis
# ===============================================

# ----- 1. Import required libraries -----
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# ----- 2. Read your CSV (replace with your file) -----
# Example:
# df = pd.read_csv("your_ab_test_data.csv")

# For demo, let's generate synthetic data with the same structure:
# one row per (visit, process step) event, with the visit's client
# attributes repeated on every event row of that visit.
np.random.seed(42)  # fixed seed so the demo data is reproducible
n = 2000  # number of synthetic visits (one client / visitor id per visit)
process_steps = ["start", "step_1", "step_2", "step_3", "confirm", "finish", "error"]
variations = ["Control", "Test"]

data = []
for i in range(n):
    client_id = f"C{i:05d}"
    visitor_id = f"V{i:05d}"
    visit_id = f"VIS{i:05d}"
    variation = np.random.choice(variations)
    age = np.random.randint(18, 70)
    gender = np.random.choice(["M", "F"])
    number_of_accounts = np.random.randint(1, 5)
    balance = np.random.randint(100, 5000)
    calls_6_months = np.random.randint(0, 10)
    logons_6_months = np.random.randint(1, 30)
    tenure_years = np.random.randint(0, 10)
    tenure_months = np.random.randint(0, 12)

    start_time = datetime(2025, 5, np.random.randint(1, 28), np.random.randint(8, 22), np.random.randint(0, 60))
    # 4-6 distinct non-error steps, sampled without replacement (random order);
    # roughly 10% of visits additionally end with an "error" event.
    steps = np.random.choice(process_steps[:-1], np.random.randint(4, 7), replace=False)
    if np.random.rand() < 0.1:
        steps = np.append(steps, "error")

    # Accumulate the random 5-59 s gaps so every event is strictly later than
    # the previous one. Multiplying the step index by a fresh random gap (as an
    # offset from start_time) can stamp a later step *before* an earlier one,
    # which scrambles step order once the events are sorted by date_time.
    elapsed_s = 0
    for step in steps:
        dt = start_time + timedelta(seconds=elapsed_s)
        data.append([
            client_id, visitor_id, visit_id, step, dt, variation,
            tenure_years, tenure_months, age, gender,
            number_of_accounts, balance, calls_6_months, logons_6_months
        ])
        # One draw per event keeps the random stream aligned for the other fields.
        elapsed_s += int(np.random.randint(5, 60))

columns = [
    "client_id", "visitor_id", "visit_id", "process_step", "date_time", "variation",
    "client_tenure_years", "client_tenure_months", "age", "gender",
    "number_of_accounts", "balance", "calls_6_months", "logons_6_months"
]
df = pd.DataFrame(data, columns=columns)

# ----- 3. Data Cleaning -----
# Timestamps that cannot be parsed become NaT (errors="coerce") and those
# rows are dropped; events are then ordered chronologically within each visit.
df["date_time"] = pd.to_datetime(df["date_time"], errors="coerce")
df = df.dropna(subset=["date_time"])
df = df.sort_values(by=["visit_id", "date_time"])

# Calculate time spent per step: seconds until the next event of the same
# visit. The last event of a visit has no successor, so its duration is NaN.
df["next_time"] = df.groupby("visit_id")["date_time"].shift(-1)
df["step_duration_s"] = (df["next_time"] - df["date_time"]).dt.total_seconds()

# Flag errors and completions.
# na=False: a missing process_step counts as "not an error" instead of
# propagating NaN, which would break the integer cast of has_error below.
df["is_error"] = df["process_step"].str.contains("error", case=False, na=False)

# One row per visit. Named aggregation ties each output column to its source
# column explicitly, rather than relying on the positional order of
# flattened MultiIndex columns.
visit_summary = df.groupby("visit_id").agg(
    variation=("variation", "first"),
    client_id=("client_id", "first"),
    has_error=("is_error", "max"),
    has_finish=("process_step", lambda steps: steps.eq("finish").any()),
    start_time=("date_time", "min"),
    end_time=("date_time", "max"),
).reset_index()

# Compute total visit time (first event to last event) and 0/1 outcome flags
visit_summary["total_time_s"] = (visit_summary["end_time"] - visit_summary["start_time"]).dt.total_seconds()
visit_summary["completed"] = visit_summary["has_finish"].astype(int)
visit_summary["errored"] = visit_summary["has_error"].astype(int)

# ----- 4. Compute KPIs -----
# Per-variation summary: number of visits, share of visits that completed,
# share of visits that hit an error, and mean visit length in seconds.
ab_metrics = (
    visit_summary.groupby("variation")
    .agg(
        visits_count=("visit_id", "count"),
        completion_rate=("completed", "mean"),
        error_rate=("errored", "mean"),
        avg_time_s=("total_time_s", "mean"),
    )
    .reset_index()
)

print("\n=== A/B Summary Metrics ===", ab_metrics, sep="\n")

# ----- 5. Step-Level Analysis -----
# Mean step duration (seconds) for every (variation, process_step) pair;
# NaN durations are ignored by the mean.
step_keys = ["variation", "process_step"]
step_time = df.groupby(step_keys, as_index=False)["step_duration_s"].mean()

# ----- 6. Funnel Conversion -----
# Distinct visits that reached each step, per variation.
visits_per_step = df.groupby(["variation", "process_step"])["visit_id"].nunique()
funnel = visits_per_step.reset_index()

# Distinct visits per variation: the denominator of the reach rate.
total_visits = (
    df.groupby("variation")["visit_id"]
    .nunique()
    .rename("total_visits")
    .reset_index()
)

funnel = pd.merge(funnel, total_visits, on="variation")
funnel["reach_rate"] = funnel["visit_id"].div(funnel["total_visits"])

# ----- 7. Visualizations -----
# A) Completion rate by variation
completion_rates = ab_metrics["completion_rate"]
plt.figure(figsize=(6, 4))
plt.bar(ab_metrics["variation"], completion_rates, color="skyblue")
plt.title("Completion Rate by Variation")
plt.ylabel("Completion Rate")
plt.ylim(0, 1)
# Write each bar's value as a percentage just above the bar top.
for bar_pos, rate in enumerate(completion_rates):
    plt.text(bar_pos, rate + 0.02, f"{rate:.2%}", ha="center")
plt.show()

# B) Error rate by variation
error_rates = ab_metrics["error_rate"]
plt.figure(figsize=(6, 4))
plt.bar(ab_metrics["variation"], error_rates, color="salmon")
plt.title("Error Rate by Variation")
plt.ylabel("Error Rate")
plt.ylim(0, 1)
# Label every bar with its rate, formatted as a percentage with two decimals.
for bar_pos, rate in enumerate(error_rates):
    label = format(rate, ".2%")
    plt.text(bar_pos, rate + 0.02, label, horizontalalignment="center")
plt.show()

# C) Average time spent per variation
variation_labels = ab_metrics["variation"]
mean_visit_seconds = ab_metrics["avg_time_s"]
plt.figure(figsize=(6, 4))
plt.bar(x=variation_labels, height=mean_visit_seconds, color="lightgreen")
plt.title("Average Total Time (s) per Visit by Variation")
plt.ylabel("Avg Time (s)")
plt.show()

# D) Step duration comparison
# One figure per variation, in order of first appearance in step_time.
for var, subset in step_time.groupby("variation", sort=False):
    plt.figure(figsize=(7, 4))
    plt.bar(subset["process_step"], subset["step_duration_s"])
    plt.title(f"Mean Step Duration - {var}")
    plt.ylabel("Seconds")
    plt.xticks(rotation=45)  # tilt step names so they do not overlap
    plt.show()

# E) Funnel reach rates
# One figure per variation: share of that variation's visits reaching each step.
for var, subset in funnel.groupby("variation", sort=False):
    plt.figure(figsize=(7, 4))
    plt.bar(subset["process_step"], subset["reach_rate"], color="orange")
    plt.title(f"Funnel Reach Rate - {var}")
    plt.ylabel("Reach Rate")
    plt.ylim(0, 1)
    plt.xticks(rotation=45)  # tilt step names so they do not overlap
    plt.show()

# ----- 8. Export cleaned data -----
# Write the event-level and visit-level tables to the working directory,
# without the DataFrame index column.
exports = (
    (df, "cleaned_events.csv"),
    (visit_summary, "visits_summary.csv"),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)
print("\n✅ Cleaned CSVs saved: 'cleaned_events.csv' and 'visits_summary.csv'")