## Setup and Load Data

In [None]:
# === Setup: libraries and config ===
import os
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yaml

# Load config.yaml.
# Only a genuinely missing file is reported as "not found"; YAML syntax errors,
# permission problems, etc. propagate unchanged so their real cause stays visible.
try:
    with open("../config.yaml", "r", encoding="utf-8") as f:
        # An empty YAML document parses to None; fall back to an empty dict so the
        # key checks below fail with a clear KeyError instead of an AttributeError.
        config = yaml.safe_load(f) or {}
except FileNotFoundError as e:
    raise FileNotFoundError("config.yaml not found at ../config.yaml") from e
print("Config loaded successfully")

print("Top-level keys:", list(config.keys()))

# === Load clean dataframe (df_full) ===
# `or {}` also covers sections that are present in the YAML but left empty.
clean_cfg = (config.get("data") or {}).get("clean") or {}
if "df_full" not in clean_cfg:
    raise KeyError("'df_full' no está definido en config['data']['clean']")

path = Path(clean_cfg["df_full"])
if not path.exists():
    raise FileNotFoundError(f"No encuentro el archivo en: {path}")

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df_full = pd.read_pickle(path)
print(f" df_full loaded successfully: {df_full.shape}")
display(df_full.head())

  

In [None]:
# === Check columns, types and null values (df_full) ===

print("df_full shape:", df_full.shape)

print("\nFirst 2 rows:")
display(df_full.head(2))

print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df_full.info()

print("\nMissing values per column (first 10 shown):")
print(df_full.isna().sum().head(10))


In [None]:
# Independent working copy of the full dataset, with a quick look at its size
# and first rows.
df_web = df_full.copy(deep=True)
print(df_web.shape)
df_web.head(n=3)



In [None]:
# Column names of df_full as a plain Python list.
df_full.columns.to_list()


## Exploratory Analysis – Demographics

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): df_full also carries visit/step columns, so these plots appear to
# count rows rather than unique clients — confirm before quoting client counts.

# Age distribution (missing ages excluded)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_full["clnt_age"].dropna(), bins=30, kde=True, color="steelblue", ax=ax)
ax.set_title("Client Age Distribution", fontsize=14)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Count")
plt.show()

# Gender distribution, bars ordered from most to least frequent category
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df_full,
    x="gendr",
    hue="gendr",
    order=df_full["gendr"].value_counts().index,
    palette="Set2",
    legend=False,
    ax=ax,
)
ax.set_title("Gender Distribution", fontsize=14)
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()

# Tenure in years (missing values excluded)
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_full["clnt_tenure_yr"].dropna(), bins=30, kde=True, color="darkgreen", ax=ax)
ax.set_title("Client Tenure (years)", fontsize=14)
ax.set_xlabel("Years with Vanguard")
ax.set_ylabel("Count")
plt.show()

# Balance distribution: log1p compresses the long right tail
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=np.log1p(df_full["bal"].dropna()), color="darkred", ax=ax)
ax.set_title("Client Balance Distribution (log scale)", fontsize=14)
ax.set_xlabel("Log(1 + Balance)")
plt.show()


In [None]:
# Demographics + variation frame derived from df_full, with missing categories
# replaced by explicit labels so they show up in counts and plots.
demo_var = df_full.assign(
    Variation=df_full["Variation"].fillna("Non-Experiment"),
    gendr=df_full["gendr"].fillna("Unknown"),
)

# Fixed category orders for visual consistency across plots:
# genders from most to least frequent, variations in a fixed sequence
# (only the variations actually present are used when plotting).
gendr_order = demo_var["gendr"].value_counts().index.to_list()
var_order = ["Control", "Test", "Non-Experiment"]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Gender counts split by experiment variation. Requires demo_var to still carry
# the capitalised "Variation" column created in the preparation cell above.
fig, ax = plt.subplots(figsize=(7, 5))

# Only request hue levels that actually occur in the data.
variations_present = set(demo_var["Variation"].unique())
sns.countplot(
    data=demo_var,
    x="gendr",
    hue="Variation",
    order=gendr_order,
    hue_order=[label for label in var_order if label in variations_present],
    palette="Set2",
    ax=ax,
)
ax.set_title("Gender distribution by variation", fontsize=14)
ax.set_xlabel("Gender")
ax.set_ylabel("Number of clients")
ax.legend(title="Variation")
fig.tight_layout()
plt.show()


In [None]:
# Demographics overview WITH variation (Control / Test / Non-Experiment).
# Rebinds demo_var to a fresh copy whose group column is the lower-case
# "variation"; later cells group on that name.
demo_var = df_full.rename(columns={"Variation": "variation"})

# Rows without an assigned variation are labelled Non-Experiment
demo_var["variation"] = demo_var["variation"].fillna("Non-Experiment")

# Row count per group
print(demo_var["variation"].value_counts())

# Descriptive summaries for each variation group
for group, subset in demo_var.groupby("variation"):
    print(f"\n===== {group} =====")
    sections = [
        ("Age summary:\n", subset["clnt_age"].describe()),
        ("\nGender distribution:\n", subset["gendr"].value_counts(dropna=False)),
        ("\nTenure (years) summary:\n", subset["clnt_tenure_yr"].describe()),
        ("\nBalance summary:\n", subset["bal"].describe()),
    ]
    for heading, summary in sections:
        print(heading, summary)
    print("\n-----------------------------")


##### Day 2 – Demographic Analysis  

We explored the cleaned client dataset (`df_full`) to better understand who the primary users of Vanguard’s online process are.  
This analysis distinguishes between **Control**, **Test**, and **Non-Experiment** clients.  

### Age  
- Overall average age ~46 (median 47).  
- **Non-Experiment** clients are slightly younger (mean ~45).  
- **Control** and **Test** clients average ~48.  
👉 The experiment seems to target slightly older clients compared to those not included.  

### Gender  
- Nearly balanced between Male and Female in the experiment groups.  
- A very large portion of **Non-Experiment** clients are labeled as “Unknown” (~70%).  
- Control and Test groups also include ~34% “Unknown”.  
👉 Gender classification has major data quality gaps, especially for Non-Experiment clients.  

### Tenure (Years with Vanguard)  
- Average ~12 years, median ~11, range 2–62.  
- Distributions are very similar across all groups.  
👉 Clients are typically long-standing; experiment assignment does not depend on tenure.  

### Balances  
- Overall mean balance ~150k, median ~63k.  
- **Control/Test** participants show slightly higher average balances ~159k–164k compared to **Non-Experiment** ~153k.  
- Distribution is strongly skewed: most clients hold moderate balances (<140k), while a wealthy minority drives the average up.  

---

### Answers to Day 2 Questions  
**Who are the primary clients?**  
Middle-aged investors (30–60 years), balanced between male and female (when data is available), long-standing ~10+ years, with moderate balances.  

**Are they younger or older, new or long-standing?**  
They are generally older and long-standing clients. The experiment slightly favors older clients with somewhat higher balances compared to those not included.  


## Outlier Analysis

In [None]:
# Outliers - Balance: values outside the 1.5 * IQR fences around the quartiles
Q1, Q3 = (df_full["bal"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Missing balances compare as False on both sides, so they are never flagged.
outliers = df_full.loc[df_full["bal"].lt(lower_bound) | df_full["bal"].gt(upper_bound)]
print("Number of outliers in balance:", len(outliers))
print("Outlier values summary:\n", outliers["bal"].describe())



In [None]:
# Outliers - Age: values outside the 1.5 * IQR fences around the quartiles
Q1, Q3 = (df_full["clnt_age"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Missing ages compare as False on both sides, so they are never flagged.
outliers_age = df_full.loc[
    df_full["clnt_age"].lt(lower_bound) | df_full["clnt_age"].gt(upper_bound)
]
print("Number of outliers in age:", len(outliers_age))
print("Outlier values summary:\n", outliers_age["clnt_age"].describe())



In [None]:
# Outliers - Tenure (years): values outside the 1.5 * IQR fences
Q1, Q3 = (df_full["clnt_tenure_yr"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Missing tenures compare as False on both sides, so they are never flagged.
outliers_tenure = df_full.loc[
    df_full["clnt_tenure_yr"].lt(lower_bound) | df_full["clnt_tenure_yr"].gt(upper_bound)
]
print("Number of outliers in tenure (years):", len(outliers_tenure))
print("Outlier values summary:\n", outliers_tenure["clnt_tenure_yr"].describe())


In [None]:
# Outliers - Number of accounts: values outside the 1.5 * IQR fences
Q1, Q3 = (df_full["num_accts"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Missing values compare as False on both sides, so they are never flagged.
outliers_accts = df_full.loc[
    df_full["num_accts"].lt(lower_bound) | df_full["num_accts"].gt(upper_bound)
]
print("Number of outliers in number of accounts:", len(outliers_accts))
print("Outlier values summary:\n", outliers_accts["num_accts"].describe())


In [None]:
# IQR outlier report for each numeric variable, computed separately within each
# variation group. Expects demo_var to carry the lower-case "variation" column.
variables = ["clnt_age", "clnt_tenure_yr", "num_accts", "bal"]

for var in variables:
    print(f"\n### Outlier analysis for {var}")
    for group, subset in demo_var.groupby("variation"):
        # Quartiles and fences are group-specific, not global.
        Q1, Q3 = (subset[var].quantile(q) for q in (0.25, 0.75))
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

        outliers = subset.loc[subset[var].lt(lower_bound) | subset[var].gt(upper_bound)]

        print(f"\n===== {group} =====")
        print(f"Number of outliers in {var}: {len(outliers)}")
        print(f"Outlier values summary:\n{outliers[var].describe()}")


##### Outlier Analysis  

We applied the Interquartile Range (IQR) method to detect statistical outliers across age, tenure, number of accounts, and balances, separated by **Control**, **Test**, and **Non-Experiment** groups.  

### Age (`clnt_age`)  
- No outliers detected in any group.  
- Age distribution is realistic, mostly between 30 and 60 years.  
👉 This suggests strong data quality for age with no anomalies.  

### Tenure (`clnt_tenure_yr`)  
- Control: **1,793** outliers  
- Non-Experiment: **2,013** outliers  
- Test: **2,421** outliers  
- These correspond to clients with 32–62 years of tenure.  
👉 Test group contains more very long-standing clients, but overall these cases remain a small minority.  

### Number of Accounts (`num_accts`)  
- Control: **32,716** outliers  
- Non-Experiment: **28,891** outliers  
- Test: **37,931** outliers  
- Most clients hold 2–3 accounts, but some have 6–8 accounts, which the IQR method flags as unusual.  
👉 Outliers appear in all groups at similar proportions, reflecting clients with more complex account structures rather than errors.  

### Balances (`bal`)  
- Control: **16,532** outliers  
- Non-Experiment: **14,549** outliers  
- Test: **19,950** outliers  
- Outliers average ~730k–780k, with maxima up to **8.3M** (Control), **12.8M** (Non-Experiment), and **16.3M** (Test).  
👉 Confirms a consistent high-wealth client segment across all groups, not concentrated in a single variation.  

---

### Conclusion  
- **Balances** are the most skewed variable, with ~51k clients holding exceptionally high wealth.  
- **Tenure** and **Number of Accounts** also show minority extremes, representing very long-standing or complex clients.  
- **Age** shows no anomalies.  
- Importantly, outlier patterns are consistent across **Control**, **Test**, and **Non-Experiment**, meaning they do not bias the experiment’s comparison of completion rates or performance metrics.  


### Mean Completion Time per Step

In [None]:
# 1. Order events chronologically within each visit so that the per-visit
#    time differences computed later compare consecutive events.
df_full = df_full.sort_values(by=["visit_id", "date_time"])

In [None]:
# 1. (Repeat of the previous cell.) Sorting already-sorted data is a no-op, so
#    re-running this is harmless; it guarantees the order if run on its own.
df_full = df_full.sort_values(by=["visit_id", "date_time"])


In [None]:
#2 Map process steps into numeric order
# "start" is the funnel entry step; leaving it out of the mapping gave every
# start row a NaN step_order (Series.map yields NaN for unmapped values).
# Using 0 keeps the existing codes for the other steps and matches the mapping
# used in the balance analysis further down.
step_order = {"start": 0, "step 1": 1, "step 2": 2, "step 3": 3, "confirm": 4}
df_full["step_order"] = df_full["process_step"].map(step_order)



In [None]:
# 3. Elapsed time since the previous event of the same visit.
#    The first event of every visit has no predecessor and gets NaT.
#    Note: the gap is stored on the row of the step being arrived at.
df_full["time_diff"] = df_full.groupby(by="visit_id")["date_time"].diff(periods=1)


In [None]:
# 4. Mean inter-event gap per (variation, step), plus the same value in seconds.
#    Rows with a missing Variation are dropped by groupby's default NaN handling.
mean_times = (
    df_full
    .groupby(by=["Variation", "process_step"])["time_diff"]
    .agg("mean")
    .reset_index()
    .assign(time_diff_sec=lambda frame: frame["time_diff"].dt.total_seconds())
)

print(mean_times)


In [None]:
# Grouped bars: mean inter-event gap (seconds) per step, one bar per variation.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=mean_times, x="process_step", y="time_diff_sec", hue="Variation", ax=ax)
ax.set_ylabel("Mean Time (seconds)")
ax.set_title("Mean Completion Time per Step: Control vs Test")
plt.show()

### Mean Completion Time per Step – Analysis  

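We calculated the **average time elapsed before each step event** for the Control and Test groups: for every event, the gap since the previous event in the same visit, averaged by the step being entered. Each figure is therefore the time taken to *arrive at* a step, not the time spent on it afterwards. Results (in seconds) are as follows: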

| Step       | Control (sec) | Test (sec) | Observation |
|------------|---------------|------------|-------------|
| Start      | ~154.9        | ~148.9     | Test clients move slightly faster through the initial step, saving ~6 seconds. |
| Step 1     | ~43.0         | ~37.7      | Both groups complete this step quickly, but Test is faster by ~5 seconds. |
| Step 2     | ~38.7         | ~48.1      | The Test group takes ~9 seconds longer than Control, suggesting the redesign may add friction here. |
| Step 3     | ~92.9         | ~96.9      | Both groups spend similar time, with Test slightly slower (~4 seconds). |
| Confirm    | ~128.5        | ~129.2     | No meaningful difference at confirmation. |

---

#### Insights:
- ✅ **Faster start and Step 1:** Test clients progress more quickly at the beginning, suggesting the new interface helps initial orientation.  
- ⚠️ **Step 2 slowdown:** Test group spends ~9 seconds longer, which may indicate **confusion** or **extra cognitive load** introduced by the redesign.  
- ⚖️ **Step 3 and Confirm:** Differences are minimal; performance is nearly identical between groups.  
- 🔄 **Overall pattern:** Improvements early in the process (Start, Step 1) are offset by a slowdown in Step 2.  

---

#### Next Steps:
- Compare these per-step averages with **step completion/drop-off rates** to see if the Step 2 slowdown impacts conversion.  
- Conduct a **qualitative review of Step 2** in the Test interface to identify potential friction points.  
- Extend the analysis to **total completion time** (Start → Confirm) for a holistic view of efficiency.  


### Mean Completion Time (Start-Confirm)

In [None]:
# Restrict to experiment participants and order events chronologically per visit
df_exp = df_full.loc[df_full["Variation"].isin(["Control", "Test"])].copy()
df_exp["date_time"] = pd.to_datetime(df_exp["date_time"])
df_exp = df_exp.sort_values(by=["visit_id", "date_time"])

# First and last timestamp of every visit (all visits; filtered further down).
# NOTE(review): the duration is first event -> last event of the visit, which is
# only "start -> confirm" if visits begin at start and end at confirm — confirm.
visit_times = (
    df_exp.groupby(["Variation", "visit_id"])["date_time"]
          .agg(start_time="first", end_time="last")
          .reset_index()
)

# Visit duration in seconds
visit_times["completion_time"] = (
    visit_times["end_time"].sub(visit_times["start_time"]).dt.total_seconds()
)

# Keep only visits that logged a "confirm" step at some point
completed_visits = df_exp.loc[df_exp["process_step"] == "confirm", "visit_id"].unique()
visit_times = visit_times[visit_times["visit_id"].isin(completed_visits)]

# Mean duration of completed visits per variation, rounded to 2 decimals
mean_completion = visit_times.groupby("Variation")["completion_time"].mean().round(2)
print("Mean completion time (seconds):")
print(mean_completion)


In [None]:
# Same per-variation means, converted from seconds to minutes
print(mean_completion.div(60))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Two-column frame (variation, completion_time) in the shape seaborn expects
mean_df = mean_completion.rename_axis("variation").reset_index(name="completion_time")

# hue duplicates x only to apply the palette per bar; the legend would be redundant
ax = sns.barplot(
    data=mean_df,
    x="variation",
    y="completion_time",
    hue="variation",
    palette="Set2",
    legend=False,
)
ax.set_ylabel("Mean Completion Time (seconds)")
ax.set_title("Mean Completion Time: Control vs Test")
plt.show()



##### ⏱ Mean Completion Time (Start → Confirm)  

We calculated the **average completion time** for both groups, measured from the first to the last recorded event of each visit that reached the final confirmation (`confirm`).  

### Results  
- **Control group**: ~394 seconds (≈ **6.56 minutes**)  
- **Test group**: ~362 seconds (≈ **6.04 minutes**)  

### Insights  
- On average, clients in the **Test group completed the full process ~32 seconds faster** than those in the Control group.  
- While the difference is modest, it suggests that the redesigned interface **improves overall efficiency**.  
- However, further statistical testing is needed to confirm whether this observed difference is **statistically significant** or due to random variation.  

### Next Steps  
To further validate our findings, we will:  

1. **Select random clients from each group (Control & Test):**  
   - Pick 3 clients per group.  
   - Compute their individual completion times from **Start → Confirm**.  

2. **Run a Two-Sample T-Test:**  
   - Compare the average completion times between Control and Test groups.  
   - Hypotheses:  
     - **H0 (Null):** There is no difference in mean completion time between groups.  
     - **H1 (Alternative):** The Test group has a significantly different mean completion time compared to Control.  

3. **Interpret results:**  
   - A **p-value < 0.05** will indicate a significant difference.  
   - If not, the new design did not significantly change the completion time.  


## Sample Client Journey Analysis – Control vs Test

In [None]:
# Keep only experiment participants (Control and Test)
df_exp = df_full[df_full["Variation"].isin(["Control","Test"])].copy()

# Convert to datetime BEFORE sorting: sorting a text column first would order
# timestamps lexicographically, not chronologically. Unparseable values become NaT.
df_exp["date_time"] = pd.to_datetime(df_exp["date_time"], errors="coerce")
df_exp = df_exp.sort_values(["visit_id","date_time"])

print(df_exp["Variation"].value_counts())


In [None]:
# Random sample of 3 clients from each group (Control and Test).
# df_exp holds one row per event, so client ids are de-duplicated first;
# otherwise clients with more events would be more likely to be drawn and the
# same client could be picked more than once.
sample_clients = (
    df_exp.groupby("Variation")["client_id"]
          .apply(lambda x: x.dropna().drop_duplicates().sample(3, random_state=42))
          .reset_index()
)

print("Random sample of clients:\n", sample_clients)


In [None]:
# Events belonging to the sampled clients only
df_sample = df_exp.loc[df_exp["client_id"].isin(sample_clients["client_id"])]

# Print every sampled client's recorded events, in sampling order
for cid in sample_clients["client_id"]:
    print(f"\n=== Client {cid} ===")
    print(
        df_sample.loc[
            df_sample["client_id"] == cid,
            ["Variation", "visit_id", "process_step", "date_time"],
        ]
    )


In [None]:
# 1) Make sure we’re on experiment participants only
# Keep only Control/Test participants
df_exp = df_full[df_full["Variation"].isin(["Control","Test"])].copy()

# Convert to datetime BEFORE sorting: sorting a text column first would order
# timestamps lexicographically, not chronologically. Unparseable values become NaT.
df_exp["date_time"] = pd.to_datetime(df_exp["date_time"], errors="coerce")
df_exp = df_exp.sort_values(["visit_id","date_time"])


In [None]:
#2) Pick 3 random clients per group
def _sample_unique_clients(ids, n=3, seed=42):
    """Draw up to ``n`` distinct, non-null client ids from ``ids``.

    The draw is capped at the number of distinct ids so a small group yields
    all of its clients instead of raising ValueError.
    """
    unique_ids = ids.dropna().drop_duplicates()
    return unique_ids.sample(min(n, len(unique_ids)), random_state=seed)

# The former rename({"client_id": "client_id"}) was a no-op and has been dropped.
sampled = (
    df_exp.groupby("Variation")["client_id"]
          .apply(_sample_unique_clients)
          .reset_index()
)
sample_client_ids = sampled["client_id"].tolist()
print("Sampled client_ids:", sample_client_ids)


In [None]:
# 3) Restrict to the sampled clients and pick a single visit for each of them
df_s = df_exp.loc[df_exp["client_id"].isin(sample_client_ids)].copy()

# Per (client, visit): latest timestamp and whether any event was "confirm"
vis_stats = (
    df_s.assign(has_confirm=df_s["process_step"].str.lower().eq("confirm"))
        .groupby(["client_id", "visit_id"])
        .agg(last_time=("date_time", "max"), reached_confirm=("has_confirm", "max"))
        .reset_index()
)

# Ascending sort puts confirmed visits (True) after unconfirmed ones and later
# visits after earlier ones, so the last row per client is the preferred visit.
choice = (
    vis_stats.sort_values(["client_id", "reached_confirm", "last_time"])
             .drop_duplicates(subset="client_id", keep="last")[["client_id", "visit_id"]]
)

# Inner merge keeps only the events of each client's chosen visit
df_s = (
    df_s.merge(choice, on=["client_id", "visit_id"], how="inner")
        .sort_values(["client_id", "visit_id", "date_time"])
)


In [None]:
# 4) Per-step durations, fully vectorised
visit_keys = ["client_id", "visit_id"]

# Timestamp of the following event in the same visit (NaT on the last event)
df_s["next_time"] = df_s.groupby(visit_keys)["date_time"].shift(periods=-1)
df_s["step_duration"] = df_s["next_time"].sub(df_s["date_time"]).dt.total_seconds()

# Offset of each event from the earliest event of its visit, in seconds
df_s["visit_t0"] = df_s.groupby(visit_keys)["date_time"].transform("min")
df_s["t_rel"] = df_s["date_time"].sub(df_s["visit_t0"]).dt.total_seconds()

# The final event of each visit has no successor, hence no duration: drop it
df_steps = df_s.loc[df_s["step_duration"].notna()].copy()
df_steps.head()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Experiment participants only
df_exp = df_full.loc[df_full["Variation"].isin(["Control", "Test"])].copy()

# Alternative spellings mapped onto the canonical step labels
step_aliases = {"step1":"step 1","step_1":"step 1",
                "step2":"step 2","step_2":"step 2",
                "step3":"step 3","step_3":"step 3",
                "confirmation":"confirm"}

# Canonicalise labels: cast to text, trim, lower-case, then resolve aliases
normalized_steps = df_exp["process_step"].astype(str).str.strip().str.lower()
df_exp["process_step"] = normalized_steps.replace(step_aliases)

# Parse timestamps (unparseable -> NaT) and order events within each visit
df_exp["date_time"] = pd.to_datetime(df_exp["date_time"], errors="coerce")
df_exp = df_exp.sort_values(by=["visit_id", "date_time"])

# Canonical funnel order, limited to the steps that occur in the data
all_steps = ["start","step 1","step 2","step 3","confirm"]
observed_steps = set(df_exp["process_step"].unique())
steps = [s for s in all_steps if s in observed_steps]



In [None]:
def funnel_counts(df, steps):
    """
    Count the unique visits that reached each step, per variation.

    Parameters
    ----------
    df : DataFrame with "Variation", "process_step" and "visit_id" columns.
    steps : ordered list of step labels to report.

    Returns
    -------
    DataFrame with columns "Variation", "process_step", "visits", holding one
    row for every (variation, step) pair; pairs absent from ``df`` get 0 and
    steps not listed in ``steps`` are left out.
    """
    # One row per (visit, step), then distinct visits per (variation, step)
    visits_per_step = (
        df.drop_duplicates(subset=["visit_id", "process_step"])
          .groupby(["Variation", "process_step"])["visit_id"]
          .nunique()
          .rename("visits")
    )

    # Full variation x step grid so that missing combinations show up as 0
    variation_labels = (
        visits_per_step.index.get_level_values("Variation").unique().tolist()
    )
    full_grid = pd.MultiIndex.from_product(
        [variation_labels, steps], names=["Variation", "process_step"]
    )
    return visits_per_step.reindex(full_grid, fill_value=0).reset_index()

# Unique-visit counts per variation and step, shown as the cell output
funnel_df = funnel_counts(df=df_exp, steps=steps)
funnel_df


In [None]:
# Wide table: one row per variation, one column per step
conv = funnel_df.pivot(
    index="Variation", columns="process_step", values="visits"
).fillna(0)

# Start -> confirm conversion (%), only computable when both steps are present
if {"start", "confirm"}.issubset(conv.columns):
    conv["conversion_rate"] = conv["confirm"].mul(100).div(conv["start"])
    print(conv[["start", "confirm", "conversion_rate"]].round(2))


In [None]:
# Prepare left (Control) and right (Test) data
plot_df = funnel_df.pivot(index="process_step", columns="Variation", values="visits").reindex(steps)
control = plot_df.get("Control", pd.Series(0, index=steps)).fillna(0)
test    = plot_df.get("Test",    pd.Series(0, index=steps)).fillna(0)

# Scale bars to look like funnels (relative to start of each side)
def scale_for_funnel(series):
    maxv = series.max() if series.max() > 0 else 1
    return series / maxv

ctrl_scaled = scale_for_funnel(control)
test_scaled = scale_for_funnel(test)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))

y = np.arange(len(steps))[::-1]  # top = start
bar_height = 0.35

# Left (Control): draw to the left using negative widths
ax.barh(y - bar_height/2, -ctrl_scaled.values, height=bar_height, align="center")
# Right (Test)
ax.barh(y + bar_height/2,  test_scaled.values, height=bar_height, align="center")

# Annotations: actual counts
for i, s in enumerate(steps[::-1]):
    # Control value on the left
    ax.text(-ctrl_scaled.iloc[::-1].values[i] - 0.02, y[i] - bar_height/2,
            f"{int(control.iloc[::-1].values[i])}", va="center", ha="right", fontsize=9)
    # Test value on the right
    ax.text( test_scaled.iloc[::-1].values[i] + 0.02, y[i] + bar_height/2,
            f"{int(test.iloc[::-1].values[i])}", va="center", ha="left", fontsize=9)

# Center step labels
ax.set_yticks(y)
ax.set_yticklabels(steps[::-1])
ax.set_xlabel("Relative width (scaled)")
ax.set_title("Funnel – Unique visits reaching each step (Control vs Test)")

# Center divider
ax.axvline(0, color="black", linewidth=1)

# Legends (simple text)
ax.text(-1.0, y[0] + 0.6, "Control", fontsize=10, ha="left")
ax.text( 0.8, y[0] + 0.6, "Test",    fontsize=10, ha="right")

# Tidy limits
ax.set_xlim(-1.05, 1.05)
plt.tight_layout()
plt.show()


In [None]:
# Gap between consecutive events of the same visit, as timedelta and in seconds
df_exp = df_exp.sort_values(by=["visit_id", "date_time"]).copy()
df_exp["time_diff"] = df_exp.groupby("visit_id")["date_time"].diff()
df_exp["time_diff_sec"] = df_exp["time_diff"].dt.total_seconds()

# Mean gap (seconds) per variation and step
mean_time = (
    df_exp.groupby(["Variation", "process_step"])["time_diff_sec"]
          .mean()
          .reset_index()
)

# Grouped bar chart: one pair of bars per step, laid out in funnel order
fig, ax = plt.subplots(figsize=(8, 5))
xticks = steps
bar_width = 0.35
base_positions = np.arange(len(xticks))

for i, var in enumerate(["Control", "Test"]):
    # Align this variation's means to the step order (missing steps -> NaN)
    sub = (
        mean_time.loc[mean_time["Variation"] == var]
                 .set_index("process_step")
                 .reindex(xticks)
    )
    # Shift by half a bar to the left (i=0) or to the right (i=1) of the tick
    x = base_positions + (i - 0.5) * bar_width
    ax.bar(x, sub["time_diff_sec"].values, width=bar_width, label=var)

ax.set_xticks(base_positions)
ax.set_xticklabels(xticks, rotation=0)
ax.set_ylabel("Mean time (seconds)")
ax.set_title("Mean time between steps – Control vs Test")
ax.legend()
plt.tight_layout()
plt.show()


In [None]:
# Mean completion time per variation in minutes
# (mean_completion comes from the "Mean Completion Time" section above)
print(mean_completion.div(60))



##### 🎯 Funnel Analysis – Control vs Test  

We analyzed client progression through each step of the online process.  

### Step-by-Step Funnel (Number of Unique Visits)  
| Step       | Control | Test   |
|------------|---------|--------|
| Start      | 30,842  | 33,109 |
| Step 1     | 23,486  | 28,238 |
| Step 2     | 20,077  | 24,464 |
| Step 3     | 18,242  | 22,149 |
| Confirm    | 15,988  | 21,692 |

### Conversion Rate (Start → Confirm)  
- **Control:** 51.8%  
- **Test:** 65.5%  

### Completion Time (Start → Confirm, in minutes)  
- **Control:** 6.56 min (~394 sec)  
- **Test:** 6.04 min (~362 sec)  

---

### Insights  
- ✅ **Higher retention at every step:** The Test group consistently has more clients advancing to each step, showing improved engagement throughout the funnel.  
- 📈 **Stronger overall conversion:** The redesigned interface increased completion rates by ~14 percentage points (from ~52% to ~66%).  
- ⏱ **Faster process:** Test clients complete the entire flow ~32 seconds faster on average, reinforcing that the redesign improves efficiency.  
- ⚖️ **Balanced trade-off:** Although Test takes slightly longer at *Step 2*, they make up for it by retaining significantly more users to the final confirmation.  

---

### Conclusion  
The funnel analysis demonstrates that the redesigned interface:  
- Improves **user retention** across all steps.  
- Achieves a **higher overall conversion rate**.  
- Delivers **faster completion times**.  

Together, these results suggest that the Test variation offers a **more effective and user-friendly process**, with clear business impact.  



## 📊 Two-Sample T-Test for Completion Times

In [None]:

from scipy import stats

# Experiment participants only, events ordered chronologically within each visit
df_exp = df_full.loc[df_full["Variation"].isin(["Control", "Test"])].copy()
df_exp["date_time"] = pd.to_datetime(df_exp["date_time"])
df_exp = df_exp.sort_values(by=["visit_id", "date_time"])

# Duration per visit in seconds, measured from its first to its last event.
# NOTE(review): this equals "start -> confirm" only if visits begin at start and
# end at confirm — confirm against the data.
visit_times = (
    df_exp.groupby(["Variation", "visit_id"])["date_time"]
    .agg(start="first", end="last")
    .reset_index()
)
visit_times["completion_time"] = (
    visit_times["end"].sub(visit_times["start"]).dt.total_seconds()
)

# Restrict to visits that logged a "confirm" step (case-insensitive match)
completed_visits = df_exp.loc[
    df_exp["process_step"].str.lower().eq("confirm"), "visit_id"
].unique()
visit_times = visit_times[visit_times["visit_id"].isin(completed_visits)]

# One sample of durations per group
control_times = visit_times.loc[visit_times["Variation"].eq("Control"), "completion_time"]
test_times = visit_times.loc[visit_times["Variation"].eq("Test"), "completion_time"]

# Welch's two-sample t-test (equal variances not assumed), Test vs Control
t_stat, p_val = stats.ttest_ind(test_times, control_times, equal_var=False)

print("Mean Completion Time (Control):", control_times.mean()/60, "minutes")
print("Mean Completion Time (Test):", test_times.mean()/60, "minutes")
print("t-statistic:", t_stat, "p-value:", p_val)



## Two-Sample T-Test for Completion Times

We conducted a two-sample t-test to evaluate whether the new interface (Test group) led to a statistically significant reduction in mean completion times compared to the traditional interface (Control group).

**Hypotheses**  
- **H₀ (Null):** There is no difference in mean completion time between Control and Test groups.  
- **H₁ (Alternative):** The Test group has a significantly different mean completion time compared to Control.  

**Results**  
- Mean Completion Time (Control): **6.57 minutes**  
- Mean Completion Time (Test): **6.04 minutes**  
- t-statistic: **-5.96**  
- p-value: **2.53e-09**

**Interpretation**  
Since the p-value is far below 0.05, we reject the null hypothesis (**H₀**) and accept the alternative hypothesis (**H₁**).  
This confirms that the difference in mean completion time between the Control and Test groups is **statistically significant**.  

**Conclusion**  
Users in the Test group complete the process faster than those in the Control group.  
The redesigned interface improves overall efficiency, reducing completion time by approximately **0.5 minutes per user**.


## Balance Analysis: Do Clients with Higher Balances Interact More and Make Fewer Errors?  

### Objective  
In this section, we investigate whether clients with larger account balances behave differently during the digital process.  
Specifically, we want to determine if:  
1. Clients with higher balances **interact more** (i.e., complete more steps/visits).  
2. Clients with higher balances are **more careful** (i.e., make fewer backward navigation errors).  

### Hypotheses  

- **H₀ (Null Hypothesis):**  
  Account balance has no relationship with client interactions or error rates.  

- **H₁ (Alternative Hypothesis):**  
  Clients with higher balances complete more steps and commit fewer errors.  

### Approach  
- **Interaction** will be measured by the number of unique steps completed per client.  
- **Errors** will be measured as backward navigation (cases where a client goes back to a previous step).  
- Both metrics will be compared against account balances using correlations and visualizations.  

This analysis will help us understand if wealthier clients tend to engage more carefully with the process, which could influence the overall interpretation of the A/B test results.  


In [None]:
# Work only with experiment participants
df_exp = df_full[df_full["Variation"].isin(["Control", "Test"])].copy()

# Interactions: number of distinct process steps each client reached
interactions = (
    df_exp.groupby("client_id")["process_step"]
    .nunique()
    .reset_index()
    .rename(columns={"process_step": "n_steps"})
)

# Errors: backward navigation (step order decreases between consecutive events)
df_exp = df_exp.sort_values(["client_id", "visit_id", "date_time"])
df_exp["step_order"] = df_exp["process_step"].map({"start":0,"step 1":1,"step 2":2,"step 3":3,"confirm":4})

# Compare consecutive events WITHIN a visit only. Diffing per client alone also
# compared the last event of one visit with the first event of the next one, so
# simply starting a new visit (e.g. confirm -> start) was counted as an error.
df_exp["backward"] = (
    df_exp.groupby(["client_id", "visit_id"])["step_order"].diff().lt(0).astype(int)
)

errors = (
    df_exp.groupby("client_id")["backward"]
    .sum()
    .reset_index()
    .rename(columns={"backward":"n_errors"})
)

# Merge with balance: one row per experiment client.
# Taking the balances from the event-level df_full repeated every client once per
# event (weighting active clients more heavily) and pulled in Non-Experiment
# clients, whose missing n_errors was then silently turned into 0.
# NOTE: figures quoted in the write-up below must be regenerated after this fix.
balance_analysis = (
    df_exp[["client_id","bal"]]
    .dropna(subset=["client_id"])
    .drop_duplicates(subset="client_id")
    .merge(interactions, on="client_id", how="left")
    .merge(errors, on="client_id", how="left")
)

# Every remaining client has an errors row; fillna is kept as a safeguard only.
balance_analysis["n_errors"] = balance_analysis["n_errors"].fillna(0)

balance_analysis.head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation matrix of balance and the two behaviour metrics
metric_cols = ["bal", "n_steps", "n_errors"]
print(balance_analysis[metric_cols].corr())

# Scatter plot: balance against number of steps completed
fig_steps, ax_steps = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=balance_analysis,
    x="bal",
    y="n_steps",
    alpha=0.3,
    ax=ax_steps,
)
ax_steps.set_title("Balance vs Number of Steps Completed")
plt.show()

# Scatter plot: balance against number of backward steps
fig_errors, ax_errors = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=balance_analysis,
    x="bal",
    y="n_errors",
    alpha=0.3,
    color="red",
    ax=ax_errors,
)
ax_errors.set_title("Balance vs Number of Errors (Backward Steps)")
plt.show()


## Balance Analysis – Results  

### Correlation Findings  
- The correlation between **balance and steps completed** is extremely weak (r ≈ 0.00005).  
- The correlation between **balance and errors** is also very weak (r ≈ 0.058).  

👉 This means account balances are **not strong predictors** of interaction or error behavior.  

### Visual Insights  
- **Balance vs. Steps Completed:**  
  - Most clients, regardless of balance, either complete all steps (n_steps = 5) or drop off early.  
  - Even very high-balance clients are spread across the same step completion levels as lower-balance clients.  
  - No clear upward trend indicates that higher balances do **not lead to more completed steps**.  

- **Balance vs. Errors (Backward Steps):**  
  - The majority of clients, independent of balance, commit **0–1 errors**.  
  - Some outliers exist (clients with multiple backward steps), but they are not concentrated among high-balance clients.  
  - This suggests that **wealthier clients are not necessarily more careful** in avoiding mistakes.  

### Interpretation  
- **Hypotheses check:**  
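  - **H₀ (Null Hypothesis)** cannot be rejected on this evidence: both correlations are close to zero (no formal significance test has been run at this stage).  
  - There is **no meaningful linear relationship** between client balances and how carefully or thoroughly clients interact with the process.  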

- **Business implication:**  
  - Interaction quality and error rates appear to depend on **other factors** (e.g., age, tenure, or UI design) rather than financial wealth.  
  - This reinforces that the A/B test differences between Control and Test are likely due to the interface changes, not balance-driven behavior.


## Error Distribution Analysis  

### Objective  
After exploring correlations, we now investigate how errors are distributed across clients with different account balances.  
Instead of looking only at linear relationships, we will analyze whether clients with **higher balances** systematically make fewer errors compared to clients with **lower balances**.  

### Approach  
- Split clients into groups based on account balance (e.g., quartiles).  
- Compare the **average number of errors** across these groups.  
- Visualize the error distribution using boxplots or histograms.  

### Hypotheses  
- **H₀ (Null Hypothesis):**  
  There is no difference in error distributions across clients with different balances.  

- **H₁ (Alternative Hypothesis):**  
  Clients with higher balances make fewer errors than those with lower balances.  

This analysis will help us understand whether errors are concentrated in specific balance groups or evenly spread across all clients.  


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compare error counts across balance quartiles (summary table, boxplot, histogram).

# Copy dataframe to avoid modifying original, keeping a single row per client
# so quartile cut-offs, group means and client counts are not weighted by
# repeated rows of the same client.
df_quartiles = balance_analysis.drop_duplicates(subset="client_id").copy()

# 1. Create balance groups (quartiles)
df_quartiles["balance_group"] = pd.qcut(
    df_quartiles["bal"],
    4,
    labels=["Low", "Mid-Low", "Mid-High", "High"]
)

# 2. Calculate average errors per group
error_summary = df_quartiles.groupby("balance_group", observed=False)["n_errors"].mean()

print("Average number of errors by balance group:")
print(error_summary)

# 3. Boxplot visualization
plt.figure(figsize=(8,5))
sns.boxplot(
    data=df_quartiles,
    x="balance_group",
    y="n_errors",
    hue="balance_group",
    palette="Set2",
    legend=False
)

plt.title("Error Distribution Across Balance Groups")
plt.xlabel("Balance Group")
plt.ylabel("Number of Errors")
plt.show()

# 4. Histogram (optional, to see distribution overlap)
# n_errors is an integer count: discrete=True draws one bar per count value
# instead of 20 fractional-width bins with empty gaps between them.
plt.figure(figsize=(10,6))
sns.histplot(
    data=df_quartiles,
    x="n_errors",
    hue="balance_group",
    multiple="stack",
    discrete=True
)
plt.title("Error Frequency by Balance Group")
plt.xlabel("Number of Errors")
plt.ylabel("Count of Clients")
plt.show()


## Error Distribution Analysis Across Balance Groups  

### Objective  
In this section, we analyze whether clients with **different account balances** show meaningful differences in the **number of errors** (backward steps) made during the process.  

### Results  
- **Average Errors by Balance Group:**  
  - Low Balance: ~0.09 errors  
  - Mid-Low Balance: ~0.14 errors  
  - Mid-High Balance: ~0.16 errors  
  - High Balance: ~0.20 errors  

- **Visual Findings:**  
  - Most clients, regardless of balance, made **0 errors**, as shown in the frequency distribution.  
  - Clients in higher balance groups show a **slightly higher average number of errors**, but the increase is modest.  
  - Boxplots indicate a wider spread of errors in higher balance groups, with some clients committing up to 6 backward steps.  

### Interpretation  
- Contrary to the initial hypothesis, **higher balances do not clearly lead to fewer errors**.  
- In fact, wealthier clients appear to make **slightly more backward steps**, suggesting that higher engagement with the process (possibly due to higher stakes or more careful review) may lead to more corrections rather than fewer mistakes.  
- The effect, however, is small in magnitude — most clients across all balance groups show no errors at all.  

### Next Steps  
To validate whether the differences are statistically significant, we can run:  
1. **ANOVA or Kruskal-Wallis Test** to compare error distributions across balance quartiles.  
2. Post-hoc tests if significance is found, to identify which groups differ.  


In [None]:
from scipy import stats

# Test whether error counts differ across the four balance groups.
# Both tests assume independent observations, so each client must contribute
# exactly one value; repeated rows for the same client would inflate the
# sample size and the test statistics.
clients_by_group = df_quartiles.drop_duplicates(subset="client_id")

# Build the four samples once and reuse them for both tests
balance_labels = ["Low", "Mid-Low", "Mid-High", "High"]
error_samples = [
    clients_by_group.loc[clients_by_group["balance_group"] == label, "n_errors"]
    for label in balance_labels
]

# ANOVA test (parametric)
anova_result = stats.f_oneway(*error_samples)

print("ANOVA Test:")
print("F-statistic:", anova_result.statistic)
print("p-value:", anova_result.pvalue)

# Kruskal-Wallis test (non-parametric)
kruskal_result = stats.kruskal(*error_samples)

print("\nKruskal-Wallis Test:")
print("H-statistic:", kruskal_result.statistic)
print("p-value:", kruskal_result.pvalue)


## Statistical Test: Do Error Rates Differ by Balance Group?

### Objective  
We tested whether the average number of navigation errors (backward steps) differs significantly across balance groups (Low, Mid-Low, Mid-High, High).

### Methods  
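- **ANOVA Test (parametric):** Compares group means, assuming independent observations, approximately normal distributions within each group, and similar variances.  
- **Kruskal-Wallis Test (non-parametric):** Compares the groups' rank distributions without assuming normality, which makes it more robust for skewed count data such as the number of errors.  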

### Results  
- **ANOVA Test:**  
  - F-statistic ≈ 1402.27  
  - p-value reported as 0.0 (too small to represent in floating point, i.e. p < 0.001)  
- **Kruskal-Wallis Test:**  
  - H-statistic ≈ 3782.06  
  - p-value reported as 0.0 (i.e. p < 0.001)  

👉 Both tests strongly reject the **Null Hypothesis (H₀)** that error rates are the same across all balance groups.  

### Interpretation  
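- The differences in error rates between balance groups are **statistically significant**, but the effect is small in absolute terms (group means range from roughly 0.09 to 0.20 errors per client), so statistical significance here does not imply a large practical difference.  
- Clients with higher balances actually tend to make **more backward steps on average**, contradicting the initial expectation that wealthier clients would make fewer errors.  
- One possible explanation is that backward steps partly reflect **reviewing and double-checking** rather than mistakes, so more engaged higher-balance clients backtrack more often; this is a hypothesis, not something the tests establish.  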

### Next Step  
- Conduct **post-hoc tests** (e.g., Tukey’s HSD or pairwise Mann-Whitney tests) to determine **which specific balance groups differ** from each other.  
- This will clarify whether the increase in errors is progressive with balance or concentrated between specific segments.
