In [1]:
# === Setup: libraries and config ===
import os
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yaml

# Load config.yaml
CONFIG_PATH = Path("../../config.yaml").resolve()

try:
    with CONFIG_PATH.open("r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    print(f"Config loaded successfully from: {CONFIG_PATH}")
except Exception as e:
    raise FileNotFoundError(f"config.yaml not found at {CONFIG_PATH}") from e

print("Top-level keys:", list(config.keys()))

# === Load clean dataframe (df_full) ===
clean_cfg = config.get("data", {}).get("clean", {})
if "df_full" not in clean_cfg:
    raise KeyError("'df_full' is not defined in config['data']['clean']")

# --- Resolve path relative to config.yaml parent, stripping leading ../ ---
yaml_path = Path(clean_cfg["df_full"])
if yaml_path.is_absolute():
    path = yaml_path
else:
    parts = list(yaml_path.parts)
    while parts and parts[0] in ("..", "."):
        parts = parts[1:]
    rel = Path(*parts)  # e.g., "data/clean_data_txt/df_full.pkl"
    path = (CONFIG_PATH.parent / rel).resolve()

print("Resolved df_full path:", path)

if not path.exists():
    raise FileNotFoundError(f"File not found at: {path}")

df_full = pd.read_pickle(path)
print(f"df_full loaded successfully: {df_full.shape}")
display(df_full.head())

Config loaded successfully from: C:\Users\Kinga\Desktop\vanguard-ab-test\config.yaml
Top-level keys: ['data']
Resolved df_full path: C:\Users\Kinga\Desktop\vanguard-ab-test\data\clean_data_txt\df_full.pkl
df_full loaded successfully: (317235, 14)


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,client_tenure_years,client_tenure_months,age,gender,number_of_accounts,balance,calls_6_months,logons_6_months
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test,5,64,79.0,U,2,189023.86,1,4
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test,5,64,79.0,U,2,189023.86,1,4
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test,5,64,79.0,U,2,189023.86,1,4
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test,5,64,79.0,U,2,189023.86,1,4
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test,5,64,79.0,U,2,189023.86,1,4


In [3]:
from datetime import datetime, timedelta

In [5]:
# Calculate time spent per step
df_full["next_time"] = df_full.groupby("visit_id")["date_time"].shift(-1)
df_full["step_duration_s"] = (df_full["next_time"] - df_full["date_time"]).dt.total_seconds()

In [6]:
df_full

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,client_tenure_years,client_tenure_months,age,gender,number_of_accounts,balance,calls_6_months,logons_6_months,next_time,step_duration_s
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test,5,64,79.0,U,2,189023.86,1,4,2017-04-17 15:26:51,-16.0
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test,5,64,79.0,U,2,189023.86,1,4,2017-04-17 15:19:22,-449.0
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test,5,64,79.0,U,2,189023.86,1,4,2017-04-17 15:19:13,-9.0
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test,5,64,79.0,U,2,189023.86,1,4,2017-04-17 15:18:04,-69.0
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test,5,64,79.0,U,2,189023.86,1,4,2017-04-17 15:17:15,-49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317230,1574008,117364417_77840596075,528720790_71583064618_169151,start,2017-05-06 23:43:27,Test,10,121,55.0,U,2,153238.83,3,6,NaT,
317231,2908510,814969699_90652851448,562606085_36368381773_92090,start,2017-05-10 22:57:17,Control,21,252,34.0,M,3,141808.05,6,9,2017-05-10 22:56:31,-46.0
317232,2908510,814969699_90652851448,562606085_36368381773_92090,step_2,2017-05-10 22:56:31,Control,21,252,34.0,M,3,141808.05,6,9,2017-05-10 22:56:23,-8.0
317233,2908510,814969699_90652851448,562606085_36368381773_92090,step_1,2017-05-10 22:56:23,Control,21,252,34.0,M,3,141808.05,6,9,2017-05-10 22:56:20,-3.0


In [7]:
# Flag errors and completions
df_full["is_error"] = df_full["process_step"].str.contains("error", case=False)
visit_summary = df_full.groupby("visit_id").agg({
    "variation": "first",
    "client_id": "first",
    "is_error": "max",
    "process_step": lambda x: "finish" in list(x),
    "date_time": ["min", "max"]
})
visit_summary.columns = ["variation", "client_id", "has_error", "has_finish", "start_time", "end_time"]
visit_summary = visit_summary.reset_index()

In [8]:
visit_summary

Unnamed: 0,visit_id,variation,client_id,has_error,has_finish,start_time,end_time
0,100012776_37918976071_457913,Test,3561384,False,False,2017-04-26 13:22:17,2017-04-26 13:23:09
1,100019538_17884295066_43909,Test,7338123,False,False,2017-04-09 16:20:56,2017-04-09 16:24:58
2,100022086_87870757897_149620,Test,2478628,False,False,2017-05-23 20:44:01,2017-05-23 20:47:01
3,100030127_47967100085_936361,Control,105007,False,False,2017-03-22 11:07:49,2017-03-22 11:07:49
4,100037962_47432393712_705583,Control,5623007,False,False,2017-04-14 16:41:51,2017-04-14 16:44:03
...,...,...,...,...,...,...,...
69200,999971096_28827267783_236076,Test,2979920,False,False,2017-04-13 10:31:49,2017-04-13 10:34:08
69201,999976049_95772503197_182554,Test,4449968,False,False,2017-04-04 12:50:10,2017-04-04 13:02:18
69202,999984454_18731538378_781808,Test,829911,False,False,2017-03-29 11:18:33,2017-03-29 11:21:07
69203,999985675_64610694964_443659,Control,4064969,False,False,2017-04-20 09:45:18,2017-04-20 10:02:48


In [9]:
# Compute total visit time
visit_summary["total_time_s"] = (visit_summary["end_time"] - visit_summary["start_time"]).dt.total_seconds()
visit_summary["completed"] = visit_summary["has_finish"].astype(int)
visit_summary["errored"] = visit_summary["has_error"].astype(int)

In [10]:
visit_summary

Unnamed: 0,visit_id,variation,client_id,has_error,has_finish,start_time,end_time,total_time_s,completed,errored
0,100012776_37918976071_457913,Test,3561384,False,False,2017-04-26 13:22:17,2017-04-26 13:23:09,52.0,0,0
1,100019538_17884295066_43909,Test,7338123,False,False,2017-04-09 16:20:56,2017-04-09 16:24:58,242.0,0,0
2,100022086_87870757897_149620,Test,2478628,False,False,2017-05-23 20:44:01,2017-05-23 20:47:01,180.0,0,0
3,100030127_47967100085_936361,Control,105007,False,False,2017-03-22 11:07:49,2017-03-22 11:07:49,0.0,0,0
4,100037962_47432393712_705583,Control,5623007,False,False,2017-04-14 16:41:51,2017-04-14 16:44:03,132.0,0,0
...,...,...,...,...,...,...,...,...,...,...
69200,999971096_28827267783_236076,Test,2979920,False,False,2017-04-13 10:31:49,2017-04-13 10:34:08,139.0,0,0
69201,999976049_95772503197_182554,Test,4449968,False,False,2017-04-04 12:50:10,2017-04-04 13:02:18,728.0,0,0
69202,999984454_18731538378_781808,Test,829911,False,False,2017-03-29 11:18:33,2017-03-29 11:21:07,154.0,0,0
69203,999985675_64610694964_443659,Control,4064969,False,False,2017-04-20 09:45:18,2017-04-20 10:02:48,1050.0,0,0


In [11]:
# Compute KPIs 
ab_metrics = visit_summary.groupby("variation").agg({
    "visit_id": "count",
    "completed": "mean",
    "errored": "mean",
    "total_time_s": "mean"
}).rename(columns={
    "visit_id": "visits_count",
    "completed": "completion_rate",
    "errored": "error_rate",
    "total_time_s": "avg_time_s"
}).reset_index()

print("\n=== A/B Summary Metrics ===")
print(ab_metrics)


=== A/B Summary Metrics ===
  variation  visits_count  completion_rate  error_rate  avg_time_s
0   Control         32135              0.0         0.0  283.578746
1      Test         37070              0.0         0.0  317.596223


In [13]:
# Step-Level Analysis 
step_time = df_full.groupby(["variation", "process_step"])["step_duration_s"].mean().reset_index()

In [14]:
step_time

Unnamed: 0,variation,process_step,step_duration_s
0,Control,confirm,-125.950745
1,Control,start,-103.086781
2,Control,step_1,-42.28687
3,Control,step_2,-38.482478
4,Control,step_3,-92.900217
5,Test,confirm,-122.552738
6,Test,start,-83.391873
7,Test,step_1,-36.178098
8,Test,step_2,-47.14928
9,Test,step_3,-95.909325


In [16]:
# 6. Funnel Conversion
funnel = df_full.groupby(["variation", "process_step"])["visit_id"].nunique().reset_index()
total_visits = df_full.groupby("variation")["visit_id"].nunique().reset_index(name="total_visits")
funnel = funnel.merge(total_visits, on="variation")
funnel["reach_rate"] = funnel["visit_id"] / funnel["total_visits"]

In [17]:
funnel

Unnamed: 0,variation,process_step,visit_id,total_visits,reach_rate
0,Control,confirm,16046,32189,0.498493
1,Control,start,30910,32189,0.960266
2,Control,step_1,23548,32189,0.731554
3,Control,step_2,20138,32189,0.625617
4,Control,step_3,18300,32189,0.568517
5,Test,confirm,21731,37136,0.585173
6,Test,start,33157,37136,0.892853
7,Test,step_1,28285,37136,0.76166
8,Test,step_2,24503,37136,0.659818
9,Test,step_3,22186,37136,0.597426


In [None]:
# Export cleaned data 
df.to_csv("df_cleaned_events.csv", index=False)
visit_summary.to_csv("df_visits_summary.csv", index=False)
step_time.to_csv("df_step_time.csv", index=False)
funnel.to_csv("df_funnel.csv", index=False)
