In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load the cleaned event dataset from the vanguard-ab-test GitHub repository.
# NOTE(review): an earlier comment called this a housing-price file; the URL and
# the columns used below (visit_id, process_step, Variation) indicate A/B-test
# web events instead.
df = pd.read_csv('https://raw.githubusercontent.com/KLoVi/vanguard-ab-test/main/Datasets_cleaned/df_pt_1%262_ABC.csv')

In [22]:
# Preview the first five rows to check which columns were loaded
df.head(5)

Unnamed: 0.1,Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,year,month,day,time,Variation,time_diff,time_diff_seconds
106314,107971,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,2017,4,26,13:22:17,Test,NaT,
106313,107970,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,2017,4,26,13:23:09,Test,0 days 00:00:52,52.0
235345,239141,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,2017,4,9,16:20:56,Test,NaT,
235344,239140,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,2017,4,9,16:21:12,Test,0 days 00:00:16,16.0
235343,239139,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,2017,4,9,16:21:21,Test,0 days 00:00:09,9.0


In [19]:

# Funnel steps in the order a visit is expected to go through them
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Build the working frame as one chain instead of mutating a filtered slice in
# place: parse the timestamps, keep only rows whose step is a known funnel step,
# detach the result with .copy() (so the column assignments below cannot raise
# SettingWithCopyWarning), then order events by visit, Variation and time.
df = (
    df.assign(date_time=pd.to_datetime(df['date_time']))
    .loc[lambda frame: frame['process_step'].isin(step_order)]
    .copy()
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Time elapsed since the previous event within the same (visit_id, Variation);
# the first event of each group has no predecessor and gets NaT.
df['time_diff'] = df.groupby(['visit_id', 'Variation'])['date_time'].diff()

# Same gap expressed in seconds (NaN wherever time_diff is NaT)
df['time_diff_seconds'] = df['time_diff'].dt.total_seconds()

# Function to calculate the average duration for transitions within each Variation group
def calculate_average_durations(group_df, steps=None):
    """Average time, in seconds, between consecutive funnel steps.

    For every visit, the earliest timestamp of each step is taken. For each
    pair of neighbouring steps that both occur in a visit, the elapsed time
    from the first step to the next is measured, truncated to whole seconds.
    Only strictly positive durations are kept, so a visit that reached the
    later step before (or at the same second as) the earlier one does not
    contribute to that transition.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Event rows with 'visit_id', 'process_step' and a datetime-typed
        'date_time' column.
    steps : sequence of str, optional
        Step names in funnel order. Defaults to the notebook-level
        ``step_order``. Transition keys are derived from this sequence, so
        the keys and the step order cannot drift apart.

    Returns
    -------
    dict
        Maps '<from>_to_<to>' to the mean duration in seconds, in funnel
        order. A transition with no positive duration maps to 0.
    """
    if steps is None:
        steps = step_order
    steps = list(steps)

    # Earliest timestamp at which each visit reached each step: one groupby
    # per step instead of re-sorting and re-masking every visit separately.
    first_seen = {
        step: group_df.loc[group_df['process_step'] == step]
        .groupby('visit_id')['date_time']
        .min()
        for step in steps
    }

    average_durations = {}
    for step_from, step_to in zip(steps[:-1], steps[1:]):
        began = first_seen[step_from]
        reached = first_seen[step_to]

        # Only visits that contain both steps can contribute to this transition
        shared_visits = began.index.intersection(reached.index)

        positive_seconds = []
        if len(shared_visits) > 0:
            elapsed = (reached.loc[shared_visits] - began.loc[shared_visits]).dropna()
            # Whole seconds as floats, then drop zero/negative gaps
            seconds = elapsed.to_numpy().astype('timedelta64[s]').astype(float)
            positive_seconds = seconds[seconds > 0]

        average_durations[f'{step_from}_to_{step_to}'] = (
            float(positive_seconds.mean()) if len(positive_seconds) > 0 else 0
        )

    return average_durations

# Split the DataFrame by Variation and calculate the average durations
# Average transition durations, keyed by Variation
results = {
    variation: calculate_average_durations(variation_rows)
    for variation, variation_rows in df.groupby('Variation')
}

# Print one two-column table (transition name, mean seconds) per Variation
for variation, avg_durations in results.items():
    print(f"Variation: {variation}")
    average_durations_df = pd.DataFrame({
        'Transition': list(avg_durations.keys()),
        'Average Duration (seconds)': list(avg_durations.values()),
    })
    print(average_durations_df)
    print("\n")

Variation: Control
          Transition  Average Duration (seconds)
0    start_to_step_1                   64.306905
1   step_1_to_step_2                   50.576086
2   step_2_to_step_3                  110.146508
3  step_3_to_confirm                  168.500918


Variation: Test
          Transition  Average Duration (seconds)
0    start_to_step_1                   52.481851
1   step_1_to_step_2                   76.060627
2   step_2_to_step_3                  107.843462
3  step_3_to_confirm                  143.410847




In [None]:
##### Good above ####

In [27]:

# Fail fast if the Variation column is missing
if 'Variation' not in df.columns:
    raise KeyError("'Variation' column not found in the DataFrame.")

# Funnel steps in expected order, plus each step's position in that order
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
step_map = {step: idx for idx, step in enumerate(step_order)}

# Rebuild df as one chain rather than sorting a filtered slice in place:
# parse timestamps, keep only known funnel steps, detach with .copy() to avoid
# SettingWithCopyWarning on later assignments, and order events by visit,
# Variation and time.
df = (
    df.assign(date_time=pd.to_datetime(df['date_time']))
    .loc[lambda frame: frame['process_step'].isin(step_order)]
    .copy()
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Keep only visits that contain a 'start' event and have more than one event
valid_visits = df.groupby('visit_id').filter(
    lambda visit: 'start' in visit['process_step'].values and len(visit) > 1
)

# Calculate backward steps
def calculate_backward_steps(group, step_index=None):
    """Count the backward moves in a sequence of process steps.

    A backward move is a pair of consecutive rows in which the later row's
    step sits earlier in the funnel than the step of the row before it.
    Consecutive repeats of the same step are not counted.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a 'process_step' column, already in the order in which
        consecutive steps should be compared.
    step_index : mapping, optional
        Maps each step name to its position in the funnel. Defaults to the
        notebook-level ``step_map``.

    Returns
    -------
    int
        Number of backward moves found.
    """
    if step_index is None:
        step_index = step_map

    steps = group['process_step'].values
    # Walk the consecutive (previous, current) pairs
    return sum(
        1
        for previous, current in zip(steps[:-1], steps[1:])
        if current != previous and step_index[current] < step_index[previous]
    )

# Apply the function to each group of visit_id
# One row per visit_id with its backward-step count
backward_steps = (
    valid_visits
    .groupby('visit_id')
    .apply(calculate_backward_steps)
    .reset_index(name='backward_steps')
)

# Grand total over every visit
total_backward_steps = backward_steps.loc[:, 'backward_steps'].sum()

# Report the grand total
print(f"Total number of backward steps for all visit_id: {total_backward_steps}")


Total number of backward steps for all visit_id: 25805


In [28]:

# Funnel steps in expected order, plus each step's position in that order
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
step_map = {step: idx for idx, step in enumerate(step_order)}

# Rebuild df as one chain rather than sorting a filtered slice in place:
# parse timestamps, keep only known funnel steps, detach with .copy() to avoid
# SettingWithCopyWarning on later assignments, and order events by visit,
# Variation and time.
df = (
    df.assign(date_time=pd.to_datetime(df['date_time']))
    .loc[lambda frame: frame['process_step'].isin(step_order)]
    .copy()
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Keep only visits that contain a 'start' event and have more than one event
valid_visits = df.groupby('visit_id').filter(
    lambda visit: 'start' in visit['process_step'].values and len(visit) > 1
)

# Calculate backward steps
def calculate_backward_steps(group, step_index=None):
    """Count the backward moves in a sequence of process steps.

    A backward move is a pair of consecutive rows in which the later row's
    step sits earlier in the funnel than the step of the row before it.
    Consecutive repeats of the same step are not counted.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a 'process_step' column, already in the order in which
        consecutive steps should be compared.
    step_index : mapping, optional
        Maps each step name to its position in the funnel. Defaults to the
        notebook-level ``step_map``.

    Returns
    -------
    int
        Number of backward moves found.
    """
    if step_index is None:
        step_index = step_map

    steps = group['process_step'].values
    # Walk the consecutive (previous, current) pairs
    return sum(
        1
        for previous, current in zip(steps[:-1], steps[1:])
        if current != previous and step_index[current] < step_index[previous]
    )

# Apply the function to each group of visit_id and Variation
# One row per (Variation, visit_id) with its backward-step count
backward_steps = (
    valid_visits
    .groupby(['Variation', 'visit_id'])
    .apply(calculate_backward_steps)
    .reset_index(name='backward_steps')
)

# Add the per-visit counts up within each Variation
total_backward_steps_by_variation = (
    backward_steps.groupby('Variation', as_index=False)['backward_steps'].sum()
)

# Report the per-Variation totals
print("Total number of backward steps by Variation:")
print(total_backward_steps_by_variation)

Total number of backward steps by Variation:
  Variation  backward_steps
0   Control            9504
1      Test           16194


In [29]:

# Funnel steps in expected order, plus each step's position in that order
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
step_map = {step: idx for idx, step in enumerate(step_order)}

# Rebuild df as one chain rather than sorting a filtered slice in place:
# parse timestamps, keep only known funnel steps, detach with .copy() to avoid
# SettingWithCopyWarning on later assignments, and order events by visit,
# Variation and time.
df = (
    df.assign(date_time=pd.to_datetime(df['date_time']))
    .loc[lambda frame: frame['process_step'].isin(step_order)]
    .copy()
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Keep only visits that contain a 'start' event and have more than one event
valid_visits = df.groupby('visit_id').filter(
    lambda visit: 'start' in visit['process_step'].values and len(visit) > 1
)

# Calculate backward steps and total steps
def calculate_steps(group, step_index=None):
    """Count step changes and backward moves in a sequence of process steps.

    Consecutive rows are compared pairwise. A pair with two different steps
    counts as one step change; if the later step sits earlier in the funnel
    than the one before it, the pair also counts as a backward move.
    Consecutive repeats of the same step are ignored.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a 'process_step' column, already in the order in which
        consecutive steps should be compared.
    step_index : mapping, optional
        Maps each step name to its position in the funnel. Defaults to the
        notebook-level ``step_map``.

    Returns
    -------
    tuple of (int, int)
        ``(total_steps, backward_steps)``.
    """
    if step_index is None:
        step_index = step_map

    steps = group['process_step'].values
    total_steps = 0
    backward_steps = 0

    for previous, current in zip(steps[:-1], steps[1:]):
        if current == previous:
            continue  # repeated step: not a transition
        total_steps += 1
        if step_index[current] < step_index[previous]:
            backward_steps += 1

    return total_steps, backward_steps

# Apply the function to each group of Variation and visit_id
# One row per (Variation, visit_id) holding the (total, backward) tuple
results = (
    valid_visits
    .groupby(['Variation', 'visit_id'])
    .apply(calculate_steps)
    .reset_index(name='steps')
)

# Unpack each tuple into its own pair of numeric columns
step_pairs = pd.DataFrame(results['steps'].tolist(), index=results.index)
results[['total_steps', 'backward_steps']] = step_pairs

# Per-Variation sums of both counters
per_variation = results.groupby('Variation')
total_steps_by_variation = per_variation['total_steps'].sum().reset_index(name='total_steps')
backward_steps_by_variation = per_variation['backward_steps'].sum().reset_index(name='backward_steps')

# Put the two sums side by side, one row per Variation
merged_results = total_steps_by_variation.merge(backward_steps_by_variation, on='Variation')

# Error rate = share of step changes that went backward
merged_results['error_rate'] = merged_results['backward_steps'].div(merged_results['total_steps'])

# Report the table
print("Error rates by Variation:")
print(merged_results)


Error rates by Variation:
  Variation  total_steps  backward_steps  error_rate
0   Control        93502            9504    0.101645
1      Test       119375           16194    0.135657
