In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load the cleaned dataset of the vanguard-ab-test project straight from GitHub
# (file name suggests parts 1 & 2 combined; "%26" in the URL is an encoded "&").
DATA_URL = 'https://raw.githubusercontent.com/KLoVi/vanguard-ab-test/main/Datasets_cleaned/df_pt_1%262_ABC.csv'
df = pd.read_csv(DATA_URL)

In [2]:
# Drop the "Unnamed: 0" column (presumably the index written by an earlier
# to_csv call - confirm). Rebinding `df` with errors="ignore" instead of using
# inplace=True keeps the cell re-runnable: a second run no longer raises
# KeyError because the column is already gone.
# The former bare `df.head(5)` was removed: it was not the last expression of
# the cell, so its result was computed and silently discarded.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Number of distinct values per column (last expression, so it is displayed).
df.nunique()

client_id        50500
visitor_id       56011
visit_id         69205
process_step         5
date_time       283344
year                 1
month                4
day                 31
time             69704
Variation            2
dtype: int64

In [29]:
# Parse the timestamps into datetime values
df['date_time'] = pd.to_datetime(df['date_time'])

# Funnel steps in their expected order; step_map gives each step its position
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
step_map = {step: idx for idx, step in enumerate(step_order)}

# Keep only rows whose step belongs to the funnel, then sort by visit,
# variation and time. The sort is chained (no inplace=True): sorting a
# boolean-filtered frame in place can emit SettingWithCopyWarning, whereas the
# chained form always yields a fresh, independent frame.
df = (
    df[df['process_step'].isin(step_order)]
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Keep only visits that contain a 'start' event and more than one logged row
valid_visits = df.groupby('visit_id').filter(
    lambda x: 'start' in x['process_step'].values and len(x['process_step'].values) > 1
)

# Calculate backward steps and total steps
def calculate_steps(group):
    """Count step changes and backward moves in one group of events.

    Parameters
    ----------
    group : mapping-like (e.g. a DataFrame)
        ``group['process_step'].values`` must yield the step names in the
        order in which they should be walked.

    Returns
    -------
    tuple of (int, int)
        ``(total_steps, backward_steps)``: the number of consecutive pairs
        whose step differs, and how many of those move to a step with a lower
        position in the module-level ``step_map``.

    Raises
    ------
    KeyError
        If a step involved in a change is missing from ``step_map``.
    """
    sequence = group['process_step'].values
    moves = 0
    regressions = 0

    # Walk consecutive (previous, current) pairs; repeated steps are not moves.
    for previous, current in zip(sequence[:-1], sequence[1:]):
        if current == previous:
            continue
        moves += 1
        if step_map[current] < step_map[previous]:
            regressions += 1

    return moves, regressions

# Count transitions for every (Variation, visit_id) group. Only the
# 'process_step' column is handed to the helper, so the apply no longer
# operates on the grouping columns (deprecated for DataFrameGroupBy.apply in
# pandas >= 2.2); the helper reads nothing else, so the result is unchanged.
results = (
    valid_visits
    .groupby(['Variation', 'visit_id'])[['process_step']]
    .apply(calculate_steps)
    .reset_index(name='steps')
)

# Split the (total_steps, backward_steps) tuples into separate columns
results[['total_steps', 'backward_steps']] = pd.DataFrame(results['steps'].tolist(), index=results.index)

# Sum the total steps and backward steps for each Variation
total_steps_by_variation = results.groupby('Variation')['total_steps'].sum().reset_index(name='total_steps')
backward_steps_by_variation = results.groupby('Variation')['backward_steps'].sum().reset_index(name='backward_steps')

# Merge the two DataFrames into one row per Variation
merged_results = pd.merge(total_steps_by_variation, backward_steps_by_variation, on='Variation')

# Error rate = share of step changes that went backwards
merged_results['error_rate'] = merged_results['backward_steps'] / merged_results['total_steps']

# Output the results
print("Error rates by Variation:")
print(merged_results)


Error rates by Variation:
  Variation  total_steps  backward_steps  error_rate
0   Control        93502            9504    0.101645
1      Test       119375           16194    0.135657


In [None]:
# STEP BY STEP

In [3]:

# Ensure 'Variation' column exists
if 'Variation' not in df.columns:
    raise KeyError("'Variation' column not found in the DataFrame.")

# Parse the timestamps into datetime values
df['date_time'] = pd.to_datetime(df['date_time'])

# Funnel steps in their expected order; step_map gives each step its position
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
step_map = {step: idx for idx, step in enumerate(step_order)}

# Keep only rows whose step belongs to the funnel, then sort by visit,
# variation and time. The sort is chained (no inplace=True): sorting a
# boolean-filtered frame in place can emit SettingWithCopyWarning, whereas the
# chained form always yields a fresh, independent frame.
df = (
    df[df['process_step'].isin(step_order)]
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Keep only visits that contain a 'start' event and more than one logged row
valid_visits = df.groupby('visit_id').filter(
    lambda x: 'start' in x['process_step'].values and len(x['process_step'].values) > 1
)

# Calculate backward steps
def calculate_backward_steps(group):
    """Count the backward moves in one group of events.

    Parameters
    ----------
    group : mapping-like (e.g. a DataFrame)
        ``group['process_step'].values`` must yield the step names in the
        order in which they should be walked.

    Returns
    -------
    int
        Number of consecutive pairs whose step differs and whose later step
        has a lower position in the module-level ``step_map``.

    Raises
    ------
    KeyError
        If a step involved in a change is missing from ``step_map``.
    """
    sequence = group['process_step'].values

    # Pair every event with its predecessor and count the regressions.
    return sum(
        1
        for earlier, later in zip(sequence[:-1], sequence[1:])
        if later != earlier and step_map[later] < step_map[earlier]
    )

# Count backward steps per visit_id. Only the 'process_step' column is handed
# to the helper, so the apply no longer operates on the grouping column
# (deprecated for DataFrameGroupBy.apply in pandas >= 2.2); the helper reads
# nothing else, so the counts are unchanged.
# NOTE(review): this groups by visit_id alone, while the rows were sorted by
# visit_id, Variation, date_time. A visit_id that occurs under both variations
# is therefore walked variation-first rather than chronologically. The total
# printed here (25805) is larger than the per-variation sum reported elsewhere
# in this notebook (9504 + 16194 = 25698) - confirm which grouping is intended.
backward_steps = (
    valid_visits
    .groupby('visit_id')[['process_step']]
    .apply(calculate_backward_steps)
    .reset_index(name='backward_steps')
)

# Sum the total number of backward steps
total_backward_steps = backward_steps['backward_steps'].sum()

# Output the total number of backward steps
print(f"Total number of backward steps for all visit_id: {total_backward_steps}")


Total number of backward steps for all visit_id: 25805


In [None]:
# Parse the timestamps into datetime values
df['date_time'] = pd.to_datetime(df['date_time'])

# Funnel steps in their expected order; step_map gives each step its position
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
step_map = {step: idx for idx, step in enumerate(step_order)}

# Keep only rows whose step belongs to the funnel, then sort by visit,
# variation and time. The sort is chained (no inplace=True): sorting a
# boolean-filtered frame in place can emit SettingWithCopyWarning, whereas the
# chained form always yields a fresh, independent frame.
df = (
    df[df['process_step'].isin(step_order)]
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Keep only visits that contain a 'start' event and more than one logged row
valid_visits = df.groupby('visit_id').filter(
    lambda x: 'start' in x['process_step'].values and len(x['process_step'].values) > 1
)

# Calculate backward steps
def calculate_backward_steps(group):
    """Return how many times a group of events moves back to an earlier step.

    ``group['process_step'].values`` supplies the step names in walking order.
    A pair of consecutive events counts when the steps differ and the later
    one has a lower position in the module-level ``step_map``. A ``KeyError``
    is raised if a step involved in such a comparison is not in ``step_map``.
    """
    observed = group['process_step'].values
    regressions = 0
    previous = None

    for position, current in enumerate(observed):
        # The first event has no predecessor to compare against.
        if position and current != previous and step_map[current] < step_map[previous]:
            regressions += 1
        previous = current

    return regressions

# Count backward steps for every (Variation, visit_id) group. Only the
# 'process_step' column is handed to the helper, so the apply no longer
# operates on the grouping columns (deprecated for DataFrameGroupBy.apply in
# pandas >= 2.2); the helper reads nothing else, so the counts are unchanged.
backward_steps = (
    valid_visits
    .groupby(['Variation', 'visit_id'])[['process_step']]
    .apply(calculate_backward_steps)
    .reset_index(name='backward_steps')
)

# Sum the total number of backward steps for each Variation
total_backward_steps_by_variation = backward_steps.groupby('Variation')['backward_steps'].sum().reset_index()

# Output the total number of backward steps for each Variation
print("Total number of backward steps by Variation:")
print(total_backward_steps_by_variation)