In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load the CSV published in the vanguard-ab-test repository
# (Datasets_cleaned/df_pt_1&2_ABC.csv; '%26' in the URL is the encoded '&').
DATA_URL = 'https://raw.githubusercontent.com/KLoVi/vanguard-ab-test/main/Datasets_cleaned/df_pt_1%262_ABC.csv'
df = pd.read_csv(DATA_URL)

In [2]:
# Drop the leftover index column written by a previous to_csv().
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Number of distinct values per column (quick sanity check of the schema).
df.nunique()

client_id        50500
visitor_id       56011
visit_id         69205
process_step         5
date_time       283344
year                 1
month                4
day                 31
time             69704
Variation            2
dtype: int64

In [3]:
# Expected order of the funnel steps, first to last.
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Position of every step inside step_order ({'start': 0, ..., 'confirm': 4});
# calculate_steps compares these positions to detect backward moves.
step_map = dict(zip(step_order, range(len(step_order))))

# Parse the timestamps, keep only rows whose step is one of the known funnel
# steps, and order the events by visit, variation and time.
df = df.assign(date_time=pd.to_datetime(df['date_time']))
df = df.loc[df['process_step'].isin(step_order)]
df = df.sort_values(by=['visit_id', 'Variation', 'date_time'])


def _has_start_and_more(visit):
    """Return True when a visit has more than one row and at least one 'start' row."""
    return len(visit) > 1 and 'start' in visit['process_step'].values


# Keep only the visits that contain a 'start' event and more than one logged event.
# groupby().filter() retains every row of each group for which the predicate is True.
valid_visits = df.groupby('visit_id').filter(_has_start_and_more)

# Calculate backward steps and total steps
# Total Steps: The number of times the process moves from one step to another, regardless of the direction.
# Backward Steps: The number of times the process moves to a step with a lower index, indicating a deviation from the expected order.

def calculate_steps(group, step_index=None):
    """Count step transitions and backward transitions within one visit.

    Parameters
    ----------
    group : DataFrame-like
        Must expose a 'process_step' column whose rows are already in the
        order in which the events should be compared (the function does not
        sort them itself).
    step_index : mapping, optional
        Maps each step name to its position in the funnel. When omitted, the
        notebook-level ``step_map`` is used, so existing calls such as
        ``groupby(...).apply(calculate_steps)`` keep working unchanged.

    Returns
    -------
    tuple of (int, int)
        ``(total_steps, backward_steps)`` where ``total_steps`` is the number
        of consecutive rows whose step differs, and ``backward_steps`` is the
        number of those changes that land on a step with a lower position.

    Raises
    ------
    KeyError
        If a step involved in a transition is missing from ``step_index``.
    """
    if step_index is None:
        # Resolved at call time so the function follows the current notebook global.
        step_index = step_map

    steps = group['process_step'].values

    total_steps = 0
    backward_steps = 0

    # Walk over consecutive (previous, current) pairs; an empty or single-row
    # group yields no pairs and therefore (0, 0).
    for previous, current in zip(steps[:-1], steps[1:]):
        if current == previous:
            # Repeated step (e.g. a reload): not a transition.
            continue
        total_steps += 1
        if step_index[current] < step_index[previous]:
            backward_steps += 1

    return total_steps, backward_steps

# Count transitions for every (Variation, visit_id) pair.
# Only 'process_step' is selected before apply(): calculate_steps reads no other
# column, and this keeps apply() from operating on the grouping columns, which
# pandas has deprecated since version 2.2.
steps_per_visit = (
    valid_visits
    .groupby(['Variation', 'visit_id'])[['process_step']]
    .apply(calculate_steps)
)

# reset_index turns the (Variation, visit_id) index levels into regular columns
# and names the tuple-valued column 'steps'.
results = steps_per_visit.reset_index(name='steps')

# Split each (total_steps, backward_steps) tuple into two separate columns.
results[['total_steps', 'backward_steps']] = pd.DataFrame(results['steps'].tolist(), index=results.index)

# Sum the total steps and backward steps for each Variation
total_steps_by_variation = results.groupby('Variation')['total_steps'].sum().reset_index(name='total_steps')
backward_steps_by_variation = results.groupby('Variation')['backward_steps'].sum().reset_index(name='backward_steps')

# Merge the two DataFrames
merged_results = pd.merge(total_steps_by_variation, backward_steps_by_variation, on='Variation')

# Error rate = share of transitions that go backward, as a percentage.
merged_results['error_rate'] = (merged_results['backward_steps'] / merged_results['total_steps'])*100

# Output the results
print("Error rates by Variation:")
print(merged_results)


Error rates by Variation:
  Variation  total_steps  backward_steps  error_rate
0   Control        93502            9504   10.164488
1      Test       119375           16194   13.565654


In [4]:
# Print a summary of valid_visits: column dtypes, non-null counts and memory usage.
valid_visits.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300021 entries, 235345 to 95169
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   client_id     300021 non-null  int64         
 1   visitor_id    300021 non-null  object        
 2   visit_id      300021 non-null  object        
 3   process_step  300021 non-null  object        
 4   date_time     300021 non-null  datetime64[ns]
 5   year          300021 non-null  int64         
 6   month         300021 non-null  int64         
 7   day           300021 non-null  int64         
 8   time          300021 non-null  object        
 9   Variation     300021 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(5)
memory usage: 25.2+ MB


In [17]:
# Number of rows in valid_visits per client_id, most frequent first.
valid_visits["client_id"].value_counts()

client_id
9638063    68
2313292    61
4167815    59
2261960    58
5165430    58
           ..
1672891     2
1975168     2
8784101     1
8889840     1
325599      1
Name: count, Length: 46144, dtype: int64

In [10]:
# Recompute the per-visit (total_steps, backward_steps) tuples.
# Only 'process_step' is selected before apply(): calculate_steps reads no other
# column, and this keeps apply() from operating on the grouping columns, which
# pandas has deprecated since version 2.2.
results = (
    valid_visits
    .groupby(['Variation', 'visit_id'])[['process_step']]
    .apply(calculate_steps)
    .reset_index(name='steps')
)

In [11]:
# Display the per-visit table: one row per (Variation, visit_id) with its 'steps' tuple.
results

Unnamed: 0,Variation,visit_id,steps
0,Control,100037962_47432393712_705583,"(2, 1)"
1,Control,100057941_88477660212_944512,"(6, 1)"
2,Control,10006594_66157970412_679648,"(5, 1)"
3,Control,10007589_47780784567_391490,"(4, 0)"
4,Control,100096068_8301717872_987164,"(1, 0)"
...,...,...,...
55945,Test,999960019_60838685252_926860,"(5, 2)"
55946,Test,999971096_28827267783_236076,"(4, 0)"
55947,Test,999976049_95772503197_182554,"(4, 0)"
55948,Test,999984454_18731538378_781808,"(4, 0)"


In [12]:
# Expand each (total_steps, backward_steps) tuple into its own pair of columns,
# aligned on the existing row index of results.
step_pairs = results['steps'].tolist()
results[['total_steps', 'backward_steps']] = pd.DataFrame(step_pairs, index=results.index)

In [13]:
# Display results again, now including the total_steps and backward_steps columns.
results

Unnamed: 0,Variation,visit_id,steps,total_steps,backward_steps
0,Control,100037962_47432393712_705583,"(2, 1)",2,1
1,Control,100057941_88477660212_944512,"(6, 1)",6,1
2,Control,10006594_66157970412_679648,"(5, 1)",5,1
3,Control,10007589_47780784567_391490,"(4, 0)",4,0
4,Control,100096068_8301717872_987164,"(1, 0)",1,0
...,...,...,...,...,...
55945,Test,999960019_60838685252_926860,"(5, 2)",5,2
55946,Test,999971096_28827267783_236076,"(4, 0)",4,0
55947,Test,999976049_95772503197_182554,"(4, 0)",4,0
55948,Test,999984454_18731538378_781808,"(4, 0)",4,0


In [14]:
# Total number of step transitions per Variation, as a two-column frame.
total_steps_by_variation = (
    results
    .groupby('Variation')['total_steps']
    .sum()
    .reset_index(name='total_steps')
)
total_steps_by_variation

Unnamed: 0,Variation,total_steps
0,Control,93502
1,Test,119375


In [15]:
# Total number of backward transitions per Variation, as a two-column frame.
backward_steps_by_variation = (
    results
    .groupby('Variation')['backward_steps']
    .sum()
    .reset_index(name='backward_steps')
)
backward_steps_by_variation

Unnamed: 0,Variation,backward_steps
0,Control,9504
1,Test,16194


In [16]:
# Join the two per-Variation summaries on their shared 'Variation' column.
merged_results = total_steps_by_variation.merge(
    backward_steps_by_variation,
    on='Variation',
)
merged_results

Unnamed: 0,Variation,total_steps,backward_steps
0,Control,93502,9504
1,Test,119375,16194


In [17]:
# Error rate = backward transitions divided by all transitions, as a percentage.
backward_share = merged_results['backward_steps'].div(merged_results['total_steps'])
merged_results['error_rate'] = backward_share.mul(100)
merged_results

Unnamed: 0,Variation,total_steps,backward_steps,error_rate
0,Control,93502,9504,10.164488
1,Test,119375,16194,13.565654


In [18]:
# Bind a second name to the per-visit table for the demo export.
# NOTE: this is plain assignment, so both names refer to the same DataFrame
# object (no copy is made); in-place changes through either name affect both.
results_to_DEMO = results

In [19]:
# Display the table that is intended for the demo export.
results_to_DEMO 

Unnamed: 0,Variation,visit_id,steps,total_steps,backward_steps
0,Control,100037962_47432393712_705583,"(2, 1)",2,1
1,Control,100057941_88477660212_944512,"(6, 1)",6,1
2,Control,10006594_66157970412_679648,"(5, 1)",5,1
3,Control,10007589_47780784567_391490,"(4, 0)",4,0
4,Control,100096068_8301717872_987164,"(1, 0)",1,0
...,...,...,...,...,...
55945,Test,999960019_60838685252_926860,"(5, 2)",5,2
55946,Test,999971096_28827267783_236076,"(4, 0)",4,0
55947,Test,999976049_95772503197_182554,"(4, 0)",4,0
55948,Test,999984454_18731538378_781808,"(4, 0)",4,0


In [22]:
# Disabled export of results_to_DEMO to CSV (without the index).
# NOTE(review): the target is an absolute path on one specific machine; switch to
# a path relative to the repository before re-enabling this line.
#results_to_DEMO.to_csv("/Users/karollvivianalopezvillegas/GitHub/vanguard-ab-test/Tableau/Errors_for_DEMO.csv",  index=False)