In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Load the cleaned Vanguard A/B-test event log (client / visit / process step / timestamp / Variation)
# directly from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/KLoVi/vanguard-ab-test/main/Datasets_cleaned/df_pt_1%262_ABC.csv'
df = pd.read_csv(DATA_URL)

In [2]:
# Drop the index column that was saved into the CSV. errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.nunique()

client_id        50500
visitor_id       56011
visit_id         69205
process_step         5
date_time       283344
year                 1
month                4
day                 31
time             69704
Variation            2
dtype: int64

In [3]:
# Funnel steps, listed in the order used to build the step-to-step transitions.
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
# Keep only rows whose step is one of the known steps (membership check only).
# .copy() makes df an independent frame, so columns added to it later do not
# trigger SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[df['process_step'].isin(step_order)].copy()
df.nunique()

client_id        50500
visitor_id       56011
visit_id         69205
process_step         5
date_time       283344
year                 1
month                4
day                 31
time             69704
Variation            2
dtype: int64

In [4]:
# Pair every step with the step that follows it
[(current_step, next_step) for current_step, next_step in zip(step_order, step_order[1:])]

[('start', 'step_1'),
 ('step_1', 'step_2'),
 ('step_2', 'step_3'),
 ('step_3', 'confirm')]

In [5]:
# Order the events by visit, then Variation, then timestamp, and preview the first rows
df = df.sort_values(by=['visit_id', 'Variation', 'date_time'])
df.head(25)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,year,month,day,time,Variation
106314,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,2017,4,26,13:22:17,Test
106313,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,2017,4,26,13:23:09,Test
235345,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,2017,4,9,16:20:56,Test
235344,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,2017,4,9,16:21:12,Test
235343,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,2017,4,9,16:21:21,Test
235342,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:35,2017,4,9,16:21:35,Test
235341,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:41,2017,4,9,16:21:41,Test
235340,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:45,2017,4,9,16:21:45,Test
235339,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:21:59,2017,4,9,16:21:59,Test
235338,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:22:04,2017,4,9,16:22:04,Test


In [6]:
# Each keyword is evaluated in order, so later columns can use the ones created before them:
#   date_time         -> parsed timestamps
#   time_diff         -> gap to the previous row of the same (visit_id, Variation)
#   time_diff_seconds -> the same gap expressed in seconds
df = df.assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time']),
    time_diff=lambda frame: frame.groupby(['visit_id', 'Variation'])['date_time'].diff(),
    time_diff_seconds=lambda frame: frame['time_diff'].dt.total_seconds(),
)
df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,year,month,day,time,Variation,time_diff,time_diff_seconds
106314,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,2017,4,26,13:22:17,Test,NaT,
106313,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,2017,4,26,13:23:09,Test,0 days 00:00:52,52.0
235345,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,2017,4,9,16:20:56,Test,NaT,
235344,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,2017,4,9,16:21:12,Test,0 days 00:00:16,16.0
235343,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,2017,4,9,16:21:21,Test,0 days 00:00:09,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95173,6627522,730634087_44272418812,999988789_76411676596_272843,start,2017-04-21 23:49:11,2017,4,21,23:49:11,Test,NaT,
95172,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:49:22,2017,4,21,23:49:22,Test,0 days 00:00:11,11.0
95171,6627522,730634087_44272418812,999988789_76411676596_272843,step_2,2017-04-21 23:50:16,2017,4,21,23:50:16,Test,0 days 00:00:54,54.0
95170,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:51:00,2017,4,21,23:51:00,Test,0 days 00:00:44,44.0


In [7]:
# Define the order of the funnel steps used to build the transitions
step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Prepare the event log in one chain; the result is an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning.
#  1. keep only rows whose step is one of the known steps (a membership check, not an
#     ordering check; with the current data every row passes, so the 69,205 visit IDs
#     are unchanged and this filter is redundant but harmless)
#  2. convert date_time to datetime format
#  3. sort by visit_id, Variation (Test / Control) and date_time, ascending
df = (
    df[df['process_step'].isin(step_order)]
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
    .sort_values(by=['visit_id', 'Variation', 'date_time'])
)

# Time elapsed since the previous event of the same visit_id / Variation
df['time_diff'] = df.groupby(['visit_id', 'Variation'])['date_time'].diff()

# Convert time differences to total seconds
df['time_diff_seconds'] = df['time_diff'].dt.total_seconds()

# Function to calculate the average duration for transitions within each Variation group (work on subsets of df)
# In the code, group_df is not explicitly defined by name, but it is represented by 
# the variable group in the loop that iterates over the groups created by df.groupby('Variation')


def calculate_average_durations(group_df, steps=None):
    """Average time, in seconds, between consecutive steps of the process.

    For every visit the earliest timestamp of each step is used. For each pair
    of consecutive steps, the elapsed time between those two first occurrences
    is computed (truncated to whole seconds). Only strictly positive durations
    contribute to the average, so a visit where the later step was first seen
    before the earlier one is ignored for that transition.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows to analyse. Must contain the columns 'visit_id', 'process_step'
        and 'date_time' (datetime dtype). Row order does not matter.
    steps : sequence of str, optional
        Ordered step names. When omitted, the notebook-level ``step_order``
        list is used, which keeps existing one-argument calls working.

    Returns
    -------
    dict
        Maps '<step_from>_to_<step_to>' to the average duration in seconds.
        A transition with no positive duration at all maps to 0.
    """
    if steps is None:
        steps = step_order
    steps = list(steps)

    # Build the transition names from the step list itself so the two can never
    # get out of sync (hard-coded names would break as soon as the steps change).
    transition_pairs = list(zip(steps[:-1], steps[1:]))
    average_durations = {f'{step_from}_to_{step_to}': 0 for step_from, step_to in transition_pairs}

    if group_df.empty:
        return average_durations

    # One row per visit, one column per step, holding the first time that step
    # was seen in the visit (NaT when the visit never reached the step).
    first_seen = (
        group_df.groupby(['visit_id', 'process_step'])['date_time']
        .min()
        .unstack('process_step')
    )

    for step_from, step_to in transition_pairs:
        # A step that never occurs in this subset has no column at all.
        if step_from not in first_seen.columns or step_to not in first_seen.columns:
            continue

        # dropna() removes the visits that are missing either of the two steps.
        elapsed = (first_seen[step_to] - first_seen[step_from]).dropna()
        seconds = elapsed.to_numpy().astype('timedelta64[s]').astype(float)

        # Ensure time difference is positive
        seconds = seconds[seconds > 0]
        if seconds.size:
            average_durations[f'{step_from}_to_{step_to}'] = float(seconds.sum() / seconds.size)

    return average_durations

# Split the DataFrame by Variation and calculate the average durations for each part.
#
# df.groupby('Variation') yields one (variation, group) pair per distinct value:
#   variation: the name of the current group (the value of the Variation column)
#   group:     a DataFrame with all rows of df that have that Variation value;
#              inside calculate_average_durations this subset is called group_df
#
# results ends up as a dictionary of dictionaries:
#   key:   the variation
#   value: the average durations (keys: transitions, values: average time difference per transition)
results = {
    variation: calculate_average_durations(group)
    for variation, group in df.groupby('Variation')
}

     
# Print one table per variation
for variation, avg_durations in results.items():
    print(f"Variation: {variation}")
    # Build the table from two parallel columns: the dictionary keys (transition names)
    # and the dictionary values (average durations). Each transition becomes one row.
    average_durations_df = pd.DataFrame({
        'Transition': list(avg_durations.keys()),
        'Average Duration (seconds)': list(avg_durations.values()),
    })
    print(average_durations_df)
    print("\n")

Variation: Control
          Transition  Average Duration (seconds)
0    start_to_step_1                   64.306905
1   step_1_to_step_2                   50.576086
2   step_2_to_step_3                  110.146508
3  step_3_to_confirm                  168.500918


Variation: Test
          Transition  Average Duration (seconds)
0    start_to_step_1                   52.481851
1   step_1_to_step_2                   76.060627
2   step_2_to_step_3                  107.843462
3  step_3_to_confirm                  143.410847


