In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import vanguard_functions as vf
from matplotlib.ticker import FuncFormatter
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency
from scipy import stats


%matplotlib inline

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Raise pandas' display limits so long / wide frames are not truncated in the output.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 90

# Raw inputs: the web data arrives in two parts, plus the experiment assignment
# file and the client demographics file.
df_demo = pd.read_csv('df_final_demo.txt')
df_experi = pd.read_csv('df_final_experiment_clients.txt')
df_pt1 = pd.read_csv('df_final_web_data_pt_1.txt')
df_pt2 = pd.read_csv('df_final_web_data_pt_2.txt')



In [None]:
# Missing values per column in the first web-data part.
df_pt1.isna().sum()

In [None]:
# Missing values per column in the second web-data part.
df_pt2.isna().sum()

In [None]:
# Stack the two web-data parts into one frame with a fresh 0..n-1 index.
# (The experiment file is merged in further down, not here.)
web_data_parts = (df_pt1, df_pt2)
df_12 = pd.concat(web_data_parts, ignore_index=True)

In [None]:
# Whitespace clean-up of the demographics frame via the project helper.
# NOTE(review): the return value is not assigned, so the helper presumably
# edits df_demo in place - confirm in vanguard_functions.
vf.strip_replace_ws(df_demo)

In [None]:
# Missing values per demographic column, before any rows are dropped.
df_demo.isna().sum()

In [None]:
# Keep only the client rows in which every column is populated.
df_demo = df_demo.dropna(axis=0, how='any')

In [None]:
# Sanity check: every count should now be zero.
df_demo.isna().sum()

In [None]:
# Replace the terse source column names with readable, consistent ones.
# The later cells read df_demo['gender'], df_demo['age'], ... so the helper is
# relied on to apply the mapping to df_demo itself.
column_replacements = {
    "gendr": "gender",
    "bal": "acct_balance",
    "clnt_tenure_yr": "tenure_years",
    "clnt_tenure_mnth": "tenure_months",
    "clnt_age": "age",
}
vf.rename_columns(df_demo, column_replacements)

In [None]:
# Columns that hold whole numbers and should be stored as integers.
whole_number_columns = (
    'tenure_years',
    'tenure_months',
    'num_accts',
    'calls_6_mnth',
    'logons_6_mnth',
    'age',
)
vf.recast(df_demo, *whole_number_columns)

In [None]:
# Raw gender codes and how often each one occurs.
gender_codes = df_demo['gender']
gender_codes.value_counts()

In [None]:
# "X" and "U" both stand for an unknown gender: fold them into a single
# explicit "Unknown" label, then re-check the counts.
df_demo['gender'] = df_demo['gender'].replace(["X", "U"], "Unknown")
df_demo['gender'].value_counts()

## EDA

In [None]:
# Age distribution of the clients: summary statistics followed by the mode(s).
client_age = df_demo['age']
age_mode = client_age.mode()

print(client_age.describe())
print()
print(f"The mode of client age:\n{age_mode}")

In [None]:
# Plain histogram of client age (no density curve).
client_age = df_demo['age']
sns.histplot(client_age, color='blue', kde=False)

In [None]:
# The raw age histogram looks odd, so group ages into 5-year bins to smooth it.

# Bin edges 0, 5, ..., 100; each label is "<lower edge>-<upper edge>".
# With right=False a bin contains its lower edge but not its upper edge.
age_edges = list(range(0, 101, 5))
bin_labels = [f'{lower}-{upper}' for lower, upper in zip(age_edges[:-1], age_edges[1:])]
df_demo['age_binned'] = pd.cut(
    df_demo['age'],
    bins=age_edges,
    labels=bin_labels,
    right=False,
    ordered=True,
)

# Histogram of the binned ages, one bar per bin in bin order.
plt.figure(figsize=(10, 6))
sns.histplot(df_demo['age_binned'], kde=False, discrete=True)
plt.xticks(rotation=45)
plt.title("Binned Age Distribution (5-year bins)")
plt.show()


As we can see, we have a bimodal distribution with most clients being either between 30-35 or 50-55 years old, with the mean and median being both around 47 years meaning most clients tend to be above middle age.

In [None]:
# Share of each gender label among the clients, as a pie chart.
gender_share = df_demo['gender'].value_counts()
gender_share.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette("Set3"),
)

As we can see, we have a relatively even gender distribution between male, female and unknown, each about a third of the clients.

In [None]:
# Client tenure in years: summary statistics followed by the mode(s).
tenure = df_demo['tenure_years']
tenure_mode = tenure.mode()

print(tenure.describe())
print()
print(f"The mode of tenure years:\n{tenure_mode}")

In [None]:
# Histogram of tenure (years) with a density curve on top.
tenure = df_demo['tenure_years']
sns.histplot(tenure, color='blue', kde=True)

As we can see, the most common tenure (the mode) is around 6 years, while the mean tenure is 12 years and the median is 11 years, meaning that half of the clients are longstanding users with 11 or more years of tenure.

In [None]:
# Account balances: show floats with four fixed decimals so large values are
# not printed in scientific notation, then summarise the column.
pd.set_option('display.float_format', '{:.4f}'.format)
df_demo["acct_balance"].describe()

In [None]:
def _plain_count(tick_value, _position):
    """Render a y tick as a comma-grouped integer instead of scientific notation."""
    return f'{int(tick_value):,}'


# Histogram (with density curve) of every client's account balance.
plt.figure(figsize=(10, 6))
sns.histplot(df_demo['acct_balance'], color='blue', kde=True)
plt.gca().yaxis.set_major_formatter(FuncFormatter(_plain_count))

In [None]:
# Clients ordered from the largest balance down, to eyeball the extreme values.
df_demo.sort_values('acct_balance', ascending=False)

In [None]:
# Box plot of all account balances.
all_balances = df_demo['acct_balance']
sns.boxplot(data=all_balances)

In [None]:
# Trim the extreme right tail: keep clients whose balance is at or below the
# 95th percentile of all balances.
balance_cap = df_demo['acct_balance'].quantile(0.95)
df_acct_no_outlier = df_demo[df_demo['acct_balance'] <= balance_cap]

In [None]:
# Same balance histogram as before, now on the frame with the top 5% removed.
plt.figure(figsize=(10, 6))
trimmed_axes = sns.histplot(df_acct_no_outlier['acct_balance'], color='blue', kde=True)

# Show y ticks as comma-grouped integers rather than scientific notation.
trimmed_axes.yaxis.set_major_formatter(
    FuncFormatter(lambda tick, _pos: format(int(tick), ','))
)

In [None]:
# Balance statistics per gender label (all clients, outliers included).
balance_stats = ["mean", "median", "max", "min", "std"]
df_demo.groupby("gender").agg({"acct_balance": balance_stats}).reset_index()

In [None]:
# Balance by gender, all clients: the extreme values dominate the scale.
sns.boxplot(x='gender', y='acct_balance', data=df_demo, palette="coolwarm")

In [None]:
# Balance by gender again, on the frame without the top 5% of balances.
sns.boxplot(x='gender', y='acct_balance', data=df_acct_no_outlier, palette="coolwarm")

In [None]:
# Balance statistics per gender label after trimming the top 5% of balances.
trimmed_balance_stats = ["mean", "median", "max", "min", "std"]
(
    df_acct_no_outlier
    .groupby("gender")
    .agg({"acct_balance": trimmed_balance_stats})
    .reset_index()
)

As we can see, we have a heavily positively skewed distribution, we had to remove extreme outliers to get a readable visualisation. The mean sits at 147446 while the median is significantly lower at 63334. Gender-wise it seems relatively evenly distributed with men being a bit higher, after removing extreme outliers, which all belonged to men. People with unknown gender have a significantly lower mean and median.

In [None]:
# Tenure (years) by gender label.
sns.boxplot(x='gender', y='tenure_years', data=df_demo, palette="coolwarm")

In [None]:
# Tenure statistics per gender label.
tenure_stats = ["mean", "median", "max", "min", "std"]
df_demo.groupby("gender").agg({"tenure_years": tenure_stats}).reset_index()

Men and women also appear to be similarly longstanding customers, while people with unknown gender tend to be relatively new customers. This makes sense considering that longstanding customers would have more interactions with the staff where at some point their gender would be stated.

In [None]:
# Rows with no Variation value were in neither the control nor the test group:
# keep only the rows that carry a group label, then re-check for missing values.
has_variation = df_experi['Variation'].notna()
df_experi_nonan = df_experi[has_variation]
df_experi_nonan.isna().sum()


In [None]:
# Attach each web-data row to its client's experiment group; clients that are
# missing from either side are dropped (inner join on client_id).
df_merged_12_experi = df_12.merge(df_experi_nonan, how='inner', on='client_id')


In [None]:
# Every row that has at least one exact copy elsewhere in the merged frame
# (keep=False marks all members of a duplicate set, not only the repeats).
is_duplicated = df_merged_12_experi.duplicated(keep=False)
duplicates = df_merged_12_experi[is_duplicated]
duplicates

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats.
is_repeat = df_merged_12_experi.duplicated(keep='first')
df_merged_unique = df_merged_12_experi[~is_repeat]

In [None]:
# Add the cleaned client demographics to every experiment row; rows whose
# client_id is absent from df_demo are dropped (inner join).
df_merged_unique_demo = df_merged_unique.merge(df_demo, how='inner', on='client_id')

In [None]:
# Missing values per column in the fully merged frame.
df_merged_unique_demo.isna().sum()


In [None]:
# Shorter working name for the fully merged frame. This binds a second name to
# the same object (no copy is made), so changes through either name are shared.
df_all = df_merged_unique_demo

In [None]:
# Split the merged frame by experiment arm and give each part a fresh
# 0..n-1 index.
is_control_row = df_all['Variation'].eq("Control")
is_test_row = df_all['Variation'].eq("Test")

df_control = df_all[is_control_row].reset_index(drop=True)
df_test = df_all[is_test_row].reset_index(drop=True)


In [None]:
# Display floats with thousands separators and four decimals from here on.
pd.set_option('display.float_format', '{:,.4f}'.format)

## KPI

In [None]:
# Completion rate per experiment arm, computed by the project helper and
# reported as a percentage. Control is processed first, then test.
rates_by_group = {}
for group_name, group_frame in (("control", df_control), ("test", df_test)):
    group_rate = vf.calculate_completion_rate(group_frame)
    rates_by_group[group_name] = group_rate
    print(f"Completion rate for {group_name} group: {group_rate}%")

completion_rate_control = rates_by_group["control"]
completion_rate_test = rates_by_group["test"]


In [None]:
# Bar chart comparing the completion rate of the two experiment arms.
completion_rates = {
    'Group': ['Test', 'Control'],
    'Completion Rate (%)': [completion_rate_test, completion_rate_control],
}
completion_df = pd.DataFrame(completion_rates)

# One fixed colour per arm, shared by the other comparison charts.
custom_palette = {'Control': 'lightcoral', 'Test': 'skyblue'}

plt.figure(figsize=(3, 6))
bar_plot = sns.barplot(
    data=completion_df,
    x='Group',
    y='Completion Rate (%)',
    palette=custom_palette,
)

bar_plot.set_title('Completion Rates for Test and Control Group')
bar_plot.set_xlabel('Group')
bar_plot.set_ylabel('Completion Rate (%)')
bar_plot.set_ylim(0, 100)  # full percentage scale

# Write each bar's value just above its top edge.
for bar in bar_plot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    bar_plot.annotate(
        f'{bar_height:.2f}%',
        (bar_center, bar_height),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center', va='bottom',
        fontsize=10, color='black',
    )

plt.savefig('completion_rates_plot.png', bbox_inches='tight', dpi=300)

plt.show()

The completion rate in the new design appears to be significantly higher than in the old design.

In [None]:
# Error rates per arm. The helper is given the order of the process steps and
# returns a frame (rebound to the same name), the total number of errors and
# the error rate. The returned frames are read later via their 'errors' column.
step_order_dict = dict(enumerate(['start', 'step_1', 'step_2', 'step_3', 'confirm']))

df_test, total_errors_test, error_rate_test = vf.calculate_error_rate(
    df_test, step_mapping=step_order_dict
)
df_control, total_errors_control, error_rate_control = vf.calculate_error_rate(
    df_control, step_mapping=step_order_dict
)

print(f"Total errors in test group: {total_errors_test}")
print(f"Error rate for test group: {error_rate_test}%\n")
print(f"Total errors in control group: {total_errors_control}")
print(f"Error rate for control group: {error_rate_control}%")


In [None]:
# Bar chart comparing the error rate of the two experiment arms.
error_rates = {
    'Group': ['Test', 'Control'],
    'Error Rate (%)': [error_rate_test, error_rate_control],
}
error_df = pd.DataFrame(error_rates)

custom_palette = {'Control': 'lightcoral', 'Test': 'skyblue'}

plt.figure(figsize=(3, 6))
bar_plot = sns.barplot(data=error_df, x='Group', y='Error Rate (%)', palette=custom_palette)

# Title, axis labels and a 0-100 percentage scale in one call.
bar_plot.set(
    title='Error Rates for Test and Control Groups',
    xlabel='Group',
    ylabel='Error Rate (%)',
    ylim=(0, 100),
)

# Label every bar with its value, slightly above the bar.
for bar in bar_plot.patches:
    value = bar.get_height()
    bar_plot.annotate(
        f'{value:.2f}%',
        (bar.get_x() + bar.get_width() / 2., value),
        ha='center', va='bottom',
        fontsize=10, color='black',
        xytext=(0, 5),
        textcoords='offset points',
    )

plt.savefig('error_rates_plot.png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
# Two-proportion z-test on the error rates: errors per recorded action (row),
# control versus test.
total_errors_control, total_actions_control = df_control['errors'].sum(), len(df_control)
total_errors_test, total_actions_test = df_test['errors'].sum(), len(df_test)

# Element order is [control, test] in both arrays.
errors = np.array([total_errors_control, total_errors_test])
actions = np.array([total_actions_control, total_actions_test])

z_stat, p_value = proportions_ztest(count=errors, nobs=actions)

print(f"Z-statistic: {z_stat}")
print(f"P-value: {p_value}")

# Verdict at the 5% significance level.
alpha = 0.05
verdict = "statistically significant" if p_value < alpha else "not statistically significant"
print(f"The difference in error rates between control and test groups is {verdict}.")


The error rate in the new design appears to be higher than in the old design.

In [None]:
# Time spent on each step, derived per arm by the project helper
# (control first, then test).
df_control_timesort, df_test_timesort = (
    vf.calculate_time_spent_per_step(group_frame)
    for group_frame in (df_control, df_test)
)


In [None]:
# Control group: summary of seconds spent, per process step (outliers included).
control_by_step = df_control_timesort.groupby('process_step')
control_by_step['time_spent_seconds'].describe()

In [None]:
# Test group: summary of seconds spent, per process step (outliers included).
test_by_step = df_test_timesort.groupby('process_step')
test_by_step['time_spent_seconds'].describe()

In [None]:
# Remove rows whose time_spent_seconds the project helper classifies as
# outliers (method='delete'), separately for each arm.
outlier_options = dict(method='delete')
df_control_timesort_no_outliers = vf.tukeys_test_outliers(
    df_control_timesort, 'time_spent_seconds', **outlier_options
)
df_test_timesort_no_outliers = vf.tukeys_test_outliers(
    df_test_timesort, 'time_spent_seconds', **outlier_options
)

In [None]:
# Control group: per-step summary of seconds spent after outlier removal.
control_clean_by_step = df_control_timesort_no_outliers.groupby('process_step')
control_clean_by_step['time_spent_seconds'].describe()

In [None]:
# Test group: per-step summary of seconds spent after outlier removal.
test_clean_by_step = df_test_timesort_no_outliers.groupby('process_step')
test_clean_by_step['time_spent_seconds'].describe()

In [None]:
# Mean seconds per process step for the control group, as a two-column frame.
mean_control = (
    df_control_timesort_no_outliers
    .groupby('process_step')['time_spent_seconds']
    .agg('mean')
    .reset_index()
)
mean_control

In [None]:
# Mean seconds per process step for the test group, as a two-column frame.
mean_test = (
    df_test_timesort_no_outliers
    .groupby('process_step')['time_spent_seconds']
    .agg('mean')
    .reset_index()
)
mean_test

In [None]:
# Welch t-test (unequal variances) of time spent, control vs test, per step.
steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
alpha = 0.05

for step in steps:
    # Rows of the current step in each arm.
    control_rows = df_control_timesort_no_outliers['process_step'] == step
    test_rows = df_test_timesort_no_outliers['process_step'] == step

    # Coerce to numbers; anything unparseable becomes NaN and is dropped.
    control_times = pd.to_numeric(
        df_control_timesort_no_outliers.loc[control_rows, 'time_spent_seconds'],
        errors='coerce',
    ).dropna()
    test_times = pd.to_numeric(
        df_test_timesort_no_outliers.loc[test_rows, 'time_spent_seconds'],
        errors='coerce',
    ).dropna()

    # A test needs observations on both sides.
    if control_times.empty or test_times.empty:
        print(f"Insufficient data for step: {step} (control: {len(control_times)}, test: {len(test_times)})")
        continue

    print(f"{step.capitalize()} - Control group size: {len(control_times)}, Test group size: {len(test_times)}")

    t_stat, p_value = stats.ttest_ind(control_times, test_times, equal_var=False)
    print(f"{step.capitalize()} - t-statistic: {t_stat}, p-value: {p_value}")

    outcome = (
        f"Difference in {step} time is statistically significant.\n"
        if p_value < alpha
        else f"No significant difference in {step} time.\n"
    )
    print(outcome)

In [None]:
# Tag each per-step mean table with its arm so the two can be stacked.
mean_control = mean_control.assign(Group='Control')
mean_test = mean_test.assign(Group='Test')

# Steps shown on the x axis, in process order; 'confirm' is left out of the chart.
order = ['start', 'step_1', 'step_2', 'step_3']

# Stack test on top of control and drop the 'confirm' rows.
combined_df = pd.concat([mean_test, mean_control]).query("process_step != 'confirm'")

custom_palette = {'Control': 'lightcoral', 'Test': 'skyblue'}

# Grouped bars: one pair (test, control) per process step.
plt.figure(figsize=(7, 3))
time_axes = sns.barplot(
    data=combined_df,
    x='process_step',
    y='time_spent_seconds',
    hue='Group',
    order=order,
    palette=custom_palette,
)

time_axes.set_title('Time Spent by Process Step for Test and Control Groups')
time_axes.set_xlabel('Process Step')
time_axes.set_ylabel('Time Spent (seconds)')
time_axes.legend(title='Group')

plt.savefig('time_spent_plot.png', bbox_inches='tight', dpi=300)

plt.show()


There is a statistically significant difference between the test and control groups for time spent, but they are practically very small differences, except for the last step where the control group spent significantly less time. At the start, at step 2 and step 3 the test group fared better, but at step 1 and the confirm step the control group needed less time. Overall a mixed result.

## HYPOTHESIS TESTING

H0: The updated design did not improve the completion rate

H1: The updated design led to a higher completion rate

Significance level (α): 0.05



In [None]:
def _report_completion(group_label, group_frame):
    """Compute one arm's completion rate with the project helper, print it, return it."""
    rate = vf.calculate_completion_rate(group_frame)
    print(f"Completion rate for {group_label} group: {rate}%")
    return rate


# Completion rates again, as the starting point for the hypothesis test.
completion_rate_control = _report_completion("control", df_control)
completion_rate_test = _report_completion("test", df_test)


In [None]:
# Successes: distinct visit_id values that have a 'confirm' row, per arm.
confirmed_visits_control = df_control.loc[df_control['process_step'] == 'confirm', 'visit_id']
confirmed_visits_test = df_test.loc[df_test['process_step'] == 'confirm', 'visit_id']
cfm_user_control = confirmed_visits_control.nunique()
cfm_user_test = confirmed_visits_test.nunique()

# Trials: all distinct visit_id values, per arm.
total_users_control = df_control['visit_id'].nunique()
total_users_test = df_test['visit_id'].nunique()

# Both lists are ordered [test, control], so a positive z means test > control.
successes = [cfm_user_test, cfm_user_control]
nobs = [total_users_test, total_users_control]

# Two-sided two-proportion z-test.
test_stat, p_value = proportions_ztest(count=successes, nobs=nobs)

print(f"Z-statistic: {test_stat}")
print(f"P-value: {p_value}")

In [None]:
# One-tailed version of the completion-rate test.
# H1 is "test completion rate > control completion rate". `successes` and `nobs`
# are ordered [test, control], so that alternative is 'larger'.
# Asking statsmodels for the one-sided p-value directly replaces the manual
# "halve the two-sided p-value, or take 1 - p/2 when z is negative" logic and
# returns the same number. proportions_ztest is already imported at the top of
# the notebook, so no second import is needed here.
test_stat, one_tailed_p_value = proportions_ztest(successes, nobs, alternative='larger')

print(f"Z-statistic: {test_stat}")
print(f"One-tailed P-value: {one_tailed_p_value}")


We reject the null hypothesis: the p-value is below 0.05, and the z-statistic lies more than 3 standard deviations away from the value expected if H0 were true.

Threshold: Vanguard has set this minimum increase in completion rate at 5%. This is the rate at which the projected benefits, in terms of increased user engagement and potential revenue, are estimated to outweigh the costs of the new design.


You are required to carry out another analysis, ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold. If the new design doesn’t lead to at least this level of improvement, it may not be justifiable from a cost perspective, regardless of its statistical significance.

H0 : The new update did not result in an increase in completion rate of at least 5%.

H1 : The new update did result in an increase in completion rate of at least 5%.


In [None]:
# Observed increase in completion rate, in percentage points (test minus control).
# Derived from the rates computed above instead of being typed in by hand, so
# the figure stays correct whenever the data or the cleaning steps change.
comp_rate_increase = completion_rate_test - completion_rate_control
comp_rate_increase

In [None]:
# One-sided z-test against the cost-effectiveness threshold:
# H0: p_test - p_control = 0.05   vs   H1: p_test - p_control > 0.05
# (`successes` and `nobs` are ordered [test, control]).
min_uplift = 0.05
test_stat, p_value = proportions_ztest(
    count=successes,
    nobs=nobs,
    value=min_uplift,
    alternative="larger",
)

print(f"Z-statistic: {test_stat}")
print(f"P-value: {p_value}")

The completion rate increase is around 8.68 percentage points, which is well above the 5-point threshold, and the difference is statistically significant, meaning the new design is justifiable from a cost perspective.

In [None]:
# Age summary over the control group's rows.
df_control.loc[:, 'age'].describe()

In [None]:
# Age summary over the test group's rows.
df_test.loc[:, 'age'].describe()

In [None]:
# Tenure (years) summary over the control group's rows.
df_control.loc[:, 'tenure_years'].describe()

In [None]:
# Tenure (years) summary over the test group's rows.
df_test.loc[:, 'tenure_years'].describe()

In [None]:
# Rows per gender label in the control group.
df_control.loc[:, 'gender'].value_counts()

In [None]:
# Rows per gender label in the test group.
df_test.loc[:, 'gender'].value_counts()

In [None]:
# Design effectiveness
# Was the experiment well structured?
# Were clients randomly and equally divided between the old and new designs?
# Were there any biases?
#
# 1 - both groups share the same data structure (same column names / value types)
# 2 - both groups are large and of similar size
#     (recorded run: test group 176699 rows / control group 140536 rows)
#
# The key columns ('gender', 'tenure_years', 'age', 'acct_balance', 'num_accts',
# 'calls_6_mnth', 'logons_6_mnth') are checked for a distribution bias between
# the two groups. The categorical column 'gender' is tested first, with a
# chi-square test of independence.
#
# H0: gender is equally distributed across the two groups
# H1: gender is not equally distributed across the two groups
#
# NOTE(review): df_control / df_test hold one row per web event, so a client
# appears many times. The chi-square test assumes independent observations;
# consider testing on one row per client_id instead - confirm with the team.

gender_counts_control = df_control['gender'].value_counts()
gender_counts_test = df_test['gender'].value_counts()
contingency_table = pd.DataFrame(
    {'Control': gender_counts_control, 'Test': gender_counts_test}
).fillna(0)  # a label missing from one group counts as zero
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Report the verdict that the p-value actually supports, rather than always
# claiming support for H1.
alpha = 0.05
print(f"Chi-square test statistic: {chi2}")
if p < alpha:
    print(f"P-value for gender distribution: {p} -> reject H0: gender is not equally distributed")
else:
    print(f"P-value for gender distribution: {p} -> fail to reject H0: no evidence of an unequal gender distribution")

# Cross-check with the percentage of each gender in both groups. In the
# recorded run the shares were nearly identical even though the test rejected H0:
#   Control: Unknown 34.48% / Male 33.75% / Female 31.77%
#   Test:    Unknown 33.51% / Male 33.56% / Female 32.93%
# The very large sample size most likely makes these small differences
# statistically detectable.
print(df_control['gender'].value_counts(normalize=True) * 100)
print(df_test['gender'].value_counts(normalize=True) * 100)


In [None]:
# Rows per gender label in each arm.
gender_counts_control = df_control['gender'].value_counts()
gender_counts_test = df_test['gender'].value_counts()

# Number of rows in each arm; the same denominators apply to every label.
n_control, n_test = len(df_control), len(df_test)
nobs = np.array([n_control, n_test])

# One two-proportion z-test per gender label: share of the label in control
# versus its share in test. A label absent from a group counts as zero.
for gender in ('M', 'F', 'Unknown'):
    counts = np.array([
        gender_counts_control.get(gender, 0),
        gender_counts_test.get(gender, 0),
    ])

    z_stat, p_value = proportions_ztest(counts, nobs)

    print(f"Gender: {gender}\nZ-statistic: {z_stat:.4f}, P-value: {p_value}\n")


We then decided to calculate **Cohen's h**, which is a common way to measure the **effect size** for comparing two proportions. It will help determine whether the differences in gender proportions are not only statistically significant but also practically significant.

### Formula for Cohen's h:

The formula for Cohen's h is:

$$
h = 2 \times \left( \arcsin \left( \sqrt{p_1} \right) - \arcsin \left( \sqrt{p_2} \right) \right)
$$

Where:
- $p_1$ is the proportion in the control group,
- $p_2$ is the proportion in the test group.

### Cohen's h Interpretation:
- **0.2**: Small effect size
- **0.5**: Medium effect size
- **0.8**: Large effect size (a difference that matters in practical terms)


In [None]:
# Share of each gender label within its arm.
proportions_control = gender_counts_control.div(n_control)
proportions_test = gender_counts_test.div(n_test)

# Effect size per label: the control share is passed first, the test share second.
for gender in ('M', 'F', 'Unknown'):
    h = vf.cohen_h(
        proportions_control.get(gender, 0),
        proportions_test.get(gender, 0),
    )
    print(f"Gender: {gender}, Cohen's h: {h:.4f}")


Interpretation of the results:
Positive Cohen's h: The proportion in the control group ($p_1$) is larger than in the test group ($p_2$).
Negative Cohen's h: The proportion in the control group is smaller than in the test group.

M (Male): Cohen's h = 0.0040

This is a very small positive value, indicating that the male proportion in the control group is slightly larger than in the test group, but the difference is tiny (almost negligible).

F (Female): Cohen's h = -0.0248

This small negative value means the female proportion in the control group is slightly smaller than in the test group. Again, the difference is very small.

Unknown: Cohen's h = 0.0206

This small positive value means the proportion of "Unknown" gender is slightly larger in the control group compared to the test group, but still very minor.


Practical Significance:

All values of Cohen's h are below 0.2, meaning these differences are very small and not practically significant.
In summary, even though some of the proportions are statistically significantly different (as indicated by the p-values), the effect size (Cohen's h) shows that the differences are practically very minor, and likely not meaningful in a real-world context.

In [None]:
# Balance check for the numeric client attributes: is any of them distributed
# differently between the control and the test group?
#
# H0: the column has the same mean in both groups
# H1: the column has a different mean in both groups
#
# NOTE(review): df_control / df_test hold one row per web event, so a client
# contributes many rows; consider testing on one row per client_id - confirm.


def _share_table(control_values, test_values, key):
    """Percentage share of every distinct value (or bin) in both groups, side by side.

    Parameters
    ----------
    control_values, test_values : pandas.Series
        The same attribute for the control and for the test group.
    key : str
        Name given to the column that holds the distinct values / bins.

    Returns
    -------
    pandas.DataFrame
        Columns ``key``, ``control_percentage`` and ``test_percentage``. A value
        that occurs in only one group gets 0 for the other group.
    """
    def _shares(values, share_name):
        # rename_axis pins the key column's name explicitly: value_counts names
        # its index differently in pandas 1.x and pandas 2.x.
        return (
            values.value_counts(normalize=True)
            .mul(100)
            .rename(share_name)
            .rename_axis(key)
            .reset_index()
        )

    table = pd.merge(
        _shares(control_values, "control_percentage"),
        _shares(test_values, "test_percentage"),
        on=key,
        how="outer",
    )
    share_columns = ["control_percentage", "test_percentage"]
    # Fill only the percentage columns; the key column may be categorical.
    table[share_columns] = table[share_columns].fillna(0)
    return table


# Welch t-test (unequal variances) for every numeric attribute.
# Recorded run: tenure_years gave p = 0.56 (fail to reject H0); the other five
# columns gave p < 0.01.
numeric_columns = ['tenure_years', 'age', 'acct_balance', 'num_accts', 'calls_6_mnth', 'logons_6_mnth']
for col in numeric_columns:
    t_stat, p_value = stats.ttest_ind(df_control[col], df_test[col], equal_var=False)
    print(f" column {col} t_stat:{t_stat}, p value:{p_value}")
print("------------------------")

# For the five columns with p < 0.01, check whether the difference is visible
# in practice by comparing each value's percentage share between the groups.
# Reference ranges from the recorded run:
#
#   column name        value range                    gap
#   'num_accts'        2-7
#   'calls_6_mnth'     0-6
#   'logons_6_mnth'    3-9
#   'age'              17-96                          79
#   'acct_balance'     23,789.61 - 8,292,996.21       8269207

# Discrete columns: compare the share of every distinct value.
columns = ['num_accts', 'calls_6_mnth', 'logons_6_mnth']
for col in columns:
    combine_test_control_df = _share_table(df_control[col], df_test[col], key=col)
    print(f" percentage comparison for {col}")
    print(combine_test_control_df)
    print("------------------------")

# Age: compare the share of each age band. include_lowest keeps the youngest
# clients (age 17) in the first band, and the open-ended last band keeps
# clients older than 79 instead of silently dropping them.
age_bins = [17, 29, 39, 49, 59, 69, 79, np.inf]
age_labels = ['17-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']
control_age_groups = pd.cut(df_control['age'], bins=age_bins, labels=age_labels, include_lowest=True)
test_age_groups = pd.cut(df_test['age'], bins=age_bins, labels=age_labels, include_lowest=True)

clnt_age_test_control_df = _share_table(control_age_groups, test_age_groups, key="age_group")
print(" percentage comparison for age")
print(clnt_age_test_control_df)
print("------------------------")

# Account balance: compare the share of each balance band.
bal_bins = [20000, 50000, 100000, 200000, 300000, 500000, 1000000, 9000000]
bal_labels = ['20000-50000', '50001-100000', '100001-200000', '200001-300000',
              '300001-500000', '500001-1000000', '1000001-9000000']
control_bal_groups = pd.cut(df_control['acct_balance'], bins=bal_bins, labels=bal_labels)
test_bal_groups = pd.cut(df_test['acct_balance'], bins=bal_bins, labels=bal_labels)

bal_test_control_df = _share_table(control_bal_groups, test_bal_groups, key="bal_group")
print(" percentage comparison for acct_balance")
print(bal_test_control_df)

# Conclusion from the recorded run: for every key column ('gender',
# 'tenure_years', 'age', 'acct_balance', 'num_accts', 'calls_6_mnth',
# 'logons_6_mnth') the share of each value differs by less than 2 percentage
# points between test and control, so none of them shows a meaningful bias.





To check for practical significance, we calculate Cohen's d (similar to Cohen's h but with continuous variables). The formula for Cohen's D is:

## Cohen's d Formula

Cohen's $d$ is calculated as:

$$
d = \frac{M_1 - M_2}{s}
$$

Where:
- $M_1$ = Mean of the first group
- $M_2$ = Mean of the second group
- $s$ = Pooled standard deviation, calculated as:

$$
s = \sqrt{\frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2}}
$$

Where:
- $n_1$ = Sample size of the first group
- $n_2$ = Sample size of the second group
- $s_1$ = Standard deviation of the first group
- $s_2$ = Standard deviation of the second group


In [None]:
def _numeric_only(series):
    """Coerce a column to numbers and drop whatever cannot be parsed (NaN)."""
    return pd.to_numeric(series, errors='coerce').dropna()


# Effect size (Cohen's d, control vs test) for every numeric client attribute.
columns = ['tenure_years', 'age', 'acct_balance', 'num_accts', 'calls_6_mnth', 'logons_6_mnth']

for col in columns:
    d_value = vf.cohen_d(_numeric_only(df_control[col]), _numeric_only(df_test[col]))
    print(f"Cohen's d for {col}: {d_value:.4f}")
 

From this output, you can see that while the differences were statistically significant from their p-values, the Cohen's d value for the columns is small, suggesting that the difference between the control and test groups is minimal in terms of effect size.

In [None]:
# How do clients navigate through the old versus the new digital process?
# Do they follow similar steps or diverge at certain points?
#
# NOTE(review): df_sorted_control / df_sorted_test (with the 'is_back' and
# 'back_label' columns) are not created anywhere in this notebook, so this cell
# raises NameError after Restart & Run All. Restore the cell that builds them - TODO.

# Total number of backward steps (errors) in each group.
total_errors_control = df_sorted_control['is_back'].sum()
total_errors_test = df_sorted_test['is_back'].sum()

# How often each backward transition occurs in each group.
back_counts_control = df_sorted_control['back_label'].value_counts()
back_counts_test = df_sorted_test['back_label'].value_counts()

# Align both groups on the transition label. A label seen in only one group
# gets a count of 0 instead of NaN, so its bar is drawn with height 0.
df_combined_bonus = (
    pd.concat(
        {'back_count_test': back_counts_test, 'back_count_control': back_counts_control},
        axis=1,
    )
    .fillna(0)
    .sort_index()
    .rename_axis('back_label')
)

# Share of each transition among the group's counted backward steps, in percent.
# Computed numerically so the shares add up to 100 without rounding them or
# passing them through '%' strings first.
for group in ('test', 'control'):
    group_counts = df_combined_bonus[f'back_count_{group}']
    df_combined_bonus[f'back_ratio_{group}'] = group_counts / group_counts.sum() * 100

df_combined_bonus = df_combined_bonus[
    ['back_count_test', 'back_ratio_test', 'back_count_control', 'back_ratio_control']
]

# Grouped bar chart: one (test, control) pair of bars per backward transition.
labels = df_combined_bonus.index.tolist()
x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(7, 3))

bar1 = ax.bar(x - width/2, df_combined_bonus['back_ratio_test'], width, label='Test', color='skyblue')
bar2 = ax.bar(x + width/2, df_combined_bonus['back_ratio_control'], width, label='Control', color='lightcoral')

ax.set_xlabel('Backward Steps')
ax.set_ylabel('Ratio of Total Errors (%)')
ax.set_title('Backward (Error) Ratio by Step for Test and Control Groups')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(title='Group')


def add_labels(bars):
    """Write each bar's height, as a percentage with one decimal, just above the bar."""
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.1f}%',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')


add_labels(bar1)
add_labels(bar2)

# Percentage ticks on a fixed 0-100 scale.
ax.yaxis.set_major_formatter(FuncFormatter(lambda tick, _: '{:.0f}%'.format(tick)))
plt.ylim(0, 100)

plt.savefig('step_error_ratio.png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
# As we can see here, most of the errors in the new design are made at step 1.

## CREATING A COMPREHENSIVE DATAFRAME WITH EVERYTHING FOR TABLEAU

In [None]:
# Stack the per-step timing frames of both arms (control first) under a fresh index.
timing_frames = (df_control_timesort, df_test_timesort)
combined_df_timesort = pd.concat(timing_frames, ignore_index=True)

In [None]:
# Store balances with two decimals in the export.
rounded_balance = combined_df_timesort['acct_balance'].round(decimals=2)
combined_df_timesort['acct_balance'] = rounded_balance

In [None]:
# Parse the timestamp column into real datetimes before exporting.
parsed_timestamps = pd.to_datetime(combined_df_timesort['date_time'])
combined_df_timesort['date_time'] = parsed_timestamps

In [None]:
# Write the combined frame as a semicolon-separated file, without the index column.
combined_df_timesort.to_csv(
    path_or_buf='combined_df_timesort.csv',
    index=False,
    sep=';',
)