In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest


In [None]:
# Columns required by the analysis; every other column in the CSV is dropped.
cols = [
    'client_id', 'visit_id', 'process_step', 'date_time', 'variation_cleaned',
    'client_tenure_year', 'client_tenure_month', 'client_age', 'gender_cleaned',
    'number_of_accounts', 'calls_6_month', 'logos_6_month',
]

# Shorter column names used throughout the rest of the notebook.
column_aliases = {
    'process_step': 'step',
    'variation_cleaned': 'variation',
    'client_tenure_year': 'tenureYear',
    'client_tenure_month': 'tenureMonth',
    'gender_cleaned': 'gender',
    'client_age': 'age',
}

df_original = (
    pd.read_csv('final.csv')
    .loc[:, cols]
    .rename(columns=column_aliases)
)

# Parse timestamps once so later cells can sort and subtract them.
df_original['date_time'] = pd.to_datetime(df_original['date_time'])

df_original.shape

##### create id 

In [None]:
# Composite key "<client_id>__<visit_id>": one value per (client, visit) pair.
client_part = df_original['client_id'].astype('str')
visit_part = df_original['visit_id'].astype('str')
df_original['id'] = client_part + '__' + visit_part

##### age distribution + create age group

In [None]:
# One row per client, so clients with many logged events are not over-counted.
unique_clients = df_original.drop_duplicates(subset="client_id")

# Only strictly positive ages are plotted. `palette` was removed because
# seaborn ignores it when no `hue` variable is assigned.
sns.histplot(unique_clients[unique_clients['age'] > 0]['age'], bins=20)
plt.show()

# Right-closed bins: (0,25], (25,35], (35,50], (50,65], (65,80], (80,inf).
# The third and fourth labels previously read "36-45" / "46-65", which did not
# match the 35/50/65 edges actually used. The labels now describe the real bins;
# the bin edges (and therefore all downstream counts) are unchanged.
age_bins = [0, 25, 35, 50, 65, 80, np.inf]
age_labels = ["<25", "25-35", "36-50", "51-65", "66-80", "80+"]

# An age of -1 is treated as missing; anything that falls outside the bins
# ends up in the explicit 'unknown' category.
df_original['age_group'] = (
    pd.cut(
        df_original['age'].replace(-1, np.nan),
        bins=age_bins,
        labels=age_labels,
        right=True,
    )
    .cat.add_categories('unknown')
    .fillna('unknown')
)




##### create tenure year group

In [None]:
# One row per client, so clients with many logged events are not over-counted.
unique_clients = df_original.drop_duplicates(subset="client_id")

# Only strictly positive tenure values are plotted.
sns.histplot(unique_clients[unique_clients['tenureYear'] > 0]['tenureYear'], bins=20)
plt.show()

# Left-closed bins (right=False): [0,5), [5,10), [10,15), [15,20), [20,inf).
# The third label previously read "Mid (5-15y)", overlapping the second bin's
# label; it now states the [10,15) range the bin really covers.
# A tenure of -1 is treated as missing and lands in the "Unknown" category.
df_original["tenureYear_group"] = pd.cut(
    df_original["tenureYear"].replace(-1, np.nan),
    bins=[0, 5, 10, 15, 20, float("inf")],
    labels=["New (<5y)", "Mid-New(5-10y)", "Mid (10-15y)", "Mid-Long(15-20y)", "Long (>20y)"],
    right=False
).cat.add_categories("Unknown").fillna("Unknown")



In [None]:
# Echo (rows, columns) of the working frame as a quick sanity check.
df_original.shape

#### EDA ----- Are the primary clients new or long-standing? 

In [None]:
# Violin plot of tenure (years) per age group, one observation per client.
unique_clients = df_original.drop_duplicates(subset="client_id")

# Negative tenure values are excluded from the plot.
has_valid_tenure = unique_clients['tenureYear'] >= 0
sns.violinplot(
    data=unique_clients[has_valid_tenure],
    x='age_group',
    y='tenureYear',
)
sns.despine()
plt.show()

In [None]:
# A notebook cell only echoes its last expression, so the dtype on the first
# line was never shown. display() renders both the dtype and the summary.
display(
    unique_clients['tenureYear'].dtype,
    unique_clients['tenureYear'].describe(),
)

#### EDA --- age distribution in test and control

1. The two curves have very similar shapes, so the graph suggests that the Test and Control groups are balanced in age distribution.


In [None]:

# Overlay the per-client age histograms of the two experiment arms.
# Clients whose variation is 'unknown' are left out.
clients_with_arm = unique_clients[unique_clients['variation'] != 'unknown']

age_hist_options = dict(
    bins=10,             # number of histogram bins
    stat='count',        # raw counts ('density' would normalise)
    element='step',      # draw outlines instead of bars
    fill=False,          # no shading under the outline
    alpha=0.4,
    multiple='layer',    # arms are drawn on top of each other
    common_norm=False,   # each arm is scaled on its own
    palette={'Control': 'black', 'Test': 'red'},
)

sns.histplot(data=clients_with_arm, x='age', hue='variation', **age_hist_options)
plt.show()


#### EDA - tenure year distribution in test and control
1. The shapes of the Test and Control tenure-year distributions are nearly identical.

In [None]:
# Overlay the per-client tenure-year histograms of the two experiment arms.
# Clients whose variation is 'unknown' are left out.
clients_with_arm = unique_clients[unique_clients['variation'] != 'unknown']

tenure_hist_options = dict(
    bins=10,             # number of histogram bins
    stat='count',        # raw counts ('density' would normalise)
    element='step',      # draw outlines instead of bars
    fill=False,          # no shading under the outline
    alpha=0.4,
    multiple='layer',    # arms are drawn on top of each other
    common_norm=False,   # each arm is scaled on its own
    palette={'Control': 'black', 'Test': 'red'},
)

sns.histplot(data=clients_with_arm, x='tenureYear', hue='variation', **tenure_hist_options)
plt.show()

#### EDA - gender distribution in test and control
1. The graph shows that the gender distribution is well balanced between the Test and Control groups.

In [None]:

# Count clients per (variation, gender), ignoring rows where either is 'unknown'.
gender_known = unique_clients['gender'] != 'unknown'
variation_known = unique_clients['variation'] != 'unknown'
pivot = unique_clients[gender_known & variation_known].pivot_table(
    index='variation',
    columns='gender',
    values='client_id',
    aggfunc='count',
)

# Stacked bars: one bar per experiment arm, one segment per gender.
pivot.plot(
    kind='bar',
    stacked=True,
    figsize=(6, 6),
    color=["darkred", "darkblue"],
    width=0.5,
)
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1))
sns.despine()
plt.xticks(rotation=0)
plt.show()

#### KPI - completion rate

In [None]:
def confirmation_rate(df):
    """Share of distinct visits that reached the 'confirm' step.

    Parameters
    ----------
    df : DataFrame
        Event log with at least an 'id' column (visit key) and a 'step' column.

    Returns
    -------
    tuple
        ``(confirmed, total, rate)`` where ``confirmed`` is the number of
        distinct ids with at least one 'confirm' row, ``total`` is the number
        of distinct ids, and ``rate`` is ``confirmed / total``. When ``df``
        holds no ids the rate is NaN instead of raising ZeroDivisionError.
    """
    total_visits = df['id'].nunique()
    confirmed_visits = df.loc[df['step'] == 'confirm', 'id'].nunique()

    # Guard the empty case (e.g. a filter that matched no rows).
    if total_visits == 0:
        return confirmed_visits, total_visits, float('nan')

    return confirmed_visits, total_visits, confirmed_visits / total_visits

# Split the event log by experiment arm and report each arm's confirmation rate.
df_original = df_original.copy()

in_test_arm = df_original['variation'] == 'Test'
in_control_arm = df_original['variation'] == 'Control'
df_test = df_original[in_test_arm]
df_control = df_original[in_control_arm]

# confirmation_rate returns (numerator, denominator, rate); only the rate is kept.
c_rate_control = confirmation_rate(df_control)[-1]
print(f"confirmation rate for control group is {c_rate_control}")

c_rate_test = confirmation_rate(df_test)[-1]
print(f"confirmation rate for test group is {c_rate_test}")

#### KPI - Time Spent on Each Step: The average duration users spend on each step.

In [None]:
def calculate_duration(df):
    """Return a sorted copy of ``df`` with 'step_number' and 'duration' columns.

    The input is not modified. Rows are ordered by ('id', 'date_time').
    'step_number' maps the step name to its position in the funnel (1-5).
    'duration' is the number of seconds between a row and the next row of the
    same id; the last row of each id has no successor and gets NaN.
    """
    step_order = {'start': 1, 'step_1': 2, 'step_2': 3, 'step_3': 4, 'confirm': 5}

    ordered = df.copy()
    ordered['step_number'] = ordered['step'].map(step_order)
    ordered = ordered.sort_values(by=['id', 'date_time'], ascending=True)

    # Timestamp of the following event within the same visit.
    next_event_time = ordered.groupby('id')['date_time'].shift(-1)
    ordered['duration'] = (next_event_time - ordered['date_time']).dt.total_seconds()
    return ordered

# Replace the working frame with the copy returned by calculate_duration,
# which is sorted by (id, date_time) and carries step_number and duration.
df_original = calculate_duration(df_original)
# Echo (rows, columns) of the updated frame.
df_original.shape


In [None]:
# Count rows with a negative duration. Rows are sorted by date_time within
# each id before durations are computed, so any non-zero count is unexpected.
(df_original['duration']<0).sum()


In [None]:
# Check for outliers in duration using Tukey's 1.5 * IQR fences.
print(df_original['duration'].describe())

q1_duration = df_original['duration'].quantile(0.25)
q3_duration = df_original['duration'].quantile(0.75)
IQR_duration = q3_duration - q1_duration
lower = q1_duration - 1.5*IQR_duration
high = q3_duration + 1.5*IQR_duration  # the original note records this as 185 (seconds)

# Drop durations above the upper fence so they do not skew the averages.
# The computed fence is used instead of a hard-coded 185, so the filter stays
# correct if the input data changes. Rows with a missing duration are dropped
# as well, because NaN <= high evaluates to False.
# .copy() gives an independent frame, which the earlier (overwritten)
# `df = df_original.copy()` line appears to have intended.
df = df_original[df_original['duration'] <= high].copy()


In [None]:
def _step_duration_stats(frame, variation):
    """Mean and median, per step, of each visit's total time spent on that step."""
    arm_rows = frame[frame['variation'] == variation]
    # Total seconds a visit spent on a step (a step can be visited more than once).
    per_visit_step = arm_rows.groupby(['id', 'step'])['duration'].sum().reset_index()
    return per_visit_step.groupby(['step'])['duration'].agg(['mean', 'median'])


# calculation for test group
test = _step_duration_stats(df, 'Test')

# calculation for control group
control = _step_duration_stats(df, 'Control')

# Side-by-side table: Test_* columns followed by Control_* columns.
result = pd.concat(
    [test.add_prefix('Test_'), control.add_prefix('Control_')],
    axis=1,
)

result

#### KPI - Error Rates: If there's a step where users go back to a previous step, it may indicate confusion or an error. You should consider moving from a later step to an earlier one as an error.

In [None]:
# Preview the first five rows of the working frame before deriving error flags.
df_original.head(5)

In [None]:
# Change in step_number relative to the previous row of the same visit.
# The first row of each visit has no predecessor and gets NaN.
step_delta = df_original.groupby('id')['step_number'].diff()
df_original['step_check'] = step_delta

# A negative change means the user moved back to an earlier step: flag as 1.
# NaN compares as False and is therefore flagged 0.
df_original['error'] = step_delta.lt(0).astype('int64')

df_original

In [None]:
def _visit_error_summary(frame, variation):
    """Return (errors per visit, share of visits with at least one error) for one arm."""
    arm_rows = frame[frame['variation'] == variation]
    errors_per_visit = arm_rows.groupby('id')['error'].sum().reset_index(name='sum')
    visits_with_error = errors_per_visit[errors_per_visit['sum'] >= 1]
    return errors_per_visit, len(visits_with_error) / len(errors_per_visit)


totalError_test, test_errorRate = _visit_error_summary(df_original, 'Test')
totalError_control, control_errorRate = _visit_error_summary(df_original, 'Control')

print(f"Error rate in test group is {test_errorRate}")
print(f"Error rate in Control group is {control_errorRate}")

#### Hypothesis Testing - age & test/control

In [None]:
# step1: Hypothesis
#   H0: age distribution is the same across test/control (independent)
#   H1: age distribution is not the same across test/control (dependent)

# step2: significance level = 5%, i.e. accept a 5% probability of rejecting a true null
alpha = 0.05

# step3: perform chi-square test of independence on the variation x age_group counts
data = unique_clients[~(unique_clients['variation']=='unknown')]
cross_age_variation = pd.crosstab(index=data['variation'],columns=data['age_group'],normalize=False)
display(cross_age_variation)

res_age = chi2_contingency(cross_age_variation)

print(f"chi-square statistics is {res_age.statistic:.2f}")

# step4: the conclusion is derived from the p-value instead of being hard-coded,
# so the printed text cannot contradict the computed result.
if res_age.pvalue < alpha:
    print(f"chi-square P-value is {res_age.pvalue:.2f}, reject null, so age distribution is not the same across test/control.")
    print(f"\nImportant TakeAway:\ntest and control are NOT balanced in client age,so age may bias later A/B testing results.")
else:
    print(f"chi-square P-value is {res_age.pvalue:.2f}, fail to reject null, so age distribution is the same across test/control.")
    print(f"\nImportant TakeAway:\ntest and control are balanced in client age,so age is unlikely to bias later A/B testing results.")

#### Hypothesis Testing - gender & test/control

In [None]:
# step1: Hypothesis
#   H0: gender distribution is the same across test/control (independent)
#   H1: gender distribution is not the same across test/control (dependent)

# step2: significance level = 5%, i.e. accept a 5% probability of rejecting a true null
alpha = 0.05

# step3: perform chi-square test of independence on the variation x gender counts
# (rows where gender or variation is 'unknown' are excluded)
pivot = unique_clients[~((unique_clients['gender']=='unknown')|(unique_clients['variation']=='unknown'))].pivot_table(index='variation',columns='gender',aggfunc='count',values='client_id')
display(pivot)
res_gender = chi2_contingency(pivot)

print(f"chi-square statistics is {res_gender.statistic:.2f}")

# step4: the conclusion is derived from the p-value instead of being hard-coded,
# so the printed text cannot contradict the computed result.
if res_gender.pvalue < alpha:
    print(f"chi-square P-value is {res_gender.pvalue:.2f},reject null,so gender distribution is not the same across test and control.")
    print(f"\nImportant TakeAway: \ntest and control are NOT balanced in client gender,so gender may bias later A/B testing results.")
else:
    print(f"chi-square P-value is {res_gender.pvalue:.2f},fail to reject null,so gender distribution is the same across test and control.")
    print(f"\nImportant TakeAway: \ntest and control are balanced in client gender,so gender is unlikely to bias later A/B testing results.")


#### Hypothesis Testing - tenure year & test/control

In [None]:

# step1: Hypothesis
#   H0: tenure year distribution is the same across test/control (independent)
#   H1: tenure year distribution is not the same across test/control (dependent)

# step2: significance level = 5%, i.e. accept a 5% probability of rejecting a true null
alpha = 0.05

# step3: perform chi-square test of independence on the variation x tenureYear_group counts
data = unique_clients[~(unique_clients['variation']=='unknown')]

cross_tenureYear_variation = pd.crosstab(index=data['variation'],columns=data['tenureYear_group'],normalize=False)
display(cross_tenureYear_variation)
res_tenureyear = chi2_contingency(cross_tenureYear_variation)

print(f"chi-square statistics is {res_tenureyear.statistic:.2f}")

# step4: the conclusion is derived from the p-value instead of being hard-coded,
# so the printed text cannot contradict the computed result.
if res_tenureyear.pvalue < alpha:
    print(f"chi-square P-value is {res_tenureyear.pvalue:.2f},reject null,so tenure year distribution is not the same across test and control.")
    print(f"\nImportant TakeAway: \ntest and control are NOT balanced in client tenure year,so tenure year may bias later A/B testing results.")
else:
    print(f"chi-square P-value is {res_tenureyear.pvalue:.2f},fail to reject null,so tenure year distribution is the same across test and control.")
    print(f"\nImportant TakeAway: \ntest and control are balanced in client tenure year,so tenure year is unlikely to bias later A/B testing results.")


#### Hypothesis Testing - completion rate

In [None]:
# One-sided two-sample z-test for proportions.
#   H0: p_test <= p_control
#   H1: p_test >  p_control
#
# `alternative` refers to the FIRST sample relative to the second:
#   'smaller' -> H1: p_test < p_control
#   'larger'  -> H1: p_test > p_control   (used here)
# Test is listed first in both arrays, so 'larger' tests whether the test
# group's confirmation rate (21791/37204) exceeds the control's (16088/32243).

alpha = 0.05

# NOTE(review): these counts are typed in by hand; confirm they match the
# numerators/denominators returned by confirmation_rate for each arm.
count = np.array([21791, 16088])  # [test confirm, control confirm]
nobs = np.array([37204, 32243])  # [test total, control total]

z_stat, pvalue = proportions_ztest(count, nobs,alternative='larger')

display(z_stat,pvalue)

# The takeaway follows the computed p-value instead of being hard-coded.
if pvalue < alpha:
    print(f"takeaway: p_test > p_control is significant")
else:
    print(f"takeaway: p_test > p_control is not significant")

#### Hypothesis Testing - completion rate with cost consideration

In [None]:

# One-sided two-sample z-test against a 5-percentage-point margin.
#   H0: p_test - p_control <= 5%
#   H1: p_test - p_control >  5%
#
# `value=0.05` is the difference assumed under the null; `alternative`:
#   'smaller' -> H1: p_test - p_control < 5%
#   'larger'  -> H1: p_test - p_control > 5%   (used here)
#
# NOTE(review): proportions_ztest appears to use a pooled variance estimate
# even when `value` is non-zero; confirm against the statsmodels docs whether
# a test built for a non-zero null difference would be more appropriate.

alpha = 0.05

# NOTE(review): these counts are typed in by hand; confirm they match the
# numerators/denominators returned by confirmation_rate for each arm.
count = np.array([21791, 16088])  # [test confirm, control confirm]
nobs = np.array([37204, 32243])  # [test total, control total]

z_stat, pvalue = proportions_ztest(count, nobs,value= 0.05, alternative='larger')

display(z_stat,pvalue)

# The takeaway follows the computed p-value instead of being hard-coded.
if pvalue < alpha:
    print(f"takeaway:  p_test​ - p_control > 5% is significant, so the improvement rate is higher than 5%")
else:
    print(f"takeaway:  p_test - p_control > 5% is not significant, so an improvement above 5% is not supported")