In [1]:
import pandas as pd
# Raise the per-column display width limit to 200 characters so long cell values are not truncated.
pd.options.display.max_colwidth = 200

import seaborn as sns
# Apply seaborn's theme and make every figure 15x5 inches by default.
sns.set(rc={'figure.figsize':(15,5)})

import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the per-merchant aggregate features, keeping only the merchant key and
# the three feature columns used in this notebook.
merchant_agg = pd.read_csv("../data/curated/clusters/input/agg_transaction_train.csv").loc[
    :,
    [
        'merchant_abn',
        'total_number_of_distinct_customers',
        'monthly_average_number_of_orders',
        'monthly_average_bnpl_revenue',
    ],
]
# Load the cluster assignment file (one `label` per `merchant_abn`).
merchant_clusters = pd.read_csv("../data/curated/clusters/output/merchant_clusters.csv")

# Preview the first three rows of each table.
display(merchant_agg.head(3), merchant_clusters.head(3))

Unnamed: 0,merchant_abn,total_number_of_distinct_customers,monthly_average_number_of_orders,monthly_average_bnpl_revenue
0,10023283211,2619,154.333333,33301.586223
1,10342410215,729,40.888889,15624.420327
2,10346855916,7,1.4,2186.800497


Unnamed: 0,merchant_abn,label
0,10023283211,1
1,10342410215,2
2,10346855916,0


**Merge datasets**

In [3]:
# Join cluster labels to the aggregate features on the merchant key. The key is
# only needed for the join, so it is dropped from the combined table.
merged_feature_cluster = pd.merge(
    merchant_clusters,
    merchant_agg,
    how='inner',
    on='merchant_abn',
).drop(columns='merchant_abn')
display(merged_feature_cluster.head(3))

Unnamed: 0,label,total_number_of_distinct_customers,monthly_average_number_of_orders,monthly_average_bnpl_revenue
0,1,2619,154.333333,33301.586223
1,2,729,40.888889,15624.420327
2,0,7,1.4,2186.800497


**Perform log transformation**

In [4]:
# Take the natural log of every column after the first one (the cluster label),
# prefix the resulting columns with `log_`, and re-attach the label as a
# categorical `cluster` column.
agg_byCluster = (
    np.log(merged_feature_cluster.iloc[:, 1:])
    .add_prefix('log_')
    .assign(cluster=pd.Categorical(merged_feature_cluster.label))
)

# Preview the transformed table, then the per-cluster summary statistics
# (transposed so that clusters are columns).
display(agg_byCluster.head(3), agg_byCluster.groupby('cluster').describe().T)

Unnamed: 0,log_total_number_of_distinct_customers,log_monthly_average_number_of_orders,log_monthly_average_bnpl_revenue,cluster
0,7.870548,5.039115,10.41336,1
1,6.591674,3.710858,9.65659,2
2,1.94591,0.336472,7.690195,0


Unnamed: 0,cluster,0,1,2
log_total_number_of_distinct_customers,count,924.0,824.0,2251.0
log_total_number_of_distinct_customers,mean,3.0413,8.656969,5.944994
log_total_number_of_distinct_customers,std,1.094127,0.634698,1.005133
log_total_number_of_distinct_customers,min,0.0,7.590347,3.637586
log_total_number_of_distinct_customers,25%,2.484907,8.101071,5.144579
log_total_number_of_distinct_customers,50%,3.295837,8.550047,5.940171
log_total_number_of_distinct_customers,75%,3.828641,9.131622,6.792344
log_total_number_of_distinct_customers,max,5.087596,10.089137,7.922624
log_monthly_average_number_of_orders,count,924.0,824.0,2251.0
log_monthly_average_number_of_orders,mean,0.713308,5.981607,3.067234


## Plot the Marginal Distances

In [5]:
def marginal_error(x_variable, agg_byCluster=agg_byCluster):
    '''Plot one feature per cluster and mark the midpoints between neighbouring clusters.

    Draws, on the current matplotlib axes:

    * a scatter plot of ``x_variable`` against the cluster label,
    * two dotted vertical lines per cluster at its lower and upper bound, where
      the bounds are the 1.5*IQR fences clipped to the cluster's observed
      min/max,
    * one red vertical line halfway between the upper bound of a cluster and the
      lower bound of the next cluster (clusters ordered by lower bound), labelled
      in the legend with the cluster pair and the midpoint rounded to 2 decimals.

    Works for any number of clusters and any cluster labels; clusters that have
    no rows are skipped.

    Parameters
    ----------
    x_variable : str
        Name of the column of ``agg_byCluster`` plotted on the x-axis.
    agg_byCluster : pandas.DataFrame
        Frame holding ``x_variable`` and a ``cluster`` column. The default is the
        notebook-level ``agg_byCluster`` as it exists when this cell is executed.

    Returns
    -------
    None
    '''
    # Per-cluster summary statistics: rows are statistics, columns are clusters.
    # `observed=False` is stated explicitly so every declared category appears
    # as a column, which keeps the colour mapping below complete for all hue levels.
    description = agg_byCluster.groupby('cluster', observed=False)[x_variable].describe().T

    # One colour per cluster, shared by the scatter points and the boundary
    # lines so that both are drawn in exactly the same colour.
    cluster_labels = description.columns.to_list()
    colour_of = dict(zip(cluster_labels, sns.color_palette('deep', len(cluster_labels))))

    ax = sns.scatterplot(data=agg_byCluster, hue='cluster', style='cluster',
                         x=x_variable, y=agg_byCluster['cluster'], palette=colour_of)

    # Cluster bounds: 1.5*IQR fences, clipped so they never extend beyond the
    # observed min/max of the cluster.
    q1, q3 = description.loc["25%"], description.loc["75%"]
    whisker = 1.5 * (q3 - q1)
    bounds = pd.DataFrame({
        "lower": np.maximum(q1 - whisker, description.loc["min"]),
        "upper": np.minimum(q3 + whisker, description.loc["max"]),
    }).dropna()  # clusters without data have NaN bounds and cannot be drawn

    # Visualise the bounds of every cluster.
    for label, lower, upper in bounds.itertuples():
        ax.axvline(x=lower, color=colour_of[label], linestyle=':')
        ax.axvline(x=upper, color=colour_of[label], linestyle=':')

    # Midpoint between each pair of neighbouring clusters (ordered by lower bound).
    ordered = bounds.sort_values("lower")
    cluster_sequence = ordered.index.to_list()
    for left, right in zip(cluster_sequence, cluster_sequence[1:]):
        x_value = (ordered.loc[left, "upper"] + ordered.loc[right, "lower"]) / 2
        ax.axvline(x=x_value, color='r', label=f'MoE_{left,right} ≈ {round(x_value,2)}')

    ax.legend()
    # To save instead of showing inline:
    # plt.savefig(f'../plots/{x_variable}.jpg')
    # plt.clf()