In [1]:
import pandas as pd
from functools import reduce

In [2]:
# load the merchant details and the three next-year prediction tables
merchant_info = pd.read_csv('../data/curated/merchant.csv')
num_consumer = pd.read_csv('../data/curated/pred_total_num_consumer.csv')
num_transaction = pd.read_csv('../data/curated/pred_total_num_transaction.csv')
revenue = pd.read_csv('../data/curated/pred_total_revenue.csv')


def _outer_join_on_abn(left, right):
    """Outer-join two frames on their shared 'merchant_abn' column."""
    return pd.merge(left, right, on=['merchant_abn'], how='outer')


# fold the four frames into one, left to right, keyed on merchant abn;
# the outer join keeps merchants that are missing from any of the tables
data_frames = [merchant_info, num_consumer, num_transaction, revenue]
df_merged = reduce(_outer_join_on_abn, data_frames)
df_merged

Unnamed: 0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue
0,10023283211,Felis Limited,furniture,e,0.18,2131.143386,2215.463738,5.135353e+05
1,10142254217,Arcu Ac Orci Corporation,cable,b,4.22,1904.682106,1983.130850,3.639309e+05
2,10165489824,Nunc Sed Company,jewelry,b,4.40,,,
3,10187291046,Ultricies Dignissim Lacus Foundation,watch,b,3.29,232.439735,230.906400,7.583703e+04
4,10192359162,Enim Condimentum PC,music,a,6.33,243.700159,240.472985,5.817489e+05
...,...,...,...,...,...,...,...,...
4021,99938978285,Elit Dictum Eu Ltd,opticians,b,4.50,9310.360959,11822.405141,1.562756e+06
4022,99974311662,Mollis LLP,books,b,3.17,97.297453,101.509131,6.501456e+04
4023,99976658299,Sociosqu Corp.,shoe,a,6.57,11453.366640,15851.457335,1.665152e+07
4024,99987905597,Commodo Hendrerit LLC,motor,a,6.82,101.449943,105.900595,1.552476e+05


In [3]:
# impute missing values with zero, then floor negative values at zero
df_merged = df_merged.fillna(0)

# Clip through the public API and assign the result back. Mutating the frame
# returned by the private `_get_numeric_data()` is not guaranteed to write
# through to `df_merged` (it does not under pandas Copy-on-Write), which would
# silently leave negative values in place.
num = df_merged.select_dtypes(include='number').clip(lower=0)
df_merged[num.columns] = num
df_merged

Unnamed: 0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue
0,10023283211,Felis Limited,furniture,e,0.18,2131.143386,2215.463738,5.135353e+05
1,10142254217,Arcu Ac Orci Corporation,cable,b,4.22,1904.682106,1983.130850,3.639309e+05
2,10165489824,Nunc Sed Company,jewelry,b,4.40,0.000000,0.000000,0.000000e+00
3,10187291046,Ultricies Dignissim Lacus Foundation,watch,b,3.29,232.439735,230.906400,7.583703e+04
4,10192359162,Enim Condimentum PC,music,a,6.33,243.700159,240.472985,5.817489e+05
...,...,...,...,...,...,...,...,...
4021,99938978285,Elit Dictum Eu Ltd,opticians,b,4.50,9310.360959,11822.405141,1.562756e+06
4022,99974311662,Mollis LLP,books,b,3.17,97.297453,101.509131,6.501456e+04
4023,99976658299,Sociosqu Corp.,shoe,a,6.57,11453.366640,15851.457335,1.665152e+07
4024,99987905597,Commodo Hendrerit LLC,motor,a,6.82,101.449943,105.900595,1.552476e+05


### Ranking System
The ranking system uses the predicted "total number of consumers", "total number of transactions" and "total revenue" for the next year to give each merchant a score within 0-100. The ranking score is calculated as follows:

1. Standardise each attribute using min-max normalization 
2. Total number of consumers * 30%
3. BNPL revenue * 40%
4. Total number of transactions * 30%

Revenue carries the largest weight because it is considered the most important measure for the BNPL company.


In [4]:
# min-max normalization of each predicted attribute onto a 0-100 scale
features = ['pred_total_num_consumer', 'pred_total_num_transaction', 'pred_total_revenue']
for col in features:
    col_min = df_merged[col].min()
    col_range = df_merged[col].max() - col_min
    if col_range == 0:
        # constant column: every merchant ties on this attribute, and dividing
        # by the zero range would turn the whole column (and the score built
        # from it) into NaN
        df_merged[f'scaled_{col}'] = 0.0
    else:
        df_merged[f'scaled_{col}'] = 100 * (df_merged[col] - col_min) / col_range

df_merged.head()

Unnamed: 0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue,scaled_pred_total_num_consumer,scaled_pred_total_num_transaction,scaled_pred_total_revenue
0,10023283211,Felis Limited,furniture,e,0.18,2131.143386,2215.463738,513535.330544,9.486013,1.111959,1.195689
1,10142254217,Arcu Ac Orci Corporation,cable,b,4.22,1904.682106,1983.13085,363930.943809,8.478003,0.995349,0.847358
2,10165489824,Nunc Sed Company,jewelry,b,4.4,0.0,0.0,0.0,0.0,0.0,0.0
3,10187291046,Ultricies Dignissim Lacus Foundation,watch,b,3.29,232.439735,230.9064,75837.027719,1.034621,0.115894,0.176575
4,10192359162,Enim Condimentum PC,music,a,6.33,243.700159,240.472985,581748.930901,1.084743,0.120695,1.354514


In [5]:
# ranking score: weighted sum of the scaled attributes
# (30% consumers, 30% transactions, 40% revenue)
consumer_part = 0.3 * df_merged['scaled_pred_total_num_consumer']
transaction_part = 0.3 * df_merged['scaled_pred_total_num_transaction']
revenue_part = 0.4 * df_merged['scaled_pred_total_revenue']
df_merged['score'] = consumer_part + transaction_part + revenue_part

# highest score gets rank 1; index the frame by rank and order it best-first
df_merged['rank'] = df_merged['score'].rank(ascending=False)
ranked = df_merged.set_index('rank')
df_merged = ranked.sort_index()
df_merged.head()

Unnamed: 0_level_0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue,scaled_pred_total_num_consumer,scaled_pred_total_num_transaction,scaled_pred_total_revenue,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.0,86578477987,Leo In Consulting,watch,a,6.43,18424.119848,187019.058114,42948920.0,82.008301,93.866355,100.0,92.762397
2.0,45629217853,Lacus Consulting,gift,a,6.98,20247.863835,152088.705358,41344550.0,90.126037,76.334533,96.264467,88.443957
3.0,89726005175,Est Nunc Consulting,tent,a,6.01,20487.094245,148292.570591,37103500.0,91.190884,74.429222,86.389832,84.241965
4.0,49891706470,Non Vestibulum Industries,tent,a,5.8,19617.385531,169851.702773,29164610.0,87.319691,85.249922,67.905351,78.933024
5.0,21439773999,Mauris Non Institute,cable,a,6.1,22427.261712,81994.383461,36715890.0,99.826838,41.153634,85.487345,76.48908


### Split Merchants into 4 Segments
Based on [Merchant Category Groups by ANZ](https://www.anz.com/Documents/Business/CommercialCard/Merchant_cateogry_codes_control.pdf), we divide all merchants into 4 categories.

1. Health service: health, opticians

2. Recreational good retailing: bicycle, books, stationery, hobby, tent, digital goods

3. Personal & household good retail: antique, watch, jewelry, music, artist supply, gift, art dealer, florists, furniture, shoe, garden supply

4. Technical & machinery service: cable, telecom, computer, equipment, motor

In [6]:
# split merchants into 4 segments: list the tags that belong to each segment,
# then invert the grouping into a tag -> segment lookup
tags_by_segment = {
    'health service': ['opticians', 'health'],
    'recreational good retailing': [
        'books', 'stationery', 'tent', 'bicycle', 'digital goods', 'hobby',
    ],
    'personal & household good retail': [
        'furniture', 'watch', 'music', 'gift', 'artist supply', 'florists',
        'jewelry', 'art dealer', 'shoe', 'antique', 'garden supply',
    ],
    'technical & machinery service': [
        'cable', 'computer', 'equipment', 'motor', 'telecom',
    ],
}
segment = {
    tag: segment_name
    for segment_name, tags in tags_by_segment.items()
    for tag in tags
}

# tags that are missing from the lookup come out of `map` as NaN
df_merged['segment'] = df_merged['tags'].map(segment)

In [7]:
# top 100 merchants overall: the frame is indexed by rank, so keep every
# row whose rank is at most 100
is_top100 = df_merged.index <= 100
top100 = df_merged[is_top100]
top100.to_csv('../data/curated/top100.csv')

In [8]:
# top 10 merchants per segment, starting with personal & household goods:
# take the first 10 rows of the segment in the frame's current order
is_personal = df_merged['segment'] == 'personal & household good retail'
personal_top10 = df_merged[is_personal].head(10)
personal_top10.to_csv('../data/curated/personal_top10.csv')

In [9]:
# first 10 rows of the technical & machinery service segment
is_technical = df_merged['segment'] == 'technical & machinery service'
technical_top10 = df_merged[is_technical].head(10)
technical_top10.to_csv('../data/curated/technical_top10.csv')

In [10]:
# first 10 rows of the recreational good retailing segment
is_recreational = df_merged['segment'] == 'recreational good retailing'
recreational_top10 = df_merged[is_recreational].head(10)
recreational_top10.to_csv('../data/curated/recreational_top10.csv')

In [11]:
# first 10 rows of the health service segment
is_health = df_merged['segment'] == 'health service'
health_top10 = df_merged[is_health].head(10)
health_top10.to_csv('../data/curated/health_top10.csv')