In [1]:
import pandas as pd
from functools import reduce

In [2]:
# load the merchant lookup table and the three prediction tables
merchant_info = pd.read_csv('../data/curated/merchant.csv')
num_consumer = pd.read_csv('../data/curated/pred_total_num_consumer.csv')
num_transaction = pd.read_csv('../data/curated/pred_total_num_transaction.csv')
revenue = pd.read_csv('../data/curated/pred_total_revenue.csv')

# outer-join the four data frames on merchant abn, one after another,
# starting from the merchant info
data_frames = [merchant_info, num_consumer, num_transaction, revenue]
df_merged = data_frames[0]
for right in data_frames[1:]:
    df_merged = pd.merge(df_merged, right, on=['merchant_abn'], how='outer')
df_merged

Unnamed: 0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue
0,10023283211,Felis Limited,furniture,e,0.18,2131.244593,2215.621019,4.383170e+05
1,10142254217,Arcu Ac Orci Corporation,cable,b,4.22,1904.912599,1983.366936,3.296245e+05
2,10165489824,Nunc Sed Company,jewelry,b,4.40,,,
3,10187291046,Ultricies Dignissim Lacus Foundation,watch,b,3.29,232.648056,231.167248,9.578310e+04
4,10192359162,Enim Condimentum PC,music,a,6.33,241.747418,238.402629,7.323220e+05
...,...,...,...,...,...,...,...,...
4021,99938978285,Elit Dictum Eu Ltd,opticians,b,4.50,9315.105067,11827.344860,1.324647e+06
4022,99974311662,Mollis LLP,books,b,3.17,97.661042,101.943103,8.080374e+04
4023,99976658299,Sociosqu Corp.,shoe,a,6.57,11441.692197,15838.308518,1.651803e+07
4024,99987905597,Commodo Hendrerit LLC,motor,a,6.82,102.357842,106.959180,2.045275e+05


In [3]:
# impute missing values and negative values with zero
df_merged = df_merged.fillna(0)

# Clip through the public API and assign the result back, rather than mutating
# a frame returned by the private `_get_numeric_data()`: writes to that frame
# only reach df_merged when the two share memory, which is not guaranteed
# (with pandas Copy-on-Write enabled the negative values would silently survive).
numeric_cols = df_merged.select_dtypes(include='number').columns
df_merged[numeric_cols] = df_merged[numeric_cols].clip(lower=0)
df_merged

Unnamed: 0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue
0,10023283211,Felis Limited,furniture,e,0.18,2131.244593,2215.621019,4.383170e+05
1,10142254217,Arcu Ac Orci Corporation,cable,b,4.22,1904.912599,1983.366936,3.296245e+05
2,10165489824,Nunc Sed Company,jewelry,b,4.40,0.000000,0.000000,0.000000e+00
3,10187291046,Ultricies Dignissim Lacus Foundation,watch,b,3.29,232.648056,231.167248,9.578310e+04
4,10192359162,Enim Condimentum PC,music,a,6.33,241.747418,238.402629,7.323220e+05
...,...,...,...,...,...,...,...,...
4021,99938978285,Elit Dictum Eu Ltd,opticians,b,4.50,9315.105067,11827.344860,1.324647e+06
4022,99974311662,Mollis LLP,books,b,3.17,97.661042,101.943103,8.080374e+04
4023,99976658299,Sociosqu Corp.,shoe,a,6.57,11441.692197,15838.308518,1.651803e+07
4024,99987905597,Commodo Hendrerit LLC,motor,a,6.82,102.357842,106.959180,2.045275e+05


### Ranking System
The ranking system uses the predicted "total number of consumers", "total number of transactions" and "total revenue" for the next year to give each merchant a score between 0 and 100. The ranking score is calculated as follows:

1. Standardise each attribute using min-max normalization 
2. Total number of consumers * 30%
3. BNPL revenue * 40%
4. Total number of transactions * 30%

Revenue carries the largest weight because it is considered especially important to the BNPL company.


In [4]:
# min-max normalization: rescale every predicted feature onto a 0-100 range
features = ['pred_total_num_consumer', 'pred_total_num_transaction', 'pred_total_revenue']
for col in features:
    values = df_merged[col]
    lowest, highest = values.min(), values.max()
    df_merged[f'scaled_{col}'] = 100 * (values - lowest) / (highest - lowest)

df_merged.head()

Unnamed: 0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue,scaled_pred_total_num_consumer,scaled_pred_total_num_transaction,scaled_pred_total_revenue
0,10023283211,Felis Limited,furniture,e,0.18,2131.244593,2215.621019,438316.982483,9.502798,1.11225,1.071427
1,10142254217,Arcu Ac Orci Corporation,cable,b,4.22,1904.912599,1983.366936,329624.53217,8.493628,0.995658,0.805738
2,10165489824,Nunc Sed Company,jewelry,b,4.4,0.0,0.0,0.0,0.0,0.0,0.0
3,10187291046,Ultricies Dignissim Lacus Foundation,watch,b,3.29,232.648056,231.167248,95783.104746,1.037332,0.116047,0.234133
4,10192359162,Enim Condimentum PC,music,a,6.33,241.747418,238.402629,732321.957084,1.077904,0.119679,1.790096


In [5]:
# calculate ranking score for each merchant:
# 30% consumers + 30% transactions + 40% revenue, all on the scaled columns
consumer_part = 0.3 * df_merged['scaled_pred_total_num_consumer']
transaction_part = 0.3 * df_merged['scaled_pred_total_num_transaction']
revenue_part = 0.4 * df_merged['scaled_pred_total_revenue']
df_merged['score'] = consumer_part + transaction_part + revenue_part

# highest score gets rank 1; the rank becomes the (sorted) index
df_merged = (
    df_merged
    .assign(rank=df_merged['score'].rank(ascending=False))
    .set_index('rank')
    .sort_index()
)
df_merged.head()

Unnamed: 0_level_0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue,scaled_pred_total_num_consumer,scaled_pred_total_num_transaction,scaled_pred_total_revenue,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.0,86578477987,Leo In Consulting,watch,a,6.43,18356.113566,186946.749066,39925150.0,81.846275,93.84799,97.593499,91.745679
2.0,45629217853,Lacus Consulting,gift,a,6.98,20191.435558,152028.388588,38193320.0,90.029612,76.318838,93.360191,87.248612
3.0,89726005175,Est Nunc Consulting,tent,a,6.01,20434.182107,148236.132238,35873460.0,91.111971,74.415111,87.689512,84.73393
4.0,49891706470,Non Vestibulum Industries,tent,a,5.8,19570.482775,169802.295225,30463440.0,87.260907,85.241408,74.465184,81.536768
5.0,21439773999,Mauris Non Institute,cable,a,6.1,22388.332445,81951.886245,37059750.0,99.825141,41.140163,90.589293,78.525308


### Split Merchants into 4 Segments
Based on [Merchant Category Groups by ANZ](https://www.anz.com/Documents/Business/CommercialCard/Merchant_cateogry_codes_control.pdf), we divide all merchants into 4 segments.

1. Health service: health, opticians

2. Recreational good retailing: bicycle, books, stationery, hobby, tent, digital goods

3. Personal & household good retail: antique, watch, jewelry, music, artist supply, gift, art dealer, florists, furniture, shoe, garden supply

4. Technical & machinery service: cable, telecom, computer, equipment, motor

In [6]:
# split merchants into 4 segments
# tags are listed once per segment, then inverted into the tag -> segment lookup
tags_by_segment = {
    'health service': [
        'health', 'opticians',
    ],
    'recreational good retailing': [
        'bicycle', 'books', 'stationery', 'hobby', 'tent', 'digital goods',
    ],
    'personal & household good retail': [
        'antique', 'watch', 'jewelry', 'music', 'artist supply', 'gift',
        'art dealer', 'florists', 'furniture', 'shoe', 'garden supply',
    ],
    'technical & machinery service': [
        'cable', 'telecom', 'computer', 'equipment', 'motor',
    ],
}
segment = {
    tag: segment_name
    for segment_name, tags in tags_by_segment.items()
    for tag in tags
}

# attach a segment to every merchant by looking its tag up in `segment`;
# tags that have no entry in the dict end up as NaN
df_merged['segment'] = df_merged['tags'].map(segment)

In [7]:
# find top 100 merchants overall
# df_merged is sorted by rank, so take the first 100 rows (the same approach the
# per-segment top 10 cells use). Filtering on `index <= 100` can return more or
# fewer than 100 rows, because tied scores share a fractional average rank.
top100 = df_merged.head(100)
top100.to_csv('../data/curated/top100.csv')

In [13]:
# preview the first 20 rows of the overall top 100
top100.iloc[:20]

Unnamed: 0_level_0,merchant_abn,name,tags,revenue_level,take_rate,pred_total_num_consumer,pred_total_num_transaction,pred_total_revenue,scaled_pred_total_num_consumer,scaled_pred_total_num_transaction,scaled_pred_total_revenue,score,segment
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,86578477987,Leo In Consulting,watch,a,6.43,18356.113566,186946.749066,39925150.0,81.846275,93.84799,97.593499,91.745679,personal & household good retail
2.0,45629217853,Lacus Consulting,gift,a,6.98,20191.435558,152028.388588,38193320.0,90.029612,76.318838,93.360191,87.248612,personal & household good retail
3.0,89726005175,Est Nunc Consulting,tent,a,6.01,20434.182107,148236.132238,35873460.0,91.111971,74.415111,87.689512,84.73393,recreational good retailing
4.0,49891706470,Non Vestibulum Industries,tent,a,5.8,19570.482775,169802.295225,30463440.0,87.260907,85.241408,74.465184,81.536768,recreational good retailing
5.0,21439773999,Mauris Non Institute,cable,a,6.1,22388.332445,81951.886245,37059750.0,99.825141,41.140163,90.589293,78.525308,technical & machinery service
6.0,32361057556,Orci In Consequat Corporation,gift,a,6.61,21544.352688,58447.433467,39867280.0,96.062002,29.340837,97.452059,76.601675,personal & household good retail
7.0,64403598239,Lobortis Ultrices Company,music,a,6.31,22427.549126,77913.665936,35278950.0,100.0,39.112961,86.236284,76.228402,personal & household good retail
8.0,43186523025,Lorem Ipsum Sodales Industries,florists,b,4.47,21166.87885,138185.373344,27543230.0,94.378921,69.369591,67.326996,76.055352,personal & household good retail
9.0,24852446429,Erat Vitae LLP,florists,c,2.94,18360.089252,199201.65464,19451030.0,81.864002,100.0,47.546338,73.577736,personal & household good retail
10.0,94493496784,Dictum Phasellus In Institute,gift,a,5.65,22215.379057,67344.125535,33613700.0,99.053976,33.807011,82.165735,72.72459,personal & household good retail


In [8]:
# find top 10 merchants in each segment
# (df_merged is sorted by rank, so the first 10 matching rows are the best-ranked)
is_personal = df_merged['segment'] == 'personal & household good retail'
personal_top10 = df_merged[is_personal].head(10)
personal_top10.to_csv('../data/curated/personal_top10.csv')

In [9]:
# top 10 merchants of the technical & machinery segment (first 10 rows in rank order)
is_technical = df_merged['segment'] == 'technical & machinery service'
technical_top10 = df_merged[is_technical].head(10)
technical_top10.to_csv('../data/curated/technical_top10.csv')

In [10]:
# top 10 merchants of the recreational goods segment (first 10 rows in rank order)
is_recreational = df_merged['segment'] == 'recreational good retailing'
recreational_top10 = df_merged[is_recreational].head(10)
recreational_top10.to_csv('../data/curated/recreational_top10.csv')

In [11]:
# top 10 merchants of the health service segment (first 10 rows in rank order)
is_health = df_merged['segment'] == 'health service'
health_top10 = df_merged[is_health].head(10)
health_top10.to_csv('../data/curated/health_top10.csv')