In [None]:
import pandas as pd
from functools import reduce

In [None]:
# read the curated prediction outputs and the merchant details
num_consumer = pd.read_csv('../data/curated/pred_total_num_consumer.csv')
num_transaction = pd.read_csv('../data/curated/pred_total_num_transaction.csv')
revenue = pd.read_csv('../data/curated/pred_total_revenue.csv')
merchant_info = pd.read_csv('../data/curated/merchant.csv')

# merge the four data frames on merchant abn, one after another;
# the outer join keeps every merchant that appears in at least one of them
data_frames = [merchant_info, num_consumer, num_transaction, revenue]
df_merged = data_frames[0]
for next_frame in data_frames[1:]:
    df_merged = pd.merge(df_merged, next_frame, on=['merchant_abn'], how='outer')
df_merged

In [None]:
# impute missing values and negative values with zero
df_merged = df_merged.fillna(0)

# Clip the numeric columns at zero and assign the result back explicitly.
# Mutating the frame returned by the private `_get_numeric_data()` only changes
# `df_merged` when pandas hands back a view; with Copy-on-Write enabled the
# negative values would silently stay in `df_merged`.
num = df_merged.select_dtypes(include='number').clip(lower=0)
df_merged[num.columns] = num
df_merged

### Ranking System
The ranking system uses the predicted "total number of consumers", "total number of transactions" and "total revenue" for next year and gives each merchant a score within 0-100. The ranking score is calculated as follows:

1. Standardise each attribute to the range 0-100 using min-max normalization
2. Total number of consumers * 30%
3. BNPL revenue * 40%
4. Total number of transactions * 30%

Revenue is given the largest weight because it is considered the most important measure for the BNPL company.


In [None]:
# min-max normalization: rescale every predicted feature to the range 0-100
features = ['pred_total_num_consumer', 'pred_total_num_transaction', 'pred_total_revenue']
for col in features:
    col_min = df_merged[col].min()
    col_range = df_merged[col].max() - col_min
    if col_range == 0:
        # All merchants share the same value, so this feature cannot separate them.
        # Use 0 rather than dividing by zero, which would turn the scaled column
        # (and every score built from it) into NaN.
        df_merged[f'scaled_{col}'] = 0.0
    else:
        df_merged[f'scaled_{col}'] = 100 * (df_merged[col] - col_min) / col_range

df_merged.head()

In [None]:
# calculate ranking score for each merchant as a weighted sum of the scaled features:
# 30% consumers + 30% transactions + 40% revenue
consumer_part = 0.3 * df_merged['scaled_pred_total_num_consumer']
transaction_part = 0.3 * df_merged['scaled_pred_total_num_transaction']
revenue_part = 0.4 * df_merged['scaled_pred_total_revenue']
df_merged['score'] = consumer_part + transaction_part + revenue_part

# the highest score gets rank 1 (tied scores share the average of their ranks);
# the rank then becomes the index and the frame is ordered best-first
df_merged['rank'] = df_merged['score'].rank(ascending=False)
df_merged = df_merged.set_index('rank').sort_index()
df_merged.head()

### Split Merchants into 4 Segments
Based on [Merchant Category Groups by ANZ](https://www.anz.com/Documents/Business/CommercialCard/Merchant_cateogry_codes_control.pdf), we divide all merchants into 4 categories.

1. Health service: health, opticians

2. Recreational good retailing: bicycle, books, stationery, hobby, tent, digital goods

3. Personal & household good retail: antique, watch, jewellery, music, artist supply, gift, art dealer, florists, furniture, shoe, garden supply

4. Technical & machinery service: cable, telecom, computer, equipment, motor

In [None]:
# split merchants into 4 segments
# The tags are listed once per segment and then inverted into the
# tag -> segment lookup that is applied to the `tags` column.
tags_by_segment = {
    'personal & household good retail': [
        'furniture', 'watch', 'music', 'gift', 'artist supply', 'florists',
        'jewelry', 'art dealer', 'shoe', 'antique', 'garden supply',
    ],
    'technical & machinery service': [
        'cable', 'computer', 'equipment', 'motor', 'telecom',
    ],
    'recreational good retailing': [
        'books', 'stationery', 'tent', 'bicycle', 'digital goods', 'hobby',
    ],
    'health service': [
        'opticians', 'health',
    ],
}
segment = {
    tag: segment_name
    for segment_name, tags in tags_by_segment.items()
    for tag in tags
}

# tags that are not a key of the lookup end up with a missing segment
df_merged['segment'] = df_merged['tags'].map(segment)

In [None]:
# find top 100 merchants overall
# df_merged is indexed by rank and sorted best-first, so the first 100 rows are
# the 100 best merchants. Selecting by position (as the per-segment top 10 cells
# do) always yields exactly 100 rows; filtering on `rank <= 100` can return more
# or fewer when tied scores share an averaged rank around the cut-off.
top100 = df_merged.head(100)
top100.to_csv('../data/curated/top100.csv')

In [None]:
# find top 10 merchants in each segment
# df_merged is already ordered by rank, so the first 10 matching rows are the segment's best
in_personal = df_merged['segment'] == 'personal & household good retail'
personal_top10 = df_merged[in_personal].iloc[:10]
personal_top10.to_csv('../data/curated/personal_top10.csv')

In [None]:
# top 10 merchants of the technical & machinery segment (frame is already ordered by rank)
in_technical = df_merged['segment'] == 'technical & machinery service'
technical_top10 = df_merged[in_technical].iloc[:10]
technical_top10.to_csv('../data/curated/technical_top10.csv')

In [None]:
# top 10 merchants of the recreational goods segment (frame is already ordered by rank)
in_recreational = df_merged['segment'] == 'recreational good retailing'
recreational_top10 = df_merged[in_recreational].iloc[:10]
recreational_top10.to_csv('../data/curated/recreational_top10.csv')

In [None]:
# top 10 merchants of the health service segment (frame is already ordered by rank)
in_health = df_merged['segment'] == 'health service'
health_top10 = df_merged[in_health].iloc[:10]
health_top10.to_csv('../data/curated/health_top10.csv')