In [2]:
import pandas as pd
import re
from collections import Counter, OrderedDict
import numpy as np
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# lemma = WordNetLemmatizer()
# from sklearn.feature_extraction.text import CountVectorizer

### Merchants data preprocessing

In [3]:
merchants_df = pd.read_parquet('../data/tables/tbl_merchants.parquet').reset_index()

In [4]:
# A sample of an entry in the tags columns of merchants data
merchants_df.head(1).loc[0, 'tags']

'((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))'

In [5]:
#First separte text into 3 separate features (separated by () or [])
def separate_tags(row):
    features = re.findall(r'[\(|\[][\(|\[](.*)[\)|\]],\s[\(|\[](.*)[\)|\]],\s[\(|\[](.*)[\)|\]][\)|\]]', row['tags'])
    row['feature_1'] = features[0][0]
    row['feature_2'] = features[0][1]
    row['feature_3'] = features[0][2]
    return row
merchants_df = merchants_df.apply(separate_tags, axis = 1)

In [6]:
# convert the take rate feature (feature_2) to float type
def get_take_rate(x):
    feature = re.findall('take rate: (\d+\.\d+)', x)
    return float(feature[0])

merchants_df['take_rate'] = merchants_df['feature_3'].apply(get_take_rate)
merchants_df = merchants_df.rename(columns = {'feature_2': 'revenue_level', 'feature_1': 'category'})
merchants_df = merchants_df.drop(columns = ['tags', 'feature_3'])

In [7]:
merchants_df[["merchant_abn", "revenue_level"]].to_csv("../data/curated/abn_band.csv")

In [7]:
# 971 Unique instances of the unpreprocessed tags
# Preprocessing 'category' occurs in this cell

merchants_df['category'] = merchants_df['category'].str.lower()
merchants_df['category'] = merchants_df['category'].str.split(' and ')

new_category = []
for e in merchants_df['category']:
    new = ', '.join(e)
    new_category.append(new)
merchants_df['category'] = new_category

merchants_df['category'] = merchants_df['category'].str.split(',')

new_category = []
for e in merchants_df['category']: 
    # # Lemmatizing
    # for i in e:
    #     new_e.append(lemma.lemmatize(i))
    
    new_e = []
    # Removing leading and trailing whitespace
    for i in e:
        word = i.lstrip().rstrip()
        word = re.sub(r'\s{2,}', ' ', word)
        new_e.append(word)

    # Removing empty options in list
    removeIndex = []
    for i in range(len(new_e)):
        if not new_e[i]:
            removeIndex.insert(0, i)
    for i in removeIndex:
        new_e.pop(i)     

    new_category.append(new_e)

merchants_df['category'] = new_category

In [9]:
merged_categories = []
for e in merchants_df['category']:
    merged_categories.extend(e)
print(len(merged_categories))
C = Counter(merged_categories)
# print(len(C))
# print(C)

11731


In [10]:
merchants_df

Unnamed: 0,merchant_abn,name,category,revenue_level,take_rate
0,10023283211,Felis Limited,"[furniture, home furnishings, equipment shops,...",e,0.18
1,10142254217,Arcu Ac Orci Corporation,"[cable, satellite, other pay television, radio...",b,4.22
2,10165489824,Nunc Sed Company,"[jewelry, watch, clock, silverware shops]",b,4.40
3,10187291046,Ultricies Dignissim Lacus Foundation,"[watch, clock, jewelry repair shops]",b,3.29
4,10192359162,Enim Condimentum PC,"[music shops - musical instruments, pianos, sh...",a,6.33
...,...,...,...,...,...
4021,99938978285,Elit Dictum Eu Ltd,"[opticians, optical goods, eyeglasses]",b,4.50
4022,99974311662,Mollis LLP,"[books, periodicals, newspapers]",b,3.17
4023,99976658299,Sociosqu Corp.,[shoe shops],a,6.57
4024,99987905597,Commodo Hendrerit LLC,"[motor vehicle supplies, new parts]",a,6.82


In [11]:
merchants_df.to_parquet('../data/curated/merchants_df.parquet')

In [2]:
merchants_df = pd.read_parquet('../data/curated/merchants_df.parquet')

Implementing a rudimentary heuristic for ranking 'category', based on the average revenue level associated with the category

In [170]:
revenue_level_dict = {category : [] for category in list(C)}

for index in merchants_df.index:
    for category in merchants_df['category'][index]:
        revenue_level_dict[category].append(merchants_df['revenue_level'][index])

# for category in list(C):
#     C2 = Counter(revenue_level_dict[category])
#     revenue_level_dict[category] = C2

for category in list(C):    
    sum = 0
    instances = 0
    for revenue_level in revenue_level_dict[category]:
        match revenue_level:
            case "a":
                sum += 1
            case "b":
                sum += 2
            case "c":
                sum += 3
            case "d":
                sum += 4
            case "e":
                sum += 5
        instances += 1
    revenue_level_dict[category] = sum / instances

# Reference: https://www.geeksforgeeks.org/python-sort-python-dictionaries-by-key-or-value/
sorted_value_index = np.argsort(revenue_level_dict.values())
dictionary_keys = list(revenue_level_dict.keys())
sorted_dict = {dictionary_keys[i]: sorted(
    revenue_level_dict.values())[i] for i in range(len(dictionary_keys))}


sorted_dict

{'furniture': 1.7527472527472527,
 'home furnishings': 1.7527472527472527,
 'equipment shops': 1.7527472527472527,
 'manufacturers': 1.7527472527472527,
 'except appliances': 1.7647058823529411,
 'cable': 1.7647058823529411,
 'satellite': 1.7647058823529411,
 'other pay television': 1.7724550898203593,
 'radio services': 1.7724550898203593,
 'jewelry': 1.7724550898203593,
 'watch': 1.7802197802197801,
 'clock': 1.7802197802197801,
 'silverware shops': 1.7802197802197801,
 'jewelry repair shops': 1.7802197802197801,
 'music shops - musical instruments': 1.8054054054054054,
 'pianos': 1.8102564102564103,
 'sheet music': 1.8102564102564103,
 'gift': 1.8102564102564103,
 'card': 1.8176470588235294,
 'novelty': 1.822857142857143,
 'souvenir shops': 1.822857142857143,
 'computers': 1.822857142857143,
 'computer peripheral equipment': 1.822857142857143,
 'software': 1.8232044198895028,
 'computer programming': 1.8232044198895028,
 'data processing': 1.8232044198895028,
 'integrated systems de

Not a large difference between the minimum and maximum categories according to this heuristic. Plan to scrap this, and rank categories better once total transactions per company are calculated.