In [1]:
import pandas as pd
import numpy as np
import os
import re

# Set working directory
# Only change directory when we are not already inside one of the data folders.
# The guard has to recognise the directory we change INTO ("data/curated");
# otherwise re-running this cell would resolve "../data/curated" a second time,
# relative to the new location. The original "/data/tables" check is kept so
# that starting from that folder still leaves the working directory alone.
_cwd = os.getcwd().replace(os.sep, "/")  # normalise separators for the substring test
if "/data/tables" not in _cwd and "/data/curated" not in _cwd:
    os.chdir("../data/curated")

from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

# Spark session options, applied to the builder one at a time below.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# NOTE(review): `from pyspark.shell import spark` above appears to have already
# started a session (see the banner printed by this cell), so getOrCreate()
# presumably hands back that existing session and start-up options such as
# spark.driver.memory may not take effect -- confirm.
spark = session_builder.getOrCreate()

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.3.0
      /_/

Using Python version 3.9.12 (main, Apr  4 2022 05:22:27)
Spark context Web UI available at http://192.168.1.6:4041
Spark context available as 'sc' (master = local[*], app id = local-1664697996738).
SparkSession available as 'spark'.


# Read dataset

In [None]:
# Load the merchant statistics straight into pandas from the parquet folder
# (path is relative to the current working directory).
final = pd.read_parquet(path='./final_merchant_statistics')

## If the code above cannot read the folder, run the code below instead

In [8]:
# import modules
from pyspark.sql import SparkSession
import functools
 
# explicit function
def unionAll(dfs):
    """Stack several DataFrames into one, left to right.

    Each subsequent frame is first re-selected with the column list of the
    accumulated result, then appended to it with ``union``.

    Parameters
    ----------
    dfs : iterable
        Objects exposing ``columns``, ``select`` and ``union`` (used here with
        Spark DataFrames). Must contain at least one element.

    Returns
    -------
    The combined frame; a single-element input is returned unchanged.
    """
    def _append(combined, following):
        # Re-select so the appended frame follows the accumulated column order.
        aligned = following.select(combined.columns)
        return combined.union(aligned)

    return functools.reduce(_append, dfs)

# Folder holding the individual parquet part files.
path = "./final_merchant_statistics/final_merchant_statistics/"

# Keep only entries whose name starts with "p" (this skips other entries in
# the folder). os.listdir returns names in arbitrary order, so sort them to
# make the row order of the combined frame reproducible between runs.
list_dir = sorted(name for name in os.listdir(path) if name.startswith("p"))
list_files = list_dir

if not list_files:
    # Fail with an explicit message instead of an IndexError further down.
    raise FileNotFoundError(
        f"No part files (names starting with 'p') found in {path!r}"
    )

# Read every part file with Spark and stack them into a single Spark DataFrame.
final = unionAll([spark.read.parquet(path + name) for name in list_files])

# Collect the combined result to the driver as a pandas DataFrame and show it.
final = final.toPandas()
final

Unnamed: 0,merchant_abn,name,tags,tag,revenue_level,take_rate,sales_revenue,no_orders,avg_daily_rev,avg_value_per_order,avg_daily_order,avg_daily_commission,avg_commission_per_order,sa2_region_count,median_customer_income,returning_customer,mean_spending,std_spending,vip_customer
0,10023283211,Felis Limited,"((furniture, home furnishings and equipment sh...","furniture, home furnishings and equipment shop...",e,0.18,7.032777e+05,3261.0,1162.442498,215.663205,5.390083,2.092396,0.388194,1717.0,800.259547,11.0,231.951752,155.424439,137.0
1,10346855916,Odio Institute,"((equipment, tool, furniture, and appliance r...","equipment, tool, furniture, and appliance ren...",b,3.57,6.775178e+03,6.0,11.198642,1129.196413,0.009917,0.399792,40.312312,7.0,795.400000,0.0,1129.196413,717.988423,0.0
2,10385163239,Sed Et Company,"([florists supplies, nursery stock, and flower...","florists supplies, nursery stock, and flowers",a,6.61,2.519346e+04,74.0,41.642088,340.452204,0.122314,2.752542,22.503891,132.0,800.853287,0.0,345.115933,203.496326,3.0
3,10648956813,Proin Nisl Institute,"[(computeRs, computer peripheral equipment, an...","computers, computer peripheral equipment, and ...",a,6.66,1.419600e+06,21981.0,2346.446772,64.583063,36.332231,156.273355,4.301232,2208.0,835.366766,1574.0,98.720466,57.083817,710.0
4,10714068705,Sollicitudin Commodo Ipsum Industries,"([furniture, home furnishings and equipment sh...","furniture, home furnishings and equipment shop...",c,2.51,4.355100e+05,3438.0,719.851158,126.675378,5.682645,18.068264,3.179552,1715.0,802.506127,10.0,135.209547,113.737044,145.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,98850646791,Sapien Industries,"[[hobby, toy and game shops], [b], [take rate:...","hobby, toy and game shops",b,3.66,5.965319e+03,4.0,9.860032,1491.329779,0.006612,0.360877,54.582670,5.0,855.125000,0.0,1491.329779,312.966965,0.0
4022,98973094975,Ornare Fusce Inc.,"[(hobby, toy and game shops), (a), (take rate:...","hobby, toy and game shops",a,5.98,6.820456e+06,54611.0,11273.480318,124.891608,90.266116,674.154123,7.468518,2222.0,826.758965,9590.0,316.347662,206.620943,945.0
4023,99009287608,Nunc Risus LLP,"([computer programming , data processing, and ...","computer programming , data processing, and in...",b,3.15,4.803691e+04,260.0,79.399848,184.757339,0.429752,2.501095,5.819856,419.0,774.127507,0.0,185.470688,132.675435,10.0
4024,99420575685,Facilisi Consulting,"[[art dealers and galleries], [b], [take rate:...",art dealers and galleries,b,4.78,3.694443e+03,3.0,6.106517,1231.480862,0.004959,0.291891,58.864785,6.0,713.750000,0.0,1231.480862,461.033395,0.0


# Standardisation with min-max scaling (normalisation)

#### Why use min-max scaling: It is one method of standardisation. Since a ranking score should not take negative values, min-max scaling is suitable because it maps every feature onto the range 0 to 1. It can also be applied to all of the continuous variables.


In [9]:
# Encode the revenue level letters as integers:
# "a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, and any other value -> 5.
final['revenue_level'] = np.select(
    [final['revenue_level'] == level for level in ("a", "b", "c", "d")],
    [1, 2, 3, 4],
    default=5,
)

In [10]:
# min_max_scale
def min_max(data):
    """Rescale numeric data linearly onto the range [0, 1].

    Each value ``x`` becomes ``(x - min) / (max - min)``, where ``min`` and
    ``max`` are computed while ignoring NaN entries. NaN entries stay NaN.

    Parameters
    ----------
    data : pandas.Series, numpy.ndarray or sequence of numbers
        Values to rescale. A Series keeps its index in the result.

    Returns
    -------
    Same container type as ``data`` (plain sequences become ndarrays).
    A constant input (max == min) yields all zeros so the result still lies in
    [0, 1]. Empty or all-NaN input is returned unchanged.
    """
    # Keep Series / ndarray inputs as they are so the result has the same type;
    # turn plain sequences into a float array so arithmetic works on them.
    values = data if hasattr(data, "dtype") else np.asarray(data, dtype=float)
    as_float = np.asarray(values, dtype=float)

    # Nothing to scale: avoid the ValueError / warnings of nanmin on such input.
    if as_float.size == 0 or np.isnan(as_float).all():
        return values

    lowest = np.nanmin(as_float)
    spread = np.nanmax(as_float) - lowest

    if spread == 0:
        # Constant column: dividing by zero is undefined, so map it to 0.
        return values - lowest
    return (values - lowest) / spread

In [11]:
# Numeric columns that are rescaled with min_max.
list_name = [
    'revenue_level', 'take_rate',
    'sales_revenue', 'no_orders', 'avg_daily_rev', 'avg_value_per_order',
    'avg_daily_order', 'avg_daily_commission', 'avg_commission_per_order',
    'sa2_region_count', 'median_customer_income', 'returning_customer',
    'mean_spending', 'std_spending', 'vip_customer',
]

# Scale every listed column independently, overwriting it in place.
for column in list_name:
    final[column] = min_max(final[column])

final

Unnamed: 0,merchant_abn,name,tags,tag,revenue_level,take_rate,sales_revenue,no_orders,avg_daily_rev,avg_value_per_order,avg_daily_order,avg_daily_commission,avg_commission_per_order,sa2_region_count,median_customer_income,returning_customer,mean_spending,std_spending,vip_customer
0,10023283211,Felis Limited,"((furniture, home furnishings and equipment sh...","furniture, home furnishings and equipment shop...",1.00,0.011594,0.071679,0.011340,0.071679,0.100703,0.011340,0.002015,0.002522,0.772625,0.089501,0.000457,0.108464,0.112990,0.128518
1,10346855916,Odio Institute,"((equipment, tool, furniture, and appliance r...","equipment, tool, furniture, and appliance ren...",0.25,0.502899,0.000654,0.000017,0.000654,0.546974,0.000017,0.000375,0.306225,0.002701,0.088308,0.000000,0.546875,0.521963,0.000000
2,10385163239,Sed Et Company,"([florists supplies, nursery stock, and flower...","florists supplies, nursery stock, and flowers",0.00,0.943478,0.002532,0.000254,0.002532,0.161664,0.000254,0.002654,0.170756,0.058982,0.089646,0.000000,0.163758,0.147938,0.002814
3,10648956813,Proin Nisl Institute,"[(computeRs, computer peripheral equipment, an...","computers, computer peripheral equipment, and ...",0.00,0.950725,0.144726,0.076455,0.144726,0.026899,0.076455,0.151369,0.032289,0.993697,0.098115,0.065398,0.043365,0.041499,0.666041
4,10714068705,Sollicitudin Commodo Ipsum Industries,"([furniture, home furnishings and equipment sh...","furniture, home furnishings and equipment shop...",0.50,0.349275,0.044374,0.011955,0.044374,0.057232,0.011955,0.017490,0.023756,0.771724,0.090052,0.000415,0.061194,0.082685,0.136023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,98850646791,Sapien Industries,"[[hobby, toy and game shops], [b], [take rate:...","hobby, toy and game shops",0.25,0.515942,0.000571,0.000010,0.000571,0.723881,0.000010,0.000337,0.414780,0.001801,0.102963,0.000000,0.723820,0.227521,0.000000
4022,98973094975,Ornare Fusce Inc.,"[(hobby, toy and game shops), (a), (take rate:...","hobby, toy and game shops",0.00,0.852174,0.695476,0.189955,0.695476,0.056360,0.189955,0.653038,0.056382,1.000000,0.096003,0.398454,0.149701,0.150209,0.886492
4023,99009287608,Nunc Risus LLP,"([computer programming , data processing, and ...","computer programming , data processing, and in...",0.25,0.442029,0.004862,0.000901,0.004862,0.085605,0.000901,0.002410,0.043841,0.188204,0.083089,0.000000,0.085752,0.096452,0.009381
4024,99420575685,Facilisi Consulting,"[[art dealers and galleries], [b], [take rate:...",art dealers and galleries,0.25,0.678261,0.000340,0.000007,0.000340,0.596942,0.000007,0.000270,0.447354,0.002251,0.068274,0.000000,0.596853,0.335162,0.000000


# NaN -> 0

In [12]:
# Replace every missing value in the frame with 0 (applies to all columns).
final = final.fillna(value=0)

# In Progress!!!
# First, assume every feature has the same weight

In [13]:
# Columns that are added together for the score. They are named explicitly
# rather than taken by position (previously iloc[:, 4:16]), because the
# reset_index(drop=False) below inserts an "index" column at the front and any
# positional slice would then silently pick different columns on a re-run.
score_columns = [
    'revenue_level', 'take_rate', 'sales_revenue', 'no_orders',
    'avg_daily_rev', 'avg_value_per_order', 'avg_daily_order',
    'avg_daily_commission', 'avg_commission_per_order',
    'sa2_region_count', 'median_customer_income', 'returning_customer',
]

# Equal weights: sum of the columns above, plus vip_customer, minus
# std_spending. mean_spending is not part of the score (unchanged from the
# positional version).
final['final_score'] = final[score_columns].sum(axis=1) + final["vip_customer"] - final["std_spending"]

# Highest score first; the previous row labels are kept as an "index" column.
final = final.sort_values(by="final_score", ascending=False).reset_index(drop=False)

In [14]:
# Rank is the 1-based row position.
# NOTE(review): assumes the frame is already sorted by final_score in
# descending order when this runs -- confirm the sorting cell ran just before.
final["rank"] = np.arange(1, len(final) + 1, dtype="int64")
final

Unnamed: 0,index,merchant_abn,name,tags,tag,revenue_level,take_rate,sales_revenue,no_orders,avg_daily_rev,...,avg_daily_commission,avg_commission_per_order,sa2_region_count,median_customer_income,returning_customer,mean_spending,std_spending,vip_customer,final_score,rank
0,1926,86578477987,Leo In Consulting,"[[watch, clock, and jewelry repair shops], [a]...","watch, clock, and jewelry repair shops",0.00,0.917391,0.972587,0.943093,0.972587,...,0.981953,0.016776,1.0,0.095327,0.999585,0.188658,0.104536,0.770169,8.520560,1
1,1327,45629217853,Lacus Consulting,"[[gift, Card, novelty, and souvenir shops], [a...","gift, card, novelty, and souvenir shops",0.00,0.997101,0.854946,0.753671,0.854946,...,0.937017,0.020115,1.0,0.095325,0.994848,0.165257,0.115821,0.868668,8.028740,2
2,3143,24852446429,Erat Vitae LLP,"[(florists supplies, nursery stock, and floWer...","florists supplies, nursery stock, and flowers",0.50,0.411594,0.886760,1.000000,0.886760,...,0.409354,0.006334,1.0,0.095328,1.000000,0.171588,0.092617,0.755159,7.868799,3
3,495,89726005175,Est Nunc Consulting,"((tent and awning shops), (a), (take rate: 6.01))",tent and awning shops,0.00,0.856522,0.908425,0.750944,0.908425,...,0.857266,0.018435,1.0,0.095327,0.994266,0.175907,0.103375,0.791745,7.844432,4
4,604,43186523025,Lorem Ipsum Sodales Industries,"([florists supplies, nurSery stock, and flower...","florists supplies, nursery stock, and flowers",0.25,0.633333,0.922006,0.698618,0.922006,...,0.647129,0.014877,1.0,0.095332,0.989447,0.178632,0.108223,0.783302,7.563786,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,965,93723130289,Quam Quis Diam Company,"([antique shops - sales, repairs, and restorat...","antique shops - sales, repairs, and restoratio...",0.25,0.449275,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.699275,4022
4022,4005,93915598279,Molestie Pharetra Nibh LLP,"([antique shops - sales, repairs, and restorat...","antique shops - sales, repairs, and restoratio...",0.50,0.198551,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.698551,4023
4023,1343,51527394775,Sollicitudin Orci Limited,"((antique shops - sales, repairs, aNd restorat...","antique shops - sales, repairs, and restoratio...",0.50,0.197101,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.697101,4024
4024,2736,54277261175,In Magna PC,"[(antique shops - sales, repairs, and resTorat...","antique shops - sales, repairs, and restoratio...",0.50,0.197101,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.697101,4025
