In [48]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import feature as H
# Build (or reuse) the Spark session through which the datasets are read.
session_builder = SparkSession.builder.appName("Data_Explorer")
for option_name, option_value in (
    # Eager evaluation renders a DataFrame inline when a cell ends with one.
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.driver.memory", "4g"),
):
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

In [49]:
# Load the curated merchant dataset; the cells below use its merchant_abn and
# Description columns.
merchant_info = spark.read.parquet('../data/curated/full_dataset_Variation/')

In [50]:
# Lookup table (CSV with a header row) that is joined on Description below to
# obtain each merchant's sector.
linkage = spark.read.csv('../data/curated/sector_lookup_table.csv', header=True)

In [51]:
# Pre-process industry: keep one row per distinct (merchant_abn, Description) pair.
merchant_info = merchant_info.select('merchant_abn', 'Description').distinct()

In [52]:
# Attach the lookup-table columns to each merchant by matching on Description.
# Joining on the column name (rather than an equality expression between the two
# frames) keeps a single Description column in the result instead of two
# ambiguous ones. Rows whose Description has no match in the lookup table are
# dropped by the inner join.
merchant_info = merchant_info.join(linkage, on='Description', how='inner')

In [53]:
# Reduce to the distinct (merchant_abn, sector) pairs used by the later joins.
merchant_info = merchant_info.select('merchant_abn', 'sector').distinct()

In [54]:
# Load the four normalised feature tables, replacing nulls with 0.5 as each one is read.
# NOTE(review): 0.5 looks like the midpoint of a 0-1 scaled range - confirm.
customer_scaled = spark.read.parquet('../data/Normalised/customer_scaled/').fillna(0.5)
industry_raw = spark.read.parquet('../data/Normalised/industry_scailed/').fillna(0.5)
merchant_scaled = spark.read.parquet('../data/Normalised/merchant_scaled/').fillna(0.5)
revenue_scaled = spark.read.parquet('../data/Normalised/revenue_scailed/').fillna(0.5)

In [55]:
# Preview the customer feature table (rendered inline because eager evaluation
# is enabled on the session).
customer_scaled

merchant_abn,Unique_Customers,Transaction_per_User,customer_wealth,Loyal_AVG,Customer_Entropy
10255988167,0.0338,0.0018,0.418,0.0188,0.0842
10430380319,0.0064,0.0,0.4125,0.0061,0.0
10618089367,0.1325,0.0064,0.4236,0.0262,0.6354
11215815177,0.0631,0.0027,0.4205,0.0215,0.239
11237511112,0.5782,0.0445,0.4229,0.048,0.6983
11355018654,0.106,0.0054,0.4246,0.0105,0.5049
11411018019,0.0004,0.0,0.351,0.0421,0.0
11470993597,0.0176,0.0009,0.4416,0.0073,0.0156
11563852275,0.0809,0.0036,0.4238,0.0043,0.3546
11566786699,0.8455,0.108,0.4237,0.0157,0.4624


In [56]:
# Preview the merchant feature table (rendered inline because eager evaluation
# is enabled on the session).
merchant_scaled

merchant_abn,postcode_entropy,Customer_Growth_Avg,Total_Business_Days
34440496342,0.3315,0.3917,0.0007
35344855546,0.1248,0.4575,0.0053
37935728745,0.6698,0.7152,0.0
38700038932,0.0295,0.4225,0.0246
38986645707,0.5364,0.5512,0.0001
41956465747,0.3028,0.4328,0.001
35079621278,0.5876,0.4143,0.0001
35769589414,0.2178,0.4632,0.002
37470904997,0.4963,0.5343,0.0002
39759375662,0.4393,0.4188,0.0003


In [57]:
# Shape the two tables that are not yet in "key + feature columns" form.
# Revenue: keep only the key and the two revenue features.
revenue_scaled = revenue_scaled.select(
    'merchant_abn', 'Revenue_Growth_Avg', 'Total_Revenue'
)
# Industry: give each merchant the columns of its sector's row, then drop the
# sector label so that only the key and numeric columns remain.
industry_scaled = (
    merchant_info
    .join(industry_raw, on='sector', how='inner')
    .drop('sector')
)

In [58]:
# Preview again: customer_scaled has not been reassigned since it was loaded,
# so this shows the same table as the earlier preview.
customer_scaled

merchant_abn,Unique_Customers,Transaction_per_User,customer_wealth,Loyal_AVG,Customer_Entropy
10255988167,0.0338,0.0018,0.418,0.0188,0.0842
10430380319,0.0064,0.0,0.4125,0.0061,0.0
10618089367,0.1325,0.0064,0.4236,0.0262,0.6354
11215815177,0.0631,0.0027,0.4205,0.0215,0.239
11237511112,0.5782,0.0445,0.4229,0.048,0.6983
11355018654,0.106,0.0054,0.4246,0.0105,0.5049
11411018019,0.0004,0.0,0.351,0.0421,0.0
11470993597,0.0176,0.0009,0.4416,0.0073,0.0156
11563852275,0.0809,0.0036,0.4238,0.0043,0.3546
11566786699,0.8455,0.108,0.4237,0.0157,0.4624


In [59]:
# Preview again: merchant_scaled has not been reassigned since it was loaded,
# so this shows the same table as the earlier preview.
merchant_scaled

merchant_abn,postcode_entropy,Customer_Growth_Avg,Total_Business_Days
34440496342,0.3315,0.3917,0.0007
35344855546,0.1248,0.4575,0.0053
37935728745,0.6698,0.7152,0.0
38700038932,0.0295,0.4225,0.0246
38986645707,0.5364,0.5512,0.0001
41956465747,0.3028,0.4328,0.001
35079621278,0.5876,0.4143,0.0001
35769589414,0.2178,0.4632,0.002
37470904997,0.4963,0.5343,0.0002
39759375662,0.4393,0.4188,0.0003


In [60]:
from operator import add
from functools import reduce
# For each table we need to perform the following steps
def scroring_process(dataset):
    """Score each row by its squared distance from a value of 1 in every column.

    The first column of ``dataset`` is treated as the row key and is passed
    through unchanged. Every other column ``c`` is replaced by ``(c - 1) ** 2``,
    and the sum of those squared deviations is added as a ``total`` column
    rounded to 8 decimal places.

    Args:
        dataset: Spark DataFrame whose first column is the key and whose
            remaining columns are numeric.

    Returns:
        A new DataFrame with the key column, the squared-deviation columns
        (same names and order as the input) and ``total``.
    """
    key_cols, feature_cols = dataset.columns[:1], dataset.columns[1:]
    # Build every squared deviation in a single projection. Calling withColumn
    # once per column in a loop stacks one projection per column, which the
    # Spark documentation warns can generate very large query plans.
    squared = [F.pow(F.col(name) - 1, 2).alias(name) for name in feature_cols]
    deviations = dataset.select(*key_cols, *squared)
    # Sum the (already squared) columns row-wise.
    total = F.round(reduce(add, [F.col(name) for name in feature_cols]), 8)
    return deviations.withColumn('total', total)


In [61]:
# Define our scaling function
def feature_standardisation(dataset):
    """Min-max scale the ``total`` column into the range [0, 1].

    Args:
        dataset: Spark DataFrame containing ``merchant_abn`` and a numeric
            ``total`` column.

    Returns:
        A DataFrame with only ``merchant_abn`` and the rescaled ``total``
        (rounded to 9 decimal places). When there is nothing to scale against
        - the table is empty, every ``total`` is null, or all values are
        identical - ``total`` is null in every row.
    """
    # Run the min/max aggregation once. Calling head() separately for each
    # bound would re-execute the same Spark job several times.
    bounds = dataset.select(
        F.min('total').alias('low'),
        F.max('total').alias('high'),
    ).head()
    low, high = bounds['low'], bounds['high']

    if low is None or high is None or high == low:
        # Zero (or undefined) range: avoid dividing by zero / None and return
        # nulls explicitly.
        scaled = F.lit(None).cast('double')
    else:
        scaled = F.round((F.col('total') - low) / (high - low), 9)
    return dataset.withColumn('total', scaled).select('merchant_abn', 'total')

In [62]:
# Replace every feature with its squared deviation from 1 and add the row total,
# for each of the four tables in turn.
customer_scaled, industry_scaled, merchant_scaled, revenue_scaled = (
    scroring_process(table)
    for table in (customer_scaled, industry_scaled, merchant_scaled, revenue_scaled)
)

In [63]:
# Min-max rescale each table's total and rename it to a table-specific score column.
customer_scaled = (
    feature_standardisation(customer_scaled.select('merchant_abn', 'total'))
    .withColumnRenamed('total', 'Customer_score')
)
revenue_scaled = (
    feature_standardisation(revenue_scaled.select('merchant_abn', 'total'))
    .withColumnRenamed('total', 'Revenue_score')
)
industry_scaled = (
    feature_standardisation(industry_scaled.select('merchant_abn', 'total'))
    .withColumnRenamed('total', 'Industry_score')
)
merchant_scaled = (
    feature_standardisation(merchant_scaled.select('merchant_abn', 'total'))
    .withColumnRenamed('total', 'Merchant_score')
)

                                                                                

In [64]:
# Multiplier applied to the industry score before the tables are combined
# (a value of 1 leaves the score unchanged).
Industry_scale = 1
industry_scaled = industry_scaled.withColumn(
    'Industry_score',
    industry_scaled['Industry_score'] * Industry_scale,
)

In [65]:
# Combine the four score tables on merchant_abn; inner joins keep only the
# merchants that appear in every table.
Full_scores = reduce(
    lambda joined, score_table: joined.join(score_table, on='merchant_abn', how='inner'),
    [customer_scaled, revenue_scaled, industry_scaled, merchant_scaled],
)

In [66]:
# Combine the four scores into one figure: scroring_process replaces each score s
# with (s - 1)^2 and adds the sum of those values as 'total' (rounded to 8 decimals).
Full_scores = scroring_process(Full_scores)

In [67]:
# Re-attach the sector label to the scored merchants. The left join keeps every
# scored merchant even if no sector row is found for it.
Full_scores = Full_scores.join(
    merchant_info
    .join(industry_raw, on='sector', how='inner')
    .select('merchant_abn', 'sector')
    .distinct(),
    on='merchant_abn',
    how='left',
)

In [68]:
# Persist the final scores as parquet, overwriting any existing output at this path.
Full_scores.write.parquet('../data/Normalised/Final_scores', mode='overwrite')

                                                                                

In [69]:
# Preview the final table sorted by total in ascending order (smallest totals first).
Full_scores.orderBy('total')

                                                                                

merchant_abn,Customer_score,Revenue_score,Industry_score,Merchant_score,total,sector
47830349274,0.0182565364007959,0.1532492909508383,0.0,0.0580207338295043,0.22952656,Administrative_Su...
31420388495,0.0179025526597772,0.1753600992413972,0.0,0.0489187871454945,0.24218144,Administrative_Su...
67200153151,0.0322018779226995,0.1809177871535078,0.0,0.0383411734882145,0.25146084,Administrative_Su...
25714080598,0.0281920198481824,0.1829086880891319,0.0,0.0414147319231855,0.25251544,Administrative_Su...
33344911835,0.0192704840445125,0.1787017655270918,0.0,0.0561812061630625,0.25415346,Administrative_Su...
34940973001,0.0232777873740775,0.1926495409373088,0.0,0.0389258982582397,0.25485323,Administrative_Su...
32258116290,0.0184267221288739,0.1952062078183892,0.0,0.0429496050934082,0.25658254,Administrative_Su...
73416415122,0.0143718327476912,0.1710646969505011,0.0,0.0713686426994063,0.25680517,Administrative_Su...
95834237054,0.0302808057226884,0.1882386490599075,0.0,0.0395259600460314,0.25804541,Administrative_Su...
75900778714,0.0194207106944041,0.1941867077544316,0.0,0.0454952190231429,0.25910264,Administrative_Su...
