In [144]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import feature as H
# First, start (or reuse) the Spark session used to read the datasets.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}
session_builder = SparkSession.builder.appName("Data_Explorer")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

In [145]:
# Load the curated dataset; the following cells keep only its merchant_abn and
# Description columns.
merchant_info = spark.read.parquet('../data/curated/full_dataset_Variation/')

In [146]:
# Load the lookup table (CSV with a header row). It is joined to merchant_info on
# Description further down and is where the `sector` column comes from.
linkage = spark.read.csv('../data/curated/sector_lookup_table.csv', header=True)

In [147]:
# Pre-process industry: reduce to the unique (merchant_abn, Description) pairs.
merchant_info = (
    merchant_info
    .select(['merchant_abn', 'Description'])
    .distinct()
)

In [148]:
# Attach the lookup-table columns to each merchant. Joining on the column *name*
# is the same equi-join as comparing the two Description columns, but the result
# carries a single Description column instead of two identically named ones,
# so it can still be referenced unambiguously afterwards.
merchant_info = merchant_info.join(linkage, on='Description', how='inner')

In [149]:
# Keep just the unique merchant -> sector mapping.
merchant_info = (
    merchant_info
    .select(['merchant_abn', 'sector'])
    .distinct()
)

In [150]:
# Read all four normalised tables (same order as the names on the left).
customer_scaled, industry_raw, merchant_scaled, revenue_scaled = (
    spark.read.parquet(table_path)
    for table_path in (
        '../data/Normalised/customer_scailed/',
        '../data/Normalised/industry_scailed/',
        '../data/Normalised/merchant_scailed/',
        '../data/Normalised/revenue_scailed/',
    )
)

In [151]:
# Fix-ups before scoring:
#  - give every merchant the industry features of its sector (the sector key is
#    dropped once the join is done, leaving merchant_abn as the first column)
#  - keep only the revenue columns that should be scored
industry_scaled = (
    merchant_info
    .join(industry_raw, on='sector')
    .drop('sector')
)
revenue_scaled = revenue_scaled.select(
    ['merchant_abn', 'Revenue_Growth_Avg', 'Total_Revenue']
)

In [152]:
from operator import add
from functools import reduce
# For each table we need to perform the following steps
def scroring_process(dataset):
    """Score each row by its squared distance from 1 across the feature columns.

    The first column of ``dataset`` is left untouched (callers in this notebook
    pass frames whose first column is ``merchant_abn``). Every other column is
    replaced by ``(value - 1) ** 2``, and a ``total`` column holding the sum of
    those replaced columns, rounded to 4 decimal places, is then added. If one
    of the feature columns is itself called ``total`` it is overwritten by
    that sum.
    """
    feature_names = dataset.columns[1:]

    # Fold the per-column transform over the frame, one feature at a time.
    squared = reduce(
        lambda frame, name: frame.withColumn(name, F.pow(F.col(name) - 1, 2)),
        feature_names,
        dataset,
    )

    # Row-wise sum of the (now squared) feature columns.
    row_sum = reduce(add, map(F.col, feature_names))
    return squared.withColumn('total', F.round(row_sum, 4))


In [153]:
# Define our scaling function
def feature_standardisation(dataset):
    """Min-max rescale the ``total`` column of ``dataset`` to the range [0, 1].

    Parameters
    ----------
    dataset : pyspark.sql.DataFrame
        Must contain a ``merchant_abn`` column and a ``total`` column that
        supports arithmetic.

    Returns
    -------
    pyspark.sql.DataFrame
        Only the ``merchant_abn`` and ``total`` columns, where ``total`` is
        ``(total - min) / (max - min)`` rounded to 4 decimal places. When the
        range is undefined (no non-null totals, or every total identical)
        ``total`` is null for every row.
    """
    # Collect both bounds with a single Spark job instead of re-running the
    # aggregation for every lookup.
    bounds = dataset.select(
        F.max('total').alias('high'),
        F.min('total').alias('low'),
    ).head()
    high, low = bounds['high'], bounds['low']

    if high is None or high == low:
        # Nothing to scale against: avoid `None - None` in Python and a
        # division by zero in Spark, and report the score as missing.
        rescaled = F.lit(None).cast('double')
    else:
        rescaled = F.round((F.col('total') - low) / (high - low), 4)

    return dataset.withColumn('total', rescaled).select('merchant_abn', 'total')

In [154]:
# Score every table: squared distance from 1 per feature, summed into `total`.
customer_scaled, industry_scaled, merchant_scaled, revenue_scaled = map(
    scroring_process,
    (customer_scaled, industry_scaled, merchant_scaled, revenue_scaled),
)

In [155]:
# Inspect the revenue table after scoring (squared features plus `total`).
revenue_scaled

merchant_abn,Revenue_Growth_Avg,Total_Revenue,total
12516851436,0.19509889,0.99281296,1.1879
15613631617,0.2695686399999999,0.99460729,1.2642
19839532017,0.2752051599999999,0.984064,1.2593
34440496342,0.2849424400000001,0.99840064,1.2833
35344855546,0.25080064,0.98783721,1.2386
37935728745,0.07436529,0.97792321,1.0523
38700038932,0.24970009,0.00876096,0.2585
41956465747,0.23020804,0.99261369,1.2228
48214071373,0.2677027600000001,0.97911025,1.2468
52763133264,0.23882769,0.99740169,1.2362


In [156]:
# Rescale every table's `total` to [0, 1] with the same min-max step, then give
# it a table-specific name. All four tables must go through
# feature_standardisation: running scroring_process on a (merchant_abn, total)
# frame would replace total with (total - 1)^2, which is not a rescale and is not
# monotonic in the original distance (totals above 1 would fold back).
# feature_standardisation already returns only merchant_abn and total.
customer_scaled = feature_standardisation(customer_scaled).withColumnRenamed('total', 'Customer_score')
revenue_scaled = feature_standardisation(revenue_scaled).withColumnRenamed('total', 'Revenue_score')
industry_scaled = feature_standardisation(industry_scaled).withColumnRenamed('total', 'Industry_score')
merchant_scaled = feature_standardisation(merchant_scaled).withColumnRenamed('total', 'Merchant_score')

In [157]:
# Finally join the four score tables, keeping only merchants present in all of them.
Full_scores = customer_scaled
for score_table in (revenue_scaled, industry_scaled, merchant_scaled):
    Full_scores = Full_scores.join(score_table, on='merchant_abn', how='inner')

In [158]:
# Combine the four scores into one: scroring_process replaces each score column
# with (score - 1)^2 and adds their sum, rounded to 4 decimals, as `total`.
Full_scores = scroring_process(Full_scores)

In [160]:
# Attach each merchant's sector label, restricted to sectors that appear in
# industry_raw. Left join so every scored merchant is kept.
merchant_sectors = (
    merchant_info
    .join(industry_raw, on='sector')
    .select('merchant_abn', 'sector')
    .distinct()
)
Full_scores = Full_scores.join(merchant_sectors, on='merchant_abn', how='left')

In [161]:
# Persist the final scores as parquet, replacing any previous output at this path.
Full_scores.write.parquet('../data/Normalised/Final_scores', mode='overwrite')

                                                                                