In [43]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import feature as H
# First lets reed the datasets
spark = (
    SparkSession.builder.appName("Data_Explorer")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [44]:
# Read all our tables 
customer_scaled = spark.read.parquet('../data/Normalised/customer_scaled/')
industry_scaled = spark.read.parquet('../data/Normalised/industry_scaled/')
merchant_scaled = spark.read.parquet('../data/Normalised/merchant_scaled/')
revenue_scaled = spark.read.parquet('../data/Normalised/revenue_scaled/')


In [45]:
from operator import add
from functools import reduce
# For each table we need to perform the following steps
def scroring_process(dataset):
    # first subtract 1 from each cell then square
    for col in dataset.columns[1:]:
        dataset = dataset.withColumn(col, F.pow((F.col(col) - 1),2))
    # sum the columns 
    new_df = dataset.withColumn('total',F.round(reduce(add, [F.col(x) for x in dataset.columns[1:]]), 4))
    return new_df


In [46]:
# Define our scaliong function
def feature_standardisation(dataset): 
    values = dataset.select(F.max('total').alias('high'), F.min('total').alias('low'))
    dataset = dataset.withColumn('total', F.round((F.col('total') - values.select('low').head()[0]) / (values.select('high').head()[0] - values.select('low').head()[0]), 4))
    return dataset.select('merchant_abn', 'total')

In [47]:
customer_scaled = scroring_process(customer_scaled)
industry_scaled = scroring_process(industry_scaled)
merchant_scaled = scroring_process(merchant_scaled)
revenue_scaled = scroring_process(revenue_scaled)

In [48]:
customer_scaled

merchant_abn,postcode_entropy,Customer_Growth_Avg,Total_Business_Days,total
34440496342,0.44689225,0.37002889,0.99860049,1.8155
35344855546,0.7659750399999999,0.29430625,0.98942809,2.0497
37935728745,0.10903204,0.08111104,1.0,1.1901
38700038932,0.94187025,0.33350625,0.95140516,2.2268
38986645707,0.21492496,0.2014214399999999,0.99980001,1.4161
41956465747,0.48608784,0.3217158399999999,0.998001,1.8058
35079621278,0.17007376,0.34304449,0.99980001,1.5129
35769589414,0.61183684,0.2881542399999999,0.996004,1.896
37470904997,0.25371369,0.21687649,0.99960004,1.4702
39759375662,0.3143844899999999,0.3377934399999999,0.99940009,1.6516


In [49]:
# rescale them 
customer_scaled = feature_standardisation(customer_scaled)
customer_scaled = customer_scaled.withColumnRenamed('total','Customer_score')
revenue_scaled = scroring_process(revenue_scaled)
revenue_scaled = revenue_scaled.withColumnRenamed('total','Revenue_score')
industry_scaled = scroring_process(industry_scaled)
industry_scaled = industry_scaled.withColumnRenamed('total','Industry_score')
merchant_scaled = scroring_process(merchant_scaled)
merchant_scaled = merchant_scaled.withColumnRenamed('total','Merchant_score')

In [50]:
# Finally join 
Full_scores = customer_scaled.join(revenue_scaled, on='merchant_abn', how='inner').join(industry_scaled, on='merchant_abn', how='inner').join(merchant_scaled, on='merchant_abn', how='inner')

In [52]:
# now scale and done 
Full_scores = scroring_process(Full_scores)

In [53]:
Full_scores

22/10/09 23:33:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

merchant_abn,Customer_score,Total_Revenue,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Growth_4,Growth_5,Growth_6,Growth_7,Growth_8,Growth_9,Revenue_Growth_Avg,Revenue_score,Portion_of_Total_Revenue,Portion_of_Total_Transactions,Average_Weighted_Revenue,survival_rate,Industry_score,postcode_entropy,Customer_Growth_Avg,Total_Business_Days,Merchant_score,total
12516851436,0.16516096,0.9998966955801654,2.354063362947056...,7.135920463268369...,9.387077750813639E19,1.592382109657036...,1.415270567888266E22,2.962326981006539...,1.901467352960560...,0.0656042210875184,0.2556298441417374,0.5530011928036459,0.6780183058788245,0.9441091194282173,0.7432275235261395,0.1239984970074982,1.47815716765936E25,0.0,0.0,1.0,0.0,4.0,0.4710940520533226,0.2800130063524531,0.999996082747356,0.12292036,1.501718135409322E25
15613631617,0.011880999999999997,0.9999418382034352,9.347192815418478E14,3.361604889727114...,1.420193256041672...,1.632989216176492E20,1.066136900089155...,4.290391388278361E21,1.303113571912672...,0.1693439578595016,0.4005926563236675,0.62766290945822,0.740292397747083,0.8352114624021277,0.8861936814404523,0.2175942873288384,1.409015304454287...,0.0,0.0,1.0,0.0,4.0,0.911276754007776,0.3109552278319725,0.9996944070190684,0.5168172100000001,1.427580806411168...
19839532017,0.04297329000000002,0.9994921523016989,1.233406410499186...,3.209939321587516...,1.598465849930851...,1.439608019591654E24,1.083210397046373...,5.23912238732868E25,1.251250843844476...,0.19140625,0.374445723292035,0.6749908044414974,0.7110167690585989,0.7993194223731048,0.9250301701483772,0.2253139252095186,1.461012618309702...,0.1435354318884218,0.4829679798505771,0.0129895710283159,0.8814532708812811,1.1994630400000004,0.7681706275876734,0.3303015722616091,0.9999501255437558,0.28270489,1.480007727220319...
34440496342,0.12517444,0.999994884101724,4.421031411025119E10,2.777587631879158...,2.102476027943141...,6.930203298496181E15,4.772869743706716E16,1.459125835520842...,6.304650020759623...,0.1416840281436957,0.3259969484249643,0.8738541728835035,0.7321417712610064,0.8861965514218967,0.8228900661927842,0.2388205412392999,6.587963277974416...,0.0,0.0,1.0,0.0,4.0,0.4817356870008988,0.363773517686948,0.999996082747356,0.1352768399999999,6.671279956222235...
35344855546,0.025217440000000018,0.9997041549631408,4.131094972148423E17,1.278355207433769...,2.726859062275868...,5.786200754269229E22,3.624111889531733...,1.944732533470578E24,8.987403344344514E24,0.1783746425538919,0.5113427486580417,0.5118348389283374,0.748112186579843,0.7789220694500912,0.8082675191243264,0.1924579698692848,6.708516696219247E26,0.0,0.0,1.0,0.0,4.0,0.8934641351750251,0.2520003165492412,0.999776481929382,0.42876304,6.822069338039282E26
37935728745,0.7649251600000001,0.9990254682303956,0.0,0.0,0.0,0.0,7.453447996497709E15,1.182079318012817...,1.834087317424041...,1.0,1.0,1.0,4.444409355224051e-08,3.268008791003169...,0.0861980059586094,0.0205063498755642,1.032571255799820...,0.0,0.0,1.0,0.0,4.0,0.0425085818416027,0.0242247680997814,1.0,0.4546804899999999,1.216098195474771...
38700038932,1.276899999999994...,0.0003043338021093782,9.879952219269599E34,4.561942969350385E37,1.589247853941863...,1.889970086499645E40,1.252427977239960...,6.606170420562422E41,2.606306585032650...,0.1481700512796872,0.4274649998213292,0.6193994207803898,0.7362937299586353,0.781995187252582,0.838392451455762,0.1910127418816963,2.184810796555127E44,0.0,0.0,1.0,0.0,4.0,0.993253282429309,0.3088981680678108,0.9952826595368764,0.912025,2.218937807472737...
41956465747,0.13097161,0.9998908878256968,9.752959721979141E14,3.416582495991781...,2.181445149981902...,9.412281567655927E20,8.739391954533618E21,4.263234806966908...,1.650426294502938...,0.0609930706517109,0.3412498035739627,0.8220611122288946,0.6693292178295999,0.7979427234164861,0.8423005331023574,0.1659913320762603,1.437109829501490...,0.0,0.0,1.0,0.0,4.0,0.5415404087099086,0.2915250509731389,0.999992008013968,0.1395022499999999,1.458867545471896...
48214071373,0.07851204,0.9991274271188236,4.0800995574844E19,5.46858389521511E21,3.432585628200103E23,2.890854901462495E24,2.545985800138905...,1.436593185322222...,4.333289843623377...,0.2527755590107061,0.3422698210155928,0.6884055059212031,0.6793254684864383,0.7687738511498559,0.8871279483632802,0.215055485332995,4.052314243878169E28,0.0,0.0,1.0,0.0,4.0,0.7066476999288337,0.2700200852416167,0.9999711750536584,0.20866624,4.112883022252682E28
52763133264,0.28793956,0.9999864976158668,9.897718770142407E11,6.57207328476689E13,9.026267443987915E14,5.402949230491965...,5.808343277644170...,1.897320105760942...,4.544363045677645E19,0.3477919559617758,0.5995403004105863,0.3537563587476802,0.278518399481231,0.8739505076098816,0.9250071209008116,0.176918420507743,3.964481203164289...,0.0,0.0,1.0,0.0,4.0,0.34857216,0.1743017076013763,0.999998720512358,0.1206172899999999,4.034761376785874E21
