In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from pyspark.sql.functions import round
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import log
from pyspark.sql.functions import regexp_replace
import scipy.stats as st
import pandas as pd
from operator import add
from functools import reduce

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/10/05 17:22:22 WARN Utils: Your hostname, Zhangs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.12.9.174 instead (on interface en0)
22/10/05 17:22:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/10/05 17:22:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/05 17:22:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Read processed Dataframe for ranking
final = spark.read.parquet('../data/curated/merchant_info.parquet')

In [3]:
# Select all the numeric features
col_list_1 = final.columns

del_col_list = ['Store_type', 'Revenue_levels', 'merchant_abn','postcode']

for i in del_col_list:
    col_list_1.remove(i)

In [4]:
# Input origginal dataframe, return new dataframe that mapping numeric and ordinal data to 0-100 
# in order to make all the features in same scale for calculating the score
def get_map_df(unsort_df, col_list_1):
    max_list = []
    min_list = []
    # Calculate max and min value in each numeric features
    for i in col_list_1:
        max_num = float(final.agg({i: "max"}).collect()[0][0])
        min_num = float(final.agg({i: "min"}).collect()[0][0])
        max_list.append(max_num)
        min_list.append(min_num)
    # Map each feature to 0-100
    for i in range(0,11):
        unsort_df = unsort_df.withColumn(col_list_1[i], (col(col_list_1[i]) - min_list[i])*100/(max_list[i]-min_list[i]))
    # Times -1 to fraud features because for fraud less is better
    unsort_df = unsort_df.withColumn('avg_prob_fraud_cus', (unsort_df.avg_prob_fraud_cus *(-1))) \
        .withColumn('prob_num_of_fraud', (unsort_df.prob_num_of_fraud*(-1)))
    unsort_df = unsort_df.select('merchant_abn','Store_type', 'Revenue_levels', 'Take_rate', \
        'count_of_bigorder', 'Avg_amount_monthly', 'Avg_count_monthly', 'Order_avg_value',	\
        'avg_prob_fraud_cus', 'prob_num_of_fraud', 'count_cus_per_mon', 'fix_cus_num', 'avg_income_percentage', 'avg_age_percentage')
    # Assign a value to the each revenue level, also in the range 0-100
    reve_list = ['a', 'b', 'c', 'd', 'e']
    reve_num = [0, 25, 50, 75, 100]
    for i in range(len(reve_list)):
        unsort_df = unsort_df.withColumn('Revenue_levels', when(col('Revenue_levels') == reve_list[i], reve_num[i]).otherwise(col('Revenue_levels')))
    return unsort_df

In [5]:
# final_1 is the dataframe after mapping
final_1 = get_map_df(final, col_list_1)

                                                                                

### Baseline Model

In [6]:
# purely add all the scores together without weighting as baseline
pure_add = final_1.withColumn('pure_add_score',reduce(add, [F.col(x) for x in final_1.columns[2:14]]))
pure_add_rank = pure_add.sort(pure_add.pure_add_score.desc())

In [7]:
pure_add_rank

merchant_abn,Store_type,Revenue_levels,Take_rate,count_of_bigorder,Avg_amount_monthly,Avg_count_monthly,Order_avg_value,avg_prob_fraud_cus,prob_num_of_fraud,count_cus_per_mon,fix_cus_num,avg_income_percentage,avg_age_percentage,pure_add_score
64203420245,tent and awning s...,50,38.56120646027543,0.0,81.5055972932642,100.0,0.0458055442830304,-10.300261844375468,-0.3405174203729081,100.0,100.0,52.02931741035037,67.15831110113018,578.6594585445548
49891706470,tent and awning s...,0,74.1488063000034,0.0,77.2791473710687,94.82354711438376,0.0458055442830304,-10.300308583547402,-0.3405598471633856,99.98803979757358,99.1009623501604,41.455465790903375,51.82701782093047,528.0279236585959
24852446429,florists supplies...,50,31.55172984355722,0.0,64.86912042945457,96.61741115924382,0.0359735461414708,-10.300004568869593,-0.3405457037252377,99.99177736083185,99.44284990714164,40.379622389316296,53.43330702446725,525.6812413875592
86578477987,"watch, clock, and...",0,75.35417776195295,0.0,70.83793742893616,90.88766210280154,0.0433571839615048,-10.300085255309648,-0.340531561461794,99.99626243674174,98.53959142326524,32.77999166601939,56.78988503398783,514.5882482208949
43186523025,florists supplies...,25,53.21403394815223,0.0,71.96055911927039,67.53381484457181,0.0629633449614383,-10.303426592884689,-0.3407721397996924,99.92524873483484,86.80567280094547,52.02931741035037,67.15831110113018,513.0457225715323
46804135891,"opticians, optica...",50,21.25530948937898,0.0,43.83968606182158,76.94102315616259,0.0290140337314649,-10.300587786087322,-0.3406447324692588,99.96262436741742,93.36062806010467,52.02931741035037,67.15831110113018,493.9346811615407
89726005175,tent and awning s...,0,70.45136476271153,0.0,70.85737834776997,72.5736692868167,0.0568520833714885,-10.299941866865254,-0.3406447324692588,99.96262436741742,90.58754009792337,40.379622389316296,53.43330702446725,487.6617717604595
64403598239,music shops - mus...,0,100.0,0.0,95.8053573651085,43.53873487339622,0.1406554087074874,-10.30317131520297,-0.3392670157068063,99.14484552651057,51.39709606618268,36.9926513640861,54.332438369538686,470.7093406426205
45629217853,"gift, card, novel...",0,60.3624790406334,0.0,52.26523058932152,74.90398765299163,0.0377857183479544,-10.29972066841835,-0.3405739917763841,99.98355472166368,92.48269458044908,35.39682507503507,59.72427375677006,464.5165364750176
80324045558,"gift, card, novel...",50,10.89938688399918,0.0,44.81109045319076,64.56764763537672,0.0375350987874832,-10.302503940015352,-0.3408854707960922,99.89235817816218,84.09589734931623,52.02931741035037,67.15831110113018,462.8481546995016


### Entropy Model

In [8]:
# All the features' name list without 2 categorical features which are useless for ranking
col_list_2 = final_1.columns
del_col_list = ['Store_type', 'merchant_abn']

for i in del_col_list:
    col_list_2.remove(i)

In [9]:
# Input the mapped dataframe and return the entropy of each feature as list
def get_entropy(df,col_list_2):
    sum_num_list = []
    for i in col_list_2:
        sum_num = float(df.agg({i: "sum"}).collect()[0][0])
        sum_num_list.append(sum_num)
    for i in range(0,len(sum_num_list)):
        df = df.withColumn(col_list_2[i], (col(col_list_2[i])/sum_num_list[i]))

    k = 1/np.log(df.count())

    for i in col_list_2:
        df = df.withColumn(i, when((col(i) != 0), (col(i) * log(col(i)) *(-k))).otherwise(0))
    
    weight_num = []
    for i in col_list_2:
        a = 1 - float(df.agg({i: "sum"}).collect()[0][0])
        weight_num.append(a)

    entropy = []
    total = sum(weight_num)
    for i in weight_num:
        a = i/total
        entropy.append(a)
    return entropy

In [10]:
# Return the the dataframe with the scores sorted of 4026 companies that weighted by entropy
def get_rank_df(unsort_df,col_list_1,col_list_2):
    unsort_map = get_map_df(unsort_df, col_list_1)
    unsort_entropy = unsort_map.withColumn('avg_prob_fraud_cus', (unsort_map.avg_prob_fraud_cus *(-1)))\
    .withColumn('prob_num_of_fraud', (unsort_map.prob_num_of_fraud*(-1)))
    entropy = get_entropy(unsort_entropy,col_list_2)
    unsorted_entropy_score = unsort_map.withColumn('entropy_score', unsort_map.Revenue_levels * entropy[0]*12+unsort_map.Take_rate * entropy[1]*12 + unsort_map.count_of_bigorder * entropy[2]*12 +\
                            unsort_map.Avg_amount_monthly *entropy[3]*12+ unsort_map.Avg_count_monthly *entropy[4]*12 +\
                            unsort_map.Order_avg_value *entropy[5]*12 + unsort_map.avg_prob_fraud_cus * entropy[6]*12+ unsort_map.prob_num_of_fraud * entropy[7]*12+\
                            unsort_map.count_cus_per_mon * entropy[8]*12 + unsort_map.fix_cus_num  * entropy[9]*12+\
                            unsort_map.avg_income_percentage * entropy[10]*12 + unsort_map.avg_age_percentage * entropy[11]*12)
    
    sorted_entropy_rank_pd = unsorted_entropy_score.toPandas()
    sorted_entropy_rank_pd['entropy_rank'] = sorted_entropy_rank_pd.entropy_score.rank(axis=0,method='min', ascending=False)
    sorted_entropy_rank_pd['entropy_rank'] = sorted_entropy_rank_pd.entropy_rank.astype(int)
    sorted_entropy = spark.createDataFrame(sorted_entropy_rank_pd)
    sorted_entropy = sorted_entropy.sort(sorted_entropy.entropy_rank)
    return sorted_entropy

In [11]:
# Sort the original dataframe use entropy weight method and the mapping method is min_max 0-100
entropy_score = get_rank_df(final,col_list_1,col_list_2)
entropy_score

merchant_abn,Store_type,Revenue_levels,Take_rate,count_of_bigorder,Avg_amount_monthly,Avg_count_monthly,Order_avg_value,avg_prob_fraud_cus,prob_num_of_fraud,count_cus_per_mon,fix_cus_num,avg_income_percentage,avg_age_percentage,entropy_score,entropy_rank
64203420245,tent and awning s...,50,38.56120646027543,0.0,81.5055972932642,100.0,0.0458055442830304,-10.300261844375468,-0.3405174203729081,100.0,100.0,52.02931741035037,67.15831110113018,647.094180410318,1
49891706470,tent and awning s...,0,74.1488063000034,0.0,77.2791473710687,94.82354711438376,0.0458055442830304,-10.300308583547402,-0.3405598471633856,99.98803979757358,99.1009623501604,41.455465790903375,51.82701782093047,643.4320662926029,2
86578477987,"watch, clock, and...",0,75.35417776195295,0.0,70.83793742893616,90.88766210280154,0.0433571839615048,-10.300085255309648,-0.340531561461794,99.99626243674174,98.53959142326524,32.77999166601939,56.78988503398783,632.505838277509,3
24852446429,florists supplies...,50,31.55172984355722,0.0,64.86912042945457,96.61741115924382,0.0359735461414708,-10.300004568869593,-0.3405457037252377,99.99177736083185,99.44284990714164,40.379622389316296,53.43330702446725,621.8020298840142,4
89726005175,tent and awning s...,0,70.45136476271153,0.0,70.85737834776997,72.5736692868167,0.0568520833714885,-10.299941866865254,-0.3406447324692588,99.96262436741742,90.58754009792337,40.379622389316296,53.43330702446725,580.2161522110965,5
45629217853,"gift, card, novel...",0,60.3624790406334,0.0,52.26523058932152,74.90398765299163,0.0377857183479544,-10.29972066841835,-0.3405739917763841,99.98355472166368,92.48269458044908,35.39682507503507,59.72427375677006,566.4009394443646,6
43186523025,florists supplies...,25,53.21403394815223,0.0,71.96055911927039,67.53381484457181,0.0629633449614383,-10.303426592884689,-0.3407721397996924,99.92524873483484,86.80567280094547,52.02931741035037,67.15831110113018,558.7461058955381,7
46804135891,"opticians, optica...",50,21.25530948937898,0.0,43.83968606182158,76.94102315616259,0.0290140337314649,-10.300587786087322,-0.3406447324692588,99.96262436741742,93.36062806010467,52.02931741035037,67.15831110113018,553.6244778202962,8
80324045558,"gift, card, novel...",50,10.89938688399918,0.0,44.81109045319076,64.56764763537672,0.0375350987874832,-10.302503940015352,-0.3408854707960922,99.89235817816218,84.09589734931623,52.02931741035037,67.15831110113018,500.0060163593861,9
68216911708,"computers, comput...",50,24.6246229233646,0.0,48.79395538285799,59.95229062958147,0.0457477089998448,-10.30112480308302,-0.3410130583049156,99.8549825455796,79.00135066689178,32.77999166601939,56.78988503398783,490.9621651375057,10


In [12]:
# Sort the original dataframe use subjective weight method and the mapping method is min_max 0-100
self_weight_score = final_1.withColumn('weight_score', 0.05*12*final_1.Revenue_levels + 0.05*12*final_1.Take_rate + 0.025*12*final_1.count_of_bigorder +\
                            0.1*12*final_1.Avg_amount_monthly + 0.1*12*final_1.Avg_count_monthly +\
                            0.025*12*final_1.Order_avg_value + 0.25*12*final_1.avg_prob_fraud_cus + 0.25*12*final_1.prob_num_of_fraud +\
                            0.02*12*final_1.count_cus_per_mon + 0.09*12*final_1.fix_cus_num +\
                            0.02*12*final_1.avg_income_percentage + 0.02*12*final_1.avg_age_percentage)

self_weight_score_pd = self_weight_score.toPandas()
self_weight_score_pd['weight_score'] = self_weight_score_pd.weight_score.rank(axis=0,method='min', ascending=False)
self_weight_score_pd['weight_score'] = self_weight_score_pd.weight_score.astype(int)
unsorted_self_weight_score = spark.createDataFrame(self_weight_score_pd)
sorted_self_weight_score= unsorted_self_weight_score.sort(unsorted_self_weight_score.weight_score)

In [13]:
sorted_self_weight_score

merchant_abn,Store_type,Revenue_levels,Take_rate,count_of_bigorder,Avg_amount_monthly,Avg_count_monthly,Order_avg_value,avg_prob_fraud_cus,prob_num_of_fraud,count_cus_per_mon,fix_cus_num,avg_income_percentage,avg_age_percentage,weight_score
64203420245,tent and awning s...,50,38.56120646027543,0.0,81.5055972932642,100.0,0.0458055442830304,-10.300261844375468,-0.3405174203729081,100.0,100.0,52.02931741035037,67.15831110113018,1
49891706470,tent and awning s...,0,74.1488063000034,0.0,77.2791473710687,94.82354711438376,0.0458055442830304,-10.300308583547402,-0.3405598471633856,99.98803979757358,99.1009623501604,41.455465790903375,51.82701782093047,2
24852446429,florists supplies...,50,31.55172984355722,0.0,64.86912042945457,96.61741115924382,0.0359735461414708,-10.300004568869593,-0.3405457037252377,99.99177736083185,99.44284990714164,40.379622389316296,53.43330702446725,3
86578477987,"watch, clock, and...",0,75.35417776195295,0.0,70.83793742893616,90.88766210280154,0.0433571839615048,-10.300085255309648,-0.340531561461794,99.99626243674174,98.53959142326524,32.77999166601939,56.78988503398783,4
43186523025,florists supplies...,25,53.21403394815223,0.0,71.96055911927039,67.53381484457181,0.0629633449614383,-10.303426592884689,-0.3407721397996924,99.92524873483484,86.80567280094547,52.02931741035037,67.15831110113018,5
89726005175,tent and awning s...,0,70.45136476271153,0.0,70.85737834776997,72.5736692868167,0.0568520833714885,-10.299941866865254,-0.3406447324692588,99.96262436741742,90.58754009792337,40.379622389316296,53.43330702446725,6
46804135891,"opticians, optica...",50,21.25530948937898,0.0,43.83968606182158,76.94102315616259,0.0290140337314649,-10.300587786087322,-0.3406447324692588,99.96262436741742,93.36062806010467,52.02931741035037,67.15831110113018,7
45629217853,"gift, card, novel...",0,60.3624790406334,0.0,52.26523058932152,74.90398765299163,0.0377857183479544,-10.29972066841835,-0.3405739917763841,99.98355472166368,92.48269458044908,35.39682507503507,59.72427375677006,8
64403598239,music shops - mus...,0,100.0,0.0,95.8053573651085,43.53873487339622,0.1406554087074874,-10.30317131520297,-0.3392670157068063,99.14484552651057,51.39709606618268,36.9926513640861,54.332438369538686,9
80324045558,"gift, card, novel...",50,10.89938688399918,0.0,44.81109045319076,64.56764763537672,0.0375350987874832,-10.302503940015352,-0.3408854707960922,99.89235817816218,84.09589734931623,52.02931741035037,67.15831110113018,10


### Z-score Model

In [14]:
# Calculate the mean and standard deviation for each feature as list
mean_num_list = []
sd_num_list = []

for i in col_list_1:
    mean_num = float(final.agg({i: "mean"}).collect()[0][0])
    sd_num = float(final.agg({i: "stddev_pop"}).collect()[0][0])
    mean_num_list.append(mean_num)
    sd_num_list.append(sd_num)


In [15]:
# Use z-score to map the data
final_3 = final

for i in range(0,len(mean_num_list)):
    final_3 = final_3.withColumn(col_list_1[i], (col(col_list_1[i]) - mean_num_list[i])/(sd_num_list[i]))


final_3 = final_3.withColumn('avg_prob_fraud_cus', (final_3.avg_prob_fraud_cus *(-1)))\
    .withColumn('prob_num_of_fraud', (final_3.prob_num_of_fraud*(-1)))
    
final_3 = final_3.select('merchant_abn','Store_type', 'Revenue_levels', 'Take_rate', 'count_of_bigorder', 'Avg_amount_monthly', 'Avg_count_monthly', 'Order_avg_value',	'avg_prob_fraud_cus', 'prob_num_of_fraud', 'count_cus_per_mon', 'fix_cus_num', 'avg_income_percentage', 'avg_age_percentage')

In [16]:
# Use pure add method to calculate the scores and sort
z_score = final_3.withColumn('z_score',reduce(add, [F.col(x) for x in final_1.columns[3:14]]))
z_score = z_score.sort(z_score.z_score.desc())

In [17]:
z_score

merchant_abn,Store_type,Revenue_levels,Take_rate,count_of_bigorder,Avg_amount_monthly,Avg_count_monthly,Order_avg_value,avg_prob_fraud_cus,prob_num_of_fraud,count_cus_per_mon,fix_cus_num,avg_income_percentage,avg_age_percentage,z_score
49891706470,tent and awning s...,a,9.39370860528608,-0.0918328294197233,7.707141404625247,19.32638638327331,-0.3548557186181547,0.1067913176631303,0.0225309593645039,6.133349959766747,19.66814807345557,-0.164200851207831,-0.8329184394255599,60.91424886476331
64203420245,tent and awning s...,c,4.693955815259257,-0.0918328294197233,8.151960181512433,20.39427314760475,-0.3548557186181547,0.1068076642179806,0.0225485045546128,6.134144193683256,19.847216652283382,0.513504945852304,0.6260841713844871,60.043806728314586
86578477987,"watch, clock, and...",a,9.552891801591215,-0.0918328294197233,7.029227036567473,18.51442506484265,-0.355326821831159,0.1068694244659899,0.0225426566436571,6.133895995584346,19.556335298882324,-0.7202346710758806,-0.3606271533208333,59.38816580293007
24852446429,florists supplies...,c,3.768273627238426,-0.0918328294197233,6.401030340510647,19.69645517610229,-0.356747550418408,0.1068976437361119,0.0225368082469743,6.133598157865656,19.736244575263328,-0.233154467754097,-0.6800559183815671,54.50324556298964
89726005175,tent and awning s...,a,8.905418828534971,-0.0918328294197233,7.0312731239212605,14.73630306462222,-0.3527301899484583,0.1069195731308939,0.0224958558647522,6.131662212694162,17.972461108686954,-0.233154467754097,-0.6800559183815671,53.54876036195136
19492220327,"jewelry, watch, c...",b,6.035585569855741,36.39025843995541,5.86026991377816,-0.1793531642299976,2.525384007785228,-1.015738518675706,0.1633662003684352,-0.3145884517986104,-0.0705897777106431,0.7003628208369278,1.900357747692172,51.99531478785711
43186523025,florists supplies...,b,6.629030593498561,-0.0918328294197233,7.147378964638136,13.696596151035658,-0.3515542866530067,0.1057008252871026,0.0224431677917549,6.129180231705069,17.219196101035653,0.513504945852304,0.6260841713844871,51.645728036156
45629217853,"gift, card, novel...",a,7.573065170683992,-0.0918328294197233,5.074515897912812,15.21704078114457,-0.3563988598513025,0.1069969350608445,0.0225251099961855,6.133052122048055,18.349934310065787,-0.5525149758260518,-0.0813760340952891,51.39500762771988
46804135891,"opticians, optica...",c,2.408512628891536,-0.0918328294197233,4.187757393653953,15.637275119286436,-0.358086670575058,0.1066936693855875,0.0224958558647522,6.131662212694162,18.524799401127694,0.513504945852304,0.6260841713844871,47.708865898146136
64403598239,music shops - mus...,a,12.807656729470995,-0.0918328294197233,9.656958916155856,8.746482844393956,-0.336605105956887,0.1057901060863474,0.0230655973474646,6.077356468652803,10.1665843273897,-0.450234296456408,-0.5944900788959575,46.110732678768144


### rank inside each column and add up the rank

In [18]:
# Rank each feature individually and summarize their rank as final rank
final_rank_coldf = final_1.drop("Take_rate", "Store_type", "Revenue_levels")
for col in final_rank_coldf.columns:
    if col == "merchant_abn":
        continue
    elif col in ["avg_prob_fraud_cus", "num_of_fraud"]: # columns that have number smaller = higher rank
        final_rank_coldf = final_rank_coldf.withColumn("rank_"+col, F.dense_rank().over(Window.orderBy(col))).drop(col)
    else: # columns that have number bigger = higher rank
        final_rank_coldf = final_rank_coldf.withColumn("rank_"+col, F.dense_rank().over(Window.orderBy(F.desc(col)))).drop(col)

numeric_col = final_rank_coldf.columns
numeric_col.remove("merchant_abn")
expression = '+'.join(numeric_col)

final_rank_coldf = final_rank_coldf.withColumn("sum_of_ranks", F.expr(expression))\
    .withColumn("rank", F.dense_rank().over(Window.orderBy("sum_of_ranks")))
sum_rank = final_rank_coldf.orderBy("rank")

### Study-score Model

In [19]:

def features_score(table_name, col_name):
    # sort the columns to give the largest value rank 1
    sorted_col = table_name[['merchant_abn', col_name]].sort_values(by=col_name, ascending=False)
    # a rank list, from 1 to the length of the table
    rank_list = [i for i in range(1, len(table_name)+1)]
    # as the target column has been sorted, add the rank list directly will be in the right position
    sorted_col['rank'] = rank_list
    # get the percentage, which is above % of merchant. Add 1 to the denominator, to avoid the last one get 0, 
    # z score will become -inifity
    sorted_col['percentage above the merchants'] = 1-sorted_col['rank']/(len(table_name)+1)
    sorted_col['z score'] = st.norm.ppf(sorted_col['percentage above the merchants'])
    mean = 30
    # adjust the sd from 7 to 5.75, to let the rank 1 merchant get a 50 study score, if sd = 7, 
    # the first one will become 54
    sd = 5.75
    sorted_col[col_name+'_raw_score'] = sorted_col['z score']*sd+mean
    return sorted_col[['merchant_abn', col_name+'_raw_score']]

In [20]:
#
def final_study_score(table_name):
    table_name = table_name.toPandas()
    # take the features that are numeric
    numeric_features_list = ['Take_rate', 'count_of_bigorder', 'Avg_amount_monthly', 'Avg_count_monthly', 'Order_avg_value', 
    'avg_prob_fraud_cus', 'prob_num_of_fraud', 'count_cus_per_mon', 'fix_cus_num', 'avg_income_percentage', 'avg_age_percentage']
    # for these two features, if the value is smaller, the merchant is better
    table_name[['prob_num_of_fraud']] = -abs(table_name[['prob_num_of_fraud']])
    table_name[['avg_prob_fraud_cus']] = -abs(table_name[['avg_prob_fraud_cus']])
    raw_score_ori = table_name[['merchant_abn', 'Store_type']]
    # change the level of revenue Level from character to score
    reve = table_name[['merchant_abn', 'Revenue_levels']]
    reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])
    reve = reve[['merchant_abn', 'Revenue_levels_raw_score']]
     # take the identification features
    raw_score_ori = pd.merge(raw_score_ori, reve, how='inner', on = 'merchant_abn')
    # merge the tables, take each feature's score
    for feature in numeric_features_list:
        feature_raw_score = features_score(table_name, feature)
        raw_score_ori = pd.merge(raw_score_ori, feature_raw_score, how='inner', on = 'merchant_abn')
    # sum the scores from numeric features
    raw_score_ori['raw_score_sum']= raw_score_ori.iloc[:, 2:].sum(axis=1)
    # add the rank column
    raw_score_ori['study_rank'] = raw_score_ori.raw_score_sum.rank(axis=0,method='min', ascending=False)
    raw_score_ori['study_rank'] = raw_score_ori.study_rank.astype(int)
    raw_score_spark = spark.createDataFrame(raw_score_ori)
    raw_score_spark= raw_score_spark.sort(raw_score_spark.study_rank)
    return raw_score_spark

In [21]:

study_score = final_study_score(final)
study_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])


merchant_abn,Store_type,Revenue_levels_raw_score,Take_rate_raw_score,count_of_bigorder_raw_score,Avg_amount_monthly_raw_score,Avg_count_monthly_raw_score,Order_avg_value_raw_score,avg_prob_fraud_cus_raw_score,prob_num_of_fraud_raw_score,count_cus_per_mon_raw_score,fix_cus_num_raw_score,avg_income_percentage_raw_score,avg_age_percentage_raw_score,raw_score_sum,study_rank
64203420245,tent and awning s...,30,42.99334815524945,33.83685377345363,47.07638029706966,50.02470848195272,20.68652596795065,30.06621516662849,27.98185605139667,50.02470848195272,50.02470848195272,33.598640744797706,35.032819778793446,451.3467653811978,1
60956456424,"gift, card, novel...",20,45.51973178482304,35.66741712255189,48.26565278725761,45.51973178482304,25.694001017784597,30.252408936336707,27.99327158762133,45.51973178482304,45.51973178482304,33.57255721107289,35.02757218924598,438.5518079911631,2
90543168331,"furniture, home f...",30,42.45568024760821,32.788050822597725,44.82438936979785,42.69012159086522,29.330987912799483,30.708666003855644,29.49649102769562,42.69012159086522,42.64956465459921,32.037187558939465,31.103740903412337,430.775001683036,3
80324045558,"gift, card, novel...",30,39.76688910928258,31.56344266989021,42.073053582736,46.3469541128586,19.488790262431817,29.994631320583583,27.943746458712525,46.3469541128586,46.3469541128586,33.52492711997095,35.06967099301057,428.466013855194,4
57223200264,art dealers and g...,50,30.61138989463629,34.07263853056922,39.93775721255277,30.708666003855644,38.09995995862406,33.637905590747216,43.33769714244496,30.697848035739447,34.81590129578033,33.434660476644765,27.01299669984943,426.3674208414442,5
46804135891,"opticians, optica...",30,41.28633027121891,25.769432206316324,42.00892801963713,47.3959147428638,17.759718162996375,30.05547720660715,27.95900104273217,46.80221176639948,47.3959147428638,33.5942883579639,35.03807156344337,425.06528808304233,6
68216911708,"computers, comput...",30,41.59722781762685,38.29778447279042,42.418553239399245,46.15290073650031,20.673212685035853,30.034001829524478,27.92847738848433,46.15290073650031,46.15290073650031,26.221082865151374,29.240816331517657,424.8698588390312,7
70009327857,lawn and garden s...,30,41.18966209763083,38.603371546018266,42.493333928661855,36.294462616606914,35.344738324311386,32.13658093438183,29.07439480654287,36.28144637888722,36.99945476408561,38.35908795006318,27.502357453409218,424.27889080059913,8
24852446429,florists supplies...,30,42.418553239399245,29.744008445267443,43.61662115002113,48.93140892503064,19.03906479264046,30.07337393262324,27.974241275167422,48.26565278725761,48.93140892503064,29.54677256005885,25.148426438138124,423.6895324706348,9
70033549200,tent and awning s...,20,42.310175520510526,39.63323842081445,41.737432392556144,37.775175810793584,32.98700330015057,32.04099895726783,28.896259096587663,37.78411456706677,37.269943020551885,30.690637431399285,41.82502644207625,422.9500049597749,10


# Compare two models

In [22]:
# The function for combining the rankings of the entropy model and study score model
def combine_two_models(entropy_score, study_score):
    study_score_rank = study_score.select(study_score.merchant_abn, study_score.study_rank)
    compare_rank = entropy_score.join(study_score_rank, entropy_score.merchant_abn == study_score_rank.merchant_abn).drop(study_score_rank.merchant_abn)
    compare_rank_final = compare_rank.withColumn('final_rank_score', (compare_rank.entropy_rank+compare_rank.study_rank)/2)
    compare_rank_final = compare_rank_final.sort(compare_rank_final.final_rank_score)

    compare_rank_final_pd = compare_rank_final.toPandas()
    compare_rank_final_pd['final_rank'] = compare_rank_final_pd.final_rank_score.rank(axis=0,method='min')
    compare_rank_final_pd['final_rank'] = compare_rank_final_pd.final_rank.astype(int)
    unsorted_compare_rank_final = spark.createDataFrame(compare_rank_final_pd)
    sorted_compare_rank_final= unsorted_compare_rank_final.sort(unsorted_compare_rank_final.final_rank)
    return sorted_compare_rank_final


In [23]:
# Combine the rankings of the entropy model and study score model and select the merchants with high rankings for both as the final top100
compare_rank_final = combine_two_models(entropy_score, study_score)
top_100 = compare_rank_final.limit(100)
top_100.select(top_100.merchant_abn, top_100.entropy_rank, top_100.study_rank, top_100.final_rank)

                                                                                

merchant_abn,entropy_rank,study_rank,final_rank
64203420245,1,1,1
80324045558,9,4,2
24852446429,4,9,2
46804135891,8,6,4
60956456424,15,2,5
68216911708,10,7,5
43186523025,7,20,7
64403598239,11,18,8
90543168331,33,3,9
19492220327,13,24,10


# Five Segments

In [24]:
# Check all the store types
final.select('Store_type').distinct().toPandas().values.tolist()

[['opticians, optical goods, and eyeglasses'],
 ['watch, clock, and jewelry repair shops'],
 ['computer programming , data processing, and integrated systems design services'],
 ['digital goods: books, movies, music'],
 ['books, periodicals, and newspapers'],
 ['florists supplies, nursery stock, and flowers'],
 ['art dealers and galleries'],
 ['antique shops - sales, repairs, and restoration services'],
 ['gift, card, novelty, and souvenir shops'],
 ['equipment, tool, furniture, and appliance rent al and leasing'],
 ['cable, satellite, and other pay television and radio services'],
 ['tent and awning shops'],
 ['artist supply and craft shops'],
 ['stationery, office supplies and printing and writing paper'],
 ['furniture, home furnishings and equipment shops, and manufacturers, except appliances'],
 ['telecom'],
 ['jewelry, watch, clock, and silverware shops'],
 ['shoe shops'],
 ['hobby, toy and game shops'],
 ['computers, computer peripheral equipment, and software'],
 ['music shops -

### Manually divide store types into 5 categories

In [25]:
art = final.filter((final.Store_type == 'antique shops - sales, repairs, and restoration services') 
                    | (final.Store_type == 'art dealers and galleries')
                    | (final.Store_type == 'watch, clock, and jewelry repair shops')
                    | (final.Store_type == 'artist supply and craft shops')
                    | (final.Store_type == 'jewelry, watch, clock, and silverware shops')
                    | (final.Store_type == 'music shops - musical instruments, pianos, and sheet music'))


In [26]:
sports = final.filter((final.Store_type == 'bicycle shops - sales and service') 
                    | (final.Store_type == 'motor vehicle supplies and new parts')
                    | (final.Store_type == 'tent and awning shops'))

In [27]:

technology = final.filter((final.Store_type == 'cable, satellite, and other pay television and radio services') 
                    | (final.Store_type == 'computer programming , data processing, and integrated systems design services')
                    | (final.Store_type == 'computers, computer peripheral equipment, and software')
                    | (final.Store_type == 'digital goods: books, movies, music')
                    | (final.Store_type == 'equipment, tool, furniture, and appliance rent al and leasing')
                    | (final.Store_type == 'telecom'))

In [28]:

home = final.filter((final.Store_type == 'books, periodicals, and newspapers') 
                    | (final.Store_type == 'florists supplies, nursery stock, and flowers')
                    | (final.Store_type == 'furniture, home furnishings and equipment shops, and manufacturers, except appliances')
                    | (final.Store_type == 'gift, card, novelty, and souvenir shops')
                    | (final.Store_type == 'hobby, toy and game shops')
                    | (final.Store_type == 'lawn and garden supply outlets, including nurseries')
                    | (final.Store_type == 'shoe shops')
                    | (final.Store_type == 'stationery, office supplies and printing and writing paper'))

In [29]:
health = final.filter((final.Store_type == 'health and beauty spas') 
                    | (final.Store_type == 'opticians, optical goods, and eyeglasses'))

### Find top10 merchants in each segment


In [30]:
from pyspark.sql.functions import col
art_entropy_df = get_rank_df(art,col_list_1,col_list_2)
art_study_df =  final_study_score(art)
art_combine = combine_two_models(art_entropy_df, art_study_df)
art_combine.select(art_combine.merchant_abn, art_combine.entropy_rank, art_combine.study_rank, art_combine.final_rank).limit(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])


merchant_abn,entropy_rank,study_rank,final_rank
64403598239,2,8,1
19492220327,4,7,2
68559320474,8,4,3
28057731482,10,2,3
86578477987,1,13,5
90918180829,6,11,6
29616684420,15,3,7
31334588839,9,9,7
63123845164,13,10,9
23338656015,11,12,9


In [31]:
sports_entropy_df = get_rank_df(sports,col_list_1,col_list_2)
sports_study_df =  final_study_score(sports)
sports_combine = combine_two_models(sports_entropy_df, sports_study_df)
sports_combine.select(sports_combine.merchant_abn, sports_combine.entropy_rank, sports_combine.study_rank, sports_combine.final_rank).limit(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])


merchant_abn,entropy_rank,study_rank,final_rank
64203420245,1,1,1
89726005175,3,8,2
49891706470,2,13,3
90568944804,12,4,4
22033359776,8,9,5
75454398468,19,3,6
38700038932,9,16,7
96680767841,4,21,7
31385641294,22,6,9
70033549200,28,2,10


In [32]:
technology_entropy_df = get_rank_df(technology,col_list_1,col_list_2)
technology_study_df =  final_study_score(technology)
technology_combine = combine_two_models(technology_entropy_df, technology_study_df)
technology_combine.select(technology_combine.merchant_abn, technology_combine.entropy_rank, technology_combine.study_rank, technology_combine.final_rank).limit(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])


merchant_abn,entropy_rank,study_rank,final_rank
68216911708,1,1,1
72472909171,3,3,2
77590625261,9,2,3
80518954462,7,5,4
49505931725,4,9,5
21439773999,2,13,6
82368304209,15,6,7
35909341340,5,18,8
45433476494,6,17,8
84703983173,12,14,10


In [33]:
home_entropy_df = get_rank_df(home,col_list_1,col_list_2)
home_study_df =  final_study_score(home)
home_combine = combine_two_models(home_entropy_df, home_study_df)
home_combine.select(home_combine.merchant_abn, home_combine.entropy_rank, home_combine.study_rank, home_combine.final_rank).limit(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])


merchant_abn,entropy_rank,study_rank,final_rank
60956456424,5,1,1
24852446429,1,7,2
43186523025,3,6,3
80324045558,4,5,3
90543168331,13,2,5
79417999332,6,12,6
76767266140,9,13,7
41944909975,16,10,8
38090089066,10,18,9
81219314324,12,16,9


In [34]:
health_entropy_df = get_rank_df(health,col_list_1,col_list_2)
health_study_df =  final_study_score(health)
health_combine = combine_two_models(health_entropy_df, health_study_df)
health_combine.select(health_combine.merchant_abn, health_combine.entropy_rank, health_combine.study_rank, health_combine.final_rank).limit(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reve[['Revenue_levels_raw_score']] = reve[['Revenue_levels']].replace(to_replace = ['a', 'b', 'c', 'd', 'e'], value = [10, 20, 30, 40, 50])


merchant_abn,entropy_rank,study_rank,final_rank
46804135891,1,2,1
18158387243,3,1,2
48534649627,2,8,3
11237511112,6,4,3
71674475255,9,3,5
95574756848,8,14,6
79198689842,18,5,7
11566786699,4,20,8
88547577701,14,11,9
88699453206,7,19,10


In [37]:
pure_add_rank.write.parquet('../data/curated/baseline.parquet')
sorted_self_weight_score.write.parquet('../data/curated/self_weight.parquet')
entropy_score.write.parquet('../data/curated/entropy_weight.parquet')
sum_rank.write.parquet('../data/curated/sum_rank.parquet')
study_score.write.parquet('../data/curated/study_score_rank.parquet')
top_100.write.parquet('../data/curated/top_100.parquet')
art_combine.write.parquet('../data/curated/art_combine.parquet')
sports_combine.write.parquet('../data/curated/sports_combine.parquet')
technology_combine.write.parquet('../data/curated/technology_combine.parquet')
home_combine.write.parquet('../data/curated/home_combine.parquet')
health_combine.write.parquet('../data/curated/health_combine.parquet')

22/10/05 17:24:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/10/05 17:24:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/10/05 17:24:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/10/05 17:24:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/10/05 17:24:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/10/05 17:24:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/10/05 1

22/10/05 19:57:56 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 791226 ms exceeds timeout 120000 ms
22/10/05 19:57:56 WARN SparkContext: Killing executors is not supported by current scheduler.
