In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window
import numpy as np
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

In [24]:
merchant_study_score = spark.read.parquet('../data/curated/merchant_info.parquet')
merchant_pd = merchant_study_score.toPandas()

Unnamed: 0,Store_type,Revenue_levels,Take_rate,count_of_bigorder,Avg_amount_monthly,Avg_count_monthly,Order_avg_value,postcode,avg_prob_fraud_cus,num_of_fraud,count_cus_per_mon,merchant_abn,fix_cus_num,avg_income_percentage,avg_age_percentage
0,"furniture, home furnishings and equipment shop...",e,0.18,0,28650.29,159.94,179.13,5582,13.103379,8.0,150.11,10023283211,0,0.146046,0.488653
1,"cable, satellite, and other pay television and...",b,4.22,0,3741.36,143.5,26.07,6438,13.235562,12.0,135.56,10142254217,0,0.304018,0.73751


In [81]:
from pyspark.sql.functions import col
import scipy.stats as st
import pandas as pd

def convert_to_score(table_name, col_name):
    sorted_col = table_name[['merchant_abn', col_name]].sort_values(by=col_name, ascending=False)
    num_of_merchant = len(sorted_col)
    rank_list = [i for i in range(1, num_of_merchant+1)]
    sorted_col['rank'] = rank_list
    sorted_col['percentage above the merchants'] = 1-sorted_col['rank']/(num_of_merchant+1)
    sorted_col['z score'] = st.norm.ppf(sorted_col['percentage above the merchants'])
    sorted_col[col_name+'_raw_score'] = sorted_col['z score']*5.75+30
    return sorted_col[['merchant_abn', col_name+'_raw_score']]

numeric_features_list = ['count_of_bigorder', 'Avg_amount_monthly', 'Avg_count_monthly', 'Order_avg_value', 
    'avg_prob_fraud_cus', 'num_of_fraud', 'count_cus_per_mon', 'fix_cus_num', 'avg_income_percentage', 
    'avg_age_percentage']

merchant_pd[['num_of_fraud']] = -abs(merchant_pd[['num_of_fraud']])
merchant_pd[['avg_prob_fraud_cus']] = -abs(merchant_pd[['avg_prob_fraud_cus']])

raw_score_df = merchant_pd[['merchant_abn']]
for feature in numeric_features_list:
    feature_raw_score = convert_to_score(merchant_pd, feature)
    raw_score_df = pd.merge(raw_score_df, feature_raw_score, how='inner', on = 'merchant_abn')


#convert_to_score(merchant_pd, 'count_of_bigorder')
#raw_score_df[['raw_score_sum']] = sum[raw_score_df[1:]]
raw_score_df['raw_score_sum']= raw_score_df.iloc[:, 1:-1].sum(axis=1)
raw_score_df[['merchant_abn', 'raw_score_sum']].sort_values(by='raw_score_sum', ascending=False).head(20)


Unnamed: 0,merchant_abn,raw_score_sum
2439,64203420245,328.549793
2293,60956456424,324.748775
2698,70009327857,320.583216
2607,67978471888,319.177144
2149,57223200264,318.743034
3997,99291944648,318.519352
2446,64403598239,317.543847
3613,90543168331,316.09437
2386,63123845164,315.877854
2131,56779111060,315.299173


In [76]:
def convert_to_score(table_name, col_name):
    sorted_col = table_name[['merchant_abn', col_name]].sort_values(by=col_name, ascending=False)
    num_of_merchant = len(sorted_col)
    rank_list = [i for i in range(1, num_of_merchant+1)]
    sorted_col['rank'] = rank_list
    sorted_col['percentage above the merchants'] = 1-sorted_col['rank']/(num_of_merchant+1)
    sorted_col['z score'] = st.norm.ppf(sorted_col['percentage above the merchants'])
    sorted_col[col_name+'_raw_score'] = sorted_col['z score']*7+30
    return sorted_col[['merchant_abn', col_name+'_raw_score']]

convert_to_score(merchant_pd, 'Avg_amount_monthly')

Unnamed: 0,merchant_abn,Avg_amount_monthly_final_score
833,28057731482,54.377906
2446,64403598239,53.046933
2293,60956456424,52.236447
2607,67978471888,51.645608
1802,49322182190,51.177635
...,...,...
1905,51561881468,8.822365
1926,52351039440,8.354392
1289,37802138328,7.763553
4012,99785979138,6.953067
