In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from pyspark.sql.functions import round
from pyspark.sql import Window

In [2]:
# Build (or reuse) the Spark session that runs every job in this notebook.
spark_settings = {
    # Eager evaluation for the REPL/notebook.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    # Session time zone is pinned to UTC.
    "spark.sql.session.timeZone": "Etc/UTC",
}
session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

22/10/15 15:19:23 WARN Utils: Your hostname, hexiangyideMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.189.29 instead (on interface en0)
22/10/15 15:19:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/15 15:19:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the curated tables and the external postcode <-> SA2 correspondence.
transaction = spark.read.parquet('../data/curated/final_transaction.parquet')
merchants = spark.read.parquet('../data/curated/final_merchant.parquet')
consumer = spark.read.parquet('../data/curated/final_consumer.parquet')
census = spark.read.parquet('../data/curated/final_census.parquet')
post_sa2_2021 = spark.read.parquet("../data/external/postcode_sa2_conrrespondences.parquet")

# The predictions are stored as numbered part files:
# predicted_dollar_value0.parquet ... predicted_dollar_value44.parquet.
n_prediction_files = 45
prediction = spark.read.parquet('../data/curated/prediction/predicted_dollar_value0.parquet')
# Append the remaining parts one at a time. The loop index has its own name so
# it is not rebound to a string and does not clash with other uses of `counts`
# in the notebook; the f-string formats the integer directly.
for part_index in range(1, n_prediction_files):
    prediction = prediction.union(
        spark.read.parquet(f"../data/curated/prediction/predicted_dollar_value{part_index}.parquet")
    )
# Display the combined predictions as the cell output.
prediction


                                                                                

merchant_abn,predict_dollar_value
11093673306,69.63705444335938
11121775571,42.397274017333984
11134994590,6068.73291015625
11137507330,101.62303161621094
11149063370,8951.830078125
11173270682,179.31930541992188
11199348915,493.9423522949219
11208869091,747.7931518554688
11215815177,718.6666259765625
11237511112,184.6631317138672


In [4]:
# Aggregate the transactions per merchant:
#   Amount            - sum of `dollar_value`
#   Count             - number of non-null `dollar_value` entries
#   count_of_bigorder - sum of the `whether_bigorder` column
grouped_transaction = (
    transaction
    .groupBy('merchant_abn')
    .agg(
        F.sum('dollar_value').alias('Amount'),
        F.count('dollar_value').alias('Count'),
        F.sum('whether_bigorder').alias('count_of_bigorder'),
    )
    .sort('merchant_abn')
)
# Display the aggregate. The former `drop(F.col('order_id'))` was removed: its
# result was never assigned and the aggregate has no `order_id` column, so it
# only ever displayed this same frame.
grouped_transaction

                                                                                

merchant_abn,Amount,Count,count_of_bigorder
10023283211,379547.6390727268,2121,0
10142254217,48882.66543336331,1875,0
10165489824,35623.68038836445,3,2
10187291046,20254.487124206804,206,0
10192359162,82759.73332280711,239,0
10206519221,154924.2353959908,6090,0
10255988167,164669.6631307207,531,0
10264435225,296841.2830886513,3219,0
10279061213,85417.56321640752,351,0
10323485998,648209.1174973631,6638,0


In [5]:

# Derive per-merchant monthly averages from the aggregated transactions.
# `number_of_months` is the divisor used for every "monthly" feature.
number_of_months = 18
# Build the features in a NEW DataFrame rather than overwriting
# `grouped_transaction`: the original cell dropped `Amount`/`Count` from the
# very frame it reads them from, so running the cell a second time failed.
# `F.round` is Spark's round, spelled explicitly so it cannot be confused with
# Python's builtin `round`.
merchant_monthly_stats = (
    grouped_transaction
    .withColumn('Avg_amount_monthly', F.round(F.col('Amount') / number_of_months, 2))
    .withColumn('Avg_count_monthly', F.round(F.col('Count') / number_of_months, 2))
    .withColumn('Order_avg_value', F.round(F.col('Amount') / F.col('Count'), 2))
    .drop('Amount', 'Count')
)
# Add monthly average features into merchants data (inner join on
# merchant_abn; the duplicate key coming from the stats side is dropped).
merchant_data1 = (
    merchants
    .join(merchant_monthly_stats, merchants.merchant_abn == merchant_monthly_stats.merchant_abn)
    .drop(merchant_monthly_stats.merchant_abn)
)
# Kept from the original cell: this only has an effect if `merchants` itself
# carries columns named `Amount` or `Count`.
merchant_data1 = merchant_data1.drop('Amount', 'Count')

In [6]:
# Per (merchant, customer) pair: number of transactions plus the mean of the
# `average_prob_con` and `whether_fraud` columns.
fraud_per_customer = transaction.groupby('merchant_abn', 'user_id').agg(
    F.count('user_id').alias('count'),
    F.avg('average_prob_con').alias('avg_prob_fraud_cus'),
    F.avg('whether_fraud').alias('whether_fraud'),
)
# Roll the per-customer figures up to one row per merchant.
fraud_per_merchant = fraud_per_customer.groupby('merchant_abn').agg(
    F.count('user_id').alias('cnt'),
    F.avg('avg_prob_fraud_cus').alias('avg_prob_fraud_cus'),
    F.sum('whether_fraud').alias('num_of_fraud'),
)
# prob_of_fraud     = num_of_fraud / cnt
# count_cus_per_mon = cnt / number_of_months, rounded to 2 decimals
# The helper columns `cnt` and `num_of_fraud` are removed afterwards.
cus_per_mon = (
    fraud_per_merchant
    .withColumn('prob_of_fraud', F.col('num_of_fraud') / F.col('cnt'))
    .withColumn('count_cus_per_mon', F.round(F.col('cnt') / number_of_months, 2))
    .drop('cnt')
    .drop('num_of_fraud')
)

# Number of transactions per (merchant, customer) pair.
orders_per_customer = transaction.groupby('merchant_abn', 'user_id').count()
# A customer is flagged as regular for a merchant when the pair has at least
# 5 transactions.
is_regular_customer = F.when(F.col("count") >= 5, 1).otherwise(0)
# Number of regular customers per merchant.
ori_con_fix = (
    orders_per_customer
    .withColumn("fixed_cus_num", is_regular_customer)
    .groupby('merchant_abn')
    .agg(F.sum('fixed_cus_num').alias('fix_cus_num'))
)

In [7]:

# Combine the per-merchant customer features into one table.
# Both inputs are aggregations of `transaction`, so the join is expressed on
# the key NAME instead of `df1.merchant_abn == df2.merchant_abn`; this keeps a
# single `merchant_abn` column (first, as before) without relying on Spark to
# disambiguate two columns with the same lineage.
# The former `drop('total_cus_num')` was removed: no such column is ever
# created, so it was a no-op.
user_info = cus_per_mon.join(ori_con_fix, on='merchant_abn')

# Pair every transaction's merchant with the postcode of the consumer who
# made it (inner join on user_id).
merchant_abn_and_consumer_id = transaction.select('merchant_abn', 'user_id')
user_id_and_postcode = consumer.select('postcode', 'user_id')
merchant_and_consumer_postcode = (
    merchant_abn_and_consumer_id
    .join(user_id_and_postcode, ['user_id'])
    .select('merchant_abn', 'postcode')
)

In [8]:
# Estimate each merchant's postcode as the consumer postcode that appears most
# often in its transactions (mode of grouped data).
# https://stackoverflow.com/questions/36654162/mode-of-grouped-data-in-pyspark
postcode_counts = (
    merchant_and_consumer_postcode
    .groupBy(['merchant_abn', 'postcode'])
    .count()
    .alias('counts')
)
# max over a (count, postcode) struct picks the pair with the highest count;
# the struct's field order (count first, then postcode) decides ties.
top_postcode = F.max(F.struct(F.col('count'), F.col('postcode'))).alias('max')
merchant_postcode = (
    postcode_counts
    .groupBy('merchant_abn')
    .agg(top_postcode)
    .select(F.col('merchant_abn'), F.col('max.postcode'))
)

# Attach the estimated postcode to the merchant features (inner join; the
# left-hand key is dropped, so the remaining key is the postcode table's).
merchant_info = (
    merchant_data1
    .join(merchant_postcode, merchant_data1.merchant_abn == merchant_postcode.merchant_abn)
    .drop(merchant_data1.merchant_abn)
)

# Census figures per postcode: join the postcode <-> SA2 correspondence to the
# census table, then average over the rows sharing a postcode.
post_census = (
    post_sa2_2021
    .join(census, post_sa2_2021.SA2_CODE_2021 == census.SA2_CODE_2021)
    .drop(census.SA2_CODE_2021)
)
post_census_1 = post_census.groupBy('postcode').agg(
    F.avg('income_percentage').alias("avg_income_percentage"),
    F.avg('age_percentage').alias('avg_age_percentage'),
)
# Same columns, with the postcode cast to int.
post_census_2 = post_census_1.select(
    post_census_1.postcode.cast('int'),
    post_census_1.avg_income_percentage,
    post_census_1.avg_age_percentage,
)

In [9]:
# Assemble the final merchant table: merchant features + customer features
# + census averages + predicted dollar value.
semifinal = (
    merchant_info
    .join(user_info, merchant_info.merchant_abn == user_info.merchant_abn)
    .drop(merchant_info.merchant_abn)
)
final = (
    semifinal
    # Left join: merchants whose postcode has no census row are kept ...
    .join(post_census_2, semifinal.postcode == post_census_2.postcode, 'left')
    .drop(post_census_2.postcode)
    # ... and their missing census averages are filled with 0.
    .na.fill(value=0, subset=["avg_income_percentage", "avg_age_percentage"])
    # Change take rate into predicted revenue for the BNPL companies:
    # the `Take_rate` column is overwritten with Take_rate * Avg_amount_monthly.
    .withColumn('Take_rate', F.col('Take_rate') * F.col('Avg_amount_monthly'))
)
# Inner join with the predictions; merchants without a prediction are dropped.
final = final.join(prediction, final.merchant_abn == prediction.merchant_abn).drop(prediction.merchant_abn)
# Overwrite any previous export: with the default save mode the write raises
# once the path exists, which breaks a Restart & Run All of this notebook.
final.write.mode('overwrite').parquet('../data/curated/merchant_info.parquet')

[Stage 62:>   (0 + 8) / 9][Stage 63:>   (0 + 0) / 9][Stage 64:>   (0 + 0) / 9]9]

22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:51 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 62:===>(8 + 1) / 9][Stage 63:>   (0 + 7) / 9][Stage 64:>   (0 + 0) / 9]

22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on

[Stage 63:>   (0 + 8) / 9][Stage 64:>   (0 + 0) / 9][Stage 65:>   (0 + 0) / 1]

22/10/15 15:19:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 64:>   (0 + 8) / 9][Stage 65:>   (0 + 0) / 1][Stage 66:> (0 + 0) / 180]

22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:01 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                

22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:05 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 68:>   (0 + 8) / 9][Stage 70:>   (0 + 0) / 8][Stage 72:>   (0 + 0) / 1]

22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 70:>   (0 + 8) / 8][Stage 72:>   (0 + 0) / 1][Stage 74:>   (0 + 0) / 9]

22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/15 15:20:08 WARN RowBasedKeyValueBatch: Calling spill() on

                                                                                