In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [3]:
# Create the Spark session that runs every job in this notebook.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Render DataFrames as tables when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the session time zone so results do not depend on the host machine.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # The property name is "spark.executor.memory"; a misspelled key is not
    # rejected by the builder, so the memory setting would never be applied.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

22/09/16 10:56:54 WARN Utils: Your hostname, Xiaotongs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.8.134 instead (on interface en0)
22/09/16 10:56:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/09/16 10:56:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [34]:
# Stack the two consecutive transaction snapshots into one DataFrame.
# union() matches columns by position, so both snapshots must share a layout.
transactions = (
    spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot')
    .union(spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot'))
)

In [35]:
# Load the merchant and consumer fraud-probability tables.
# The first row of each CSV is used as the column names.
probs_merchant = (
    spark.read
    .option('header', True)
    .csv('../data/tables/merchant_fraud_probability.csv')
)
probs_consumer = (
    spark.read
    .option('header', True)
    .csv('../data/tables/consumer_fraud_probability.csv')
)

In [36]:
# Print the column names and types of the combined transaction snapshots.
transactions.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [37]:
# Print the consumer fraud table's schema as loaded from CSV, before any casting.
probs_consumer.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [38]:
# Cast the CSV string columns to typed columns: user_id -> long,
# order_datetime -> date, fraud_probability -> float.
probs_consumer = (
    probs_consumer
    .withColumn('user_id', F.col('user_id').cast('long'))
    .withColumn('order_datetime', F.col('order_datetime').cast('date'))
    .withColumn('fraud_probability', F.col('fraud_probability').cast('float'))
)

In [39]:
# Print the schema again to check the column types after casting.
probs_consumer.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: float (nullable = true)



In [40]:
# Print the merchant fraud table's schema as loaded from CSV, before any casting.
probs_merchant.printSchema()

root
 |-- merchant_abn: string (nullable = true)
 |-- order_datetime: string (nullable = true)
 |-- fraud_probability: string (nullable = true)



In [41]:
# Cast the CSV string columns to typed columns: merchant_abn -> long,
# order_datetime -> date, fraud_probability -> float.
probs_merchant = (
    probs_merchant
    .withColumn('merchant_abn', F.col('merchant_abn').cast('long'))
    .withColumn('order_datetime', F.col('order_datetime').cast('date'))
    .withColumn('fraud_probability', F.col('fraud_probability').cast('float'))
)

In [42]:
# Print the schema again to check the column types after casting.
probs_merchant.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: float (nullable = true)



In [43]:
# Display probs_merchant; it renders as a table because eager evaluation
# is enabled on the Spark session.
probs_merchant

merchant_abn,order_datetime,fraud_probability
19492220327,2021-11-28,44.40366
31334588839,2021-10-02,42.755302
19492220327,2021-12-22,38.86779
82999039227,2021-12-19,94.1347
90918180829,2021-09-02,43.325516
31334588839,2021-12-26,38.36166
23686790459,2021-12-10,79.454346
14827550074,2021-11-26,46.457756
31334588839,2021-11-26,36.209713
19492220327,2021-12-18,33.81967


In [44]:
# Attach the fraud probabilities to every transaction. Both joins are left
# joins, so transactions without a matching fraud record are kept and get null.
result = (
    transactions
    # Merchant probability, matched on merchant and order date.
    .join(probs_merchant, on=['merchant_abn', 'order_datetime'], how='left')
    .withColumnRenamed('fraud_probability', 'merchant_prob')
    # Consumer probability, matched on user and order date.
    .join(probs_consumer, on=['user_id', 'order_datetime'], how='left')
    .withColumnRenamed('fraud_probability', 'consumer_prob')
)

In [46]:
# Transactions with no matching fraud record get null probabilities from the
# left joins; treat those as probability 0. The fill is restricted to the two
# probability columns so that nulls in the other numeric columns (user_id,
# merchant_abn, dollar_value) are not silently replaced by 0.
result = result.na.fill(0, subset=['merchant_prob', 'consumer_prob'])

In [50]:
# Flag a transaction as fraud when either the merchant or the consumer
# probability is above 50 (the probabilities are on a 0-100 scale).
result = result.withColumn(
    'is_fraud',
    (F.col('merchant_prob') > 50) | (F.col('consumer_prob') > 50),
)

In [54]:
# Report how many transactions there are and how many the is_fraud rule flagged.
# Each count() triggers its own Spark job over the joined data.
n_transactions = result.count()
n_flagged = result.where(F.col('is_fraud')).count()
print('In {} transactions, {} are detected as fraud'.format(n_transactions, n_flagged))


                                                                                

In 8151584 transactions, 1572 are detected as fraud


                                                                                