## Fraud

In [None]:
import pandas as pd
import numpy as np
import os
import re

# # Set working directory
# if not "/data/tables" in os.getcwd():
#     os.chdir("../data/tables")

from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

# Spark session used by every cell below, configured from a single settings table.
# NOTE(review): `from pyspark.shell import spark` in the import cell may already
# have started a session; if so, getOrCreate() returns that existing session and
# "spark.driver.memory" can no longer take effect -- TODO confirm.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

### Remove Fraud data from the transaction set
Since the fraud data covers only a small subset of the transactions, we remove every transaction that matches a (merchant, date) or (user, date) pair with a recorded fraud probability.


In [119]:
# Convert the merchant and consumer fraud-probability CSVs into Spark DataFrames.
# Each CSV is loaded with pandas, written out as parquet under ../data/temp, and
# then read back through Spark.

# Make sure the scratch directory exists before writing: the parquet writes below
# fail on a fresh checkout when ../data/temp is missing.
os.makedirs("../data/temp", exist_ok=True)

merchant_fraud_df = pd.read_csv("../data/tables/merchant_fraud_probability.csv")
merchant_fraud_df.to_parquet("../data/temp/merchant_fraud")
merchant_fraud_sdf = spark.read.parquet("../data/temp/merchant_fraud")

consumer_fraud_df = pd.read_csv("../data/tables/consumer_fraud_probability.csv")
consumer_fraud_df.to_parquet("../data/temp/consumer_fraud")
consumer_fraud_sdf = spark.read.parquet("../data/temp/consumer_fraud")

In [120]:
# Load the three transaction snapshots and stack them into one Spark DataFrame
# named `transaction`.
snapshot_paths = (
    "../data/tables/transactions_20210228_20210827_snapshot",
    "../data/tables/transactions_20210828_20220227_snapshot",
    "../data/tables/transactions_20220228_20220828_snapshot",
)
transaction_sdf1, transaction_sdf2, transaction_sdf3 = [
    spark.read.parquet(snapshot_path) for snapshot_path in snapshot_paths
]

# union() appends rows by column position, in snapshot order.
transaction = transaction_sdf1.union(transaction_sdf2)
transaction = transaction.union(transaction_sdf3)

In [121]:
# Collect every transaction whose (merchant_abn, order_datetime) pair appears in
# the merchant fraud table.
merchant_fraud_sdf = spark.read.parquet("../data/temp/merchant_fraud")
merchant_fraud_days = merchant_fraud_sdf.select("merchant_abn", "order_datetime")
merchant_transaction_on_fraud_day = transaction.join(
    merchant_fraud_days,
    on=["merchant_abn", "order_datetime"],
    how="inner",
)

In [124]:
# Print the column names and types of the combined transaction DataFrame.
transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [125]:
# Print the schema of the merchant fraud-day join result; in the output below the
# join keys (merchant_abn, order_datetime) are listed first.
merchant_transaction_on_fraud_day.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- user_id: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)



In [122]:
# Collect every transaction whose (user_id, order_datetime) pair appears in the
# consumer fraud table.
consumer_fraud_sdf = spark.read.parquet("../data/temp/consumer_fraud")
consumer_fraud_days = consumer_fraud_sdf.select("user_id", "order_datetime")
consumer_transaction_on_fraud_day = transaction.join(
    consumer_fraud_days,
    on=["user_id", "order_datetime"],
    how="inner",
)

In [129]:
# Remove every transaction that falls on a flagged fraud day.
# A transaction is dropped when its (merchant_abn, order_datetime) pair appears in
# the merchant fraud table, or its (user_id, order_datetime) pair appears in the
# consumer fraud table.
#
# Anti-joins are used instead of DataFrame.subtract: subtract is EXCEPT DISTINCT,
# so it would also collapse identical transaction rows and has to compare whole
# rows rather than only the key columns.
order = ["user_id","merchant_abn","dollar_value","order_id","order_datetime"]
merchant_fraud_keys = merchant_fraud_sdf.select("merchant_abn", "order_datetime")
consumer_fraud_keys = consumer_fraud_sdf.select("user_id", "order_datetime")

transaction_fraud_rm = (
    transaction
    .join(merchant_fraud_keys, on=["merchant_abn", "order_datetime"], how="left_anti")
    .join(consumer_fraud_keys, on=["user_id", "order_datetime"], how="left_anti")
    # Joining on a list of column names moves the keys to the front of the
    # schema; restore the original transaction column order.
    .select(order)
)

`transaction_fraud_rm` now holds the transactions with the fraud-day entries removed and can be used for further analysis.

### Derive fraud rate and other features

#### Definition:
