In [1]:
from pyspark.sql import functions as F, SparkSession
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType, DoubleType
from urllib.request import urlretrieve

import os
os.sys.path.append("../")
from scripts.preliminary_analysis import *


In [2]:
# Session settings, applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
}

session_builder = SparkSession.builder.appName("Merchant Fraud Model")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# Single Spark entry point used by every cell below.
spark = session_builder.getOrCreate()

24/09/16 20:22:44 WARN Utils: Your hostname, Alistairs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.20.10.4 instead (on interface en0)
24/09/16 20:22:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/16 20:22:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Location of the "Snapshot of Australia data summary" workbook (.xlsx, 2021)
# on the abs.gov.au site; it is downloaded in a later cell.
url = "https://www.abs.gov.au/statistics/people/people-and-communities/snapshot-australia/2021/Snapshot%20of%20Australia%20data%20summary.xlsx"


# Alternative download via urllib, currently disabled:
# urlretrieve(url, "test.xlsx")

In [4]:
import requests

In [5]:
# Download the workbook referenced by `url`.
# The payload is a binary .xlsx file, so the raw bytes are written in binary
# mode; decoding it through `.text` and writing in text mode corrupts the file.
# The request is made before the file is opened so that a failed download does
# not leave an empty or partial file behind.
# NOTE(review): the target keeps its original name 'data.csv' although the
# content is an Excel workbook, not CSV — consider renaming once any readers
# of this file have been checked.
response = requests.get(url, timeout=60)
response.raise_for_status()  # stop here rather than saving an HTTP error page
with open('data.csv', 'wb') as f_out:
    f_out.write(response.content)

In [6]:
# ad = spark.read.parquet()

## Read in the necessary curated data

In [7]:
path = "../data/curated"

# Load the three curated inputs in one pass: merchant details, merchant fraud
# probabilities, and the transaction records.
merchant_info, merchant_fp, transactions = (
    spark.read.parquet(f"{path}/{dataset_name}.parquet")
    for dataset_name in ("merchant_info", "merchant_fraud_prob", "transactions")
)

                                                                                

## Join the necessary datasets

In [8]:
def _report_join(joined):
    """Print the row count of `joined`, preview five rows, then report missing values.

    Returns whatever `calculate_missing_values` returns so that the final call
    in this cell still yields the same cell value as before.
    """
    get_dataset_count(joined)
    joined.show(5)
    return calculate_missing_values(joined)


# Baseline size before any join
print("Original dataset:")
get_dataset_count(transactions)
print("\n")

# Left join: every transaction row is kept; fraud_probability is populated only
# where merchant_fp has a row for the same merchant and date.
transaction_records_with_fraud = transactions.join(
    merchant_fp,
    how="left",
    on=["merchant_abn", "order_datetime"],
)

print("After merchant_fp join:")
_report_join(transaction_records_with_fraud)

# Inner join: transactions whose merchant_abn has no merchant_info row are dropped.
transaction_records_final = transaction_records_with_fraud.join(
    merchant_info,
    how="inner",
    on="merchant_abn",
)

print("After merchant_info join:")
_report_join(transaction_records_final)

Original dataset:
The dataset count is  12561377


After merchant_fp join:


                                                                                

The dataset count is  12561377
+------------+--------------+------------------+--------------------+-----------+-----------------+
|merchant_abn|order_datetime|      dollar_value|            order_id|consumer_id|fraud_probability|
+------------+--------------+------------------+--------------------+-----------+-----------------+
| 79417999332|    2021-11-26|136.06570809815838|23acbb7b-cf98-458...|    1059280|             NULL|
| 46451548968|    2021-11-26| 72.61581642788431|76bab304-fa2d-400...|    1195503|             NULL|
| 89518629617|    2021-11-26|3.0783487174439297|a2ae446a-2959-41c...|     986886|             NULL|
| 49167531725|    2021-11-26| 51.58228625503599|7080c274-17f7-4cc...|    1195503|             NULL|
| 31101120643|    2021-11-26|25.228114942417797|8e301c0f-06ab-45c...|     986886|             NULL|
+------------+--------------+------------------+--------------------+-----------+-----------------+
only showing top 5 rows



                                                                                

+--------------------------+----------------------------+--------------------------+----------------------+-------------------------+-------------------------------+
|merchant_abn_missing_count|order_datetime_missing_count|dollar_value_missing_count|order_id_missing_count|consumer_id_missing_count|fraud_probability_missing_count|
+--------------------------+----------------------------+--------------------------+----------------------+-------------------------+-------------------------------+
|                         0|                           0|                         0|                     0|                        0|                       12557318|
+--------------------------+----------------------------+--------------------------+----------------------+-------------------------+-------------------------------+

After merchant_info join:


                                                                                

The dataset count is  12047317
+------------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+
|merchant_abn|order_datetime|      dollar_value|            order_id|consumer_id|fraud_probability|                name|            category|revenue_level|take_rate|
+------------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+
| 79417999332|    2021-11-26|136.06570809815838|23acbb7b-cf98-458...|    1059280|             NULL|Phasellus At Company|gift, card, novel...|            b|     4.95|
| 46451548968|    2021-11-26| 72.61581642788431|76bab304-fa2d-400...|    1195503|             NULL|Tempus Eu Ligula ...|health and beauty...|            a|     6.04|
| 89518629617|    2021-11-26|3.0783487174439297|a2ae446a-2959-41c...|     986886|             NULL|Vulputate Velit E...|tent  and awning ..



+--------------------------+----------------------------+--------------------------+----------------------+-------------------------+-------------------------------+------------------+----------------------+---------------------------+-----------------------+
|merchant_abn_missing_count|order_datetime_missing_count|dollar_value_missing_count|order_id_missing_count|consumer_id_missing_count|fraud_probability_missing_count|name_missing_count|category_missing_count|revenue_level_missing_count|take_rate_missing_count|
+--------------------------+----------------------------+--------------------------+----------------------+-------------------------+-------------------------------+------------------+----------------------+---------------------------+-----------------------+
|                         0|                           0|                         0|                     0|                        0|                       12043314|                 0|                     0|             

                                                                                

## Feature engineering

In [9]:
# Score each transaction by how far its dollar value sits from what is usual
# for its merchant, so that unusual amounts can be flagged.

# Per-merchant mean and standard deviation of dollar_value
transaction_stats = (
    transaction_records_final
    .groupBy("merchant_abn")
    .agg(
        F.avg("dollar_value").alias("avg_dollar_value"),
        F.stddev("dollar_value").alias("stddev_dollar_value"),
    )
)

# Attach the merchant-level statistics to every transaction row
transaction_records_final = transaction_records_final.join(
    transaction_stats,
    how="left",
    on="merchant_abn",
)

# Signed distance from the merchant mean, in units of that merchant's standard
# deviation: negative means below the average, positive means above it.
# Falls back to 0 when the standard deviation is NULL or zero, which avoids a
# division by zero.
dollar_std = F.col("stddev_dollar_value")
dollar_std_usable = dollar_std.isNotNull() & (dollar_std != 0)
dollar_deviation = (F.col("dollar_value") - F.col("avg_dollar_value")) / dollar_std

transaction_records_final = transaction_records_final.withColumn(
    "std_diff_dollar_value",
    F.when(dollar_std_usable, dollar_deviation).otherwise(0),
)

transaction_records_final.show(5)



+------------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+
|merchant_abn|order_datetime|      dollar_value|            order_id|consumer_id|fraud_probability|                name|            category|revenue_level|take_rate|  avg_dollar_value|stddev_dollar_value|std_diff_dollar_value|
+------------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+
| 31101120643|    2021-11-26|25.228114942417797|8e301c0f-06ab-45c...|     986886|             NULL|Commodo Hendrerit...|cable, satellite,...|            a|     6.37| 78.38360490200445|  55.36202620829845|  -0.9601435063014176|
| 79417999332|    2021-11-26|136.06570809815838|23acbb7b-cf98-458...|    1059280|           

                                                                                

The purpose of this feature is to take into account how much a transaction's dollar value deviates from what is usual for that merchant, in order to flag unusual transaction amounts.

"std_diff_dollar_value" -> Measures how far a transaction's dollar value is from the merchant's average, scaled by that merchant's typical variability (the standard deviation of its dollar values).
This is important because, for example, a $100 difference may be normal for one merchant but very unusual for another.

Consideration: this feature can be positive or negative (above or below the merchant's average), so its sign must be taken into account when interpreting it.

In [10]:
# Score each month's order volume by how far it sits from the merchant's usual
# monthly volume, so that unusual months can be flagged.

# Month bucket (yyyy-MM) derived from order_datetime
transaction_records_final = transaction_records_final.withColumn(
    "order_month",
    F.date_format(F.col("order_datetime"), "yyyy-MM"),
)

# Number of orders per merchant per month
transaction_records_monthly = (
    transaction_records_final
    .groupBy("merchant_abn", "order_month")
    .agg(F.count("order_id").alias("monthly_order_volume"))
)

# Average and standard deviation of the monthly volumes, per merchant
transaction_stats = (
    transaction_records_monthly
    .groupBy("merchant_abn")
    .agg(
        F.avg("monthly_order_volume").alias("avg_monthly_order_volume"),
        F.stddev("monthly_order_volume").alias("stddev_monthly_order_volume"),
    )
)

# Attach first the monthly volume, then the merchant-level statistics, to
# every transaction row
transaction_records_final = (
    transaction_records_final
    .join(transaction_records_monthly, how="left", on=["merchant_abn", "order_month"])
    .join(transaction_stats, how="left", on="merchant_abn")
)

# Signed number of standard deviations between this month's volume and the
# merchant's average month; 0 when the standard deviation is NULL or zero.
volume_std = F.col("stddev_monthly_order_volume")
volume_std_usable = volume_std.isNotNull() & (volume_std != 0)
volume_deviation = (
    F.col("monthly_order_volume") - F.col("avg_monthly_order_volume")
) / volume_std

transaction_records_final = transaction_records_final.withColumn(
    "std_diff_order_volume",
    F.when(volume_std_usable, volume_deviation).otherwise(0),
)

transaction_records_final.show(5)



+------------+-----------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+--------------------+------------------------+---------------------------+---------------------+
|merchant_abn|order_month|order_datetime|      dollar_value|            order_id|consumer_id|fraud_probability|                name|            category|revenue_level|take_rate|  avg_dollar_value|stddev_dollar_value|std_diff_dollar_value|monthly_order_volume|avg_monthly_order_volume|stddev_monthly_order_volume|std_diff_order_volume|
+------------+-----------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+--------------------+------------------------+---------------------------+------------------

                                                                                

In [11]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# Stage 1: turn each revenue_level string into a numeric index
indexer = StringIndexer(inputCol="revenue_level", outputCol="revenue_level_index")

# Stage 2: one-hot encode that index into a vector column
encoder = OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="revenue_level_vec")

# Fit both stages together, then apply the fitted pipeline to the same data
pipeline = Pipeline(stages=[indexer, encoder])
revenue_level_model = pipeline.fit(transaction_records_final)
transaction_records_final = revenue_level_model.transform(transaction_records_final)

transaction_records_final.show(5)



+------------+-----------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+--------------------+------------------------+---------------------------+---------------------+-------------------+-----------------+
|merchant_abn|order_month|order_datetime|      dollar_value|            order_id|consumer_id|fraud_probability|                name|            category|revenue_level|take_rate|  avg_dollar_value|stddev_dollar_value|std_diff_dollar_value|monthly_order_volume|avg_monthly_order_volume|stddev_monthly_order_volume|std_diff_order_volume|revenue_level_index|revenue_level_vec|
+------------+-----------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+----------------

                                                                                

In [13]:
# Preview the first five rows again (same call as at the end of the encoding cell).
transaction_records_final.show(5)



+------------+-----------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+--------------------+------------------------+---------------------------+---------------------+-------------------+-----------------+
|merchant_abn|order_month|order_datetime|      dollar_value|            order_id|consumer_id|fraud_probability|                name|            category|revenue_level|take_rate|  avg_dollar_value|stddev_dollar_value|std_diff_dollar_value|monthly_order_volume|avg_monthly_order_volume|stddev_monthly_order_volume|std_diff_order_volume|revenue_level_index|revenue_level_vec|
+------------+-----------+--------------+------------------+--------------------+-----------+-----------------+--------------------+--------------------+-------------+---------+------------------+-------------------+---------------------+----------------

                                                                                

# Standardisation

In [15]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Columns to standardise
numeric_features = ["dollar_value", "monthly_order_volume"]

# Pack the numeric columns into a single vector column, as the scaler expects
assembler = VectorAssembler(inputCols=numeric_features, outputCol="numeric_features_vec")
data_with_numeric_vec = assembler.transform(transaction_records_final)

# Centre and scale each slot of that vector.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split made in the modelling section, so the scaling statistics also see rows
# that later end up in the test set — confirm this is acceptable.
scaler = StandardScaler(
    inputCol=assembler.getOutputCol(),
    outputCol="scaled_numeric_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(data_with_numeric_vec)
transaction_records_scaled = scaler_model.transform(data_with_numeric_vec)

                                                                                

# Modelling

In [None]:
from pyspark.ml.feature import VectorAssembler

# List of features to be used in the model
# NOTE(review): "monthly_order_volume" is also one of the two columns packed
# into "scaled_numeric_features", so it enters the model twice (scaled and
# raw), while the engineered "std_diff_dollar_value" column is not used —
# confirm that this selection is intended.
features = [
    "scaled_numeric_features",
    "monthly_order_volume",
    "std_diff_order_volume",
    "revenue_level_vec"
]

# Select only the necessary columns: the features and the target column
relevant_columns = features + ["fraud_probability"]

# Keep only the relevant columns and the rows that have a label
# (fraud_probability is NULL for most transactions after the left join)
transaction_records_filtered = (
    transaction_records_scaled
    .select(relevant_columns)
    .filter(F.col("fraud_probability").isNotNull())
)

# VectorAssembler to combine the features into a single vector
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Prepare the data
data = assembler.transform(transaction_records_filtered)

# Fixed seed so that the 80/20 split, and every metric computed from it, can
# be reproduced when the notebook is re-run.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Preview
train_data.select("features").show(5)

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, LinearRegression

# All three regressors predict the same label from the same assembled vector
shared_columns = dict(labelCol="fraud_probability", featuresCol="features")

dt = DecisionTreeRegressor(**shared_columns)
rf = RandomForestRegressor(**shared_columns)
lr = LinearRegression(**shared_columns)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


def _build_grid(*axes):
    """Build a parameter grid from (param, candidate_values) pairs, added in the order given."""
    grid_builder = ParamGridBuilder()
    for param, candidate_values in axes:
        grid_builder = grid_builder.addGrid(param, candidate_values)
    return grid_builder.build()


# Parameter grids searched for each regressor
dt_param_grid = _build_grid(
    (dt.maxDepth, [3, 5, 7]),
    (dt.maxBins, [32, 64]),
)

rf_param_grid = _build_grid(
    (rf.numTrees, [10, 20]),
    (rf.maxDepth, [5, 7]),
)

lr_param_grid = _build_grid(
    (lr.regParam, [0.01, 0.1, 1.0]),
    (lr.elasticNetParam, [0.0, 0.5, 1.0]),
)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluators for regression models
rmse_evaluator = RegressionEvaluator(
    labelCol="fraud_probability",
    predictionCol="prediction",
    metricName="rmse"
)

r2_evaluator = RegressionEvaluator(
    labelCol="fraud_probability",
    predictionCol="prediction",
    metricName="r2"
)

# Fixed seed so that the fold assignment, and therefore the hyperparameters
# selected by cross-validation, are repeatable between runs.
cv_seed = 42

# 3-fold cross-validation for each regressor, selecting on R2
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=cv_seed
)

rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_param_grid,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=cv_seed
)

lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=cv_seed
)

In [None]:
# One single-stage pipeline per cross-validator
dt_pipeline, rf_pipeline, lr_pipeline = (
    Pipeline(stages=[cv_stage]) for cv_stage in (dt_cv, rf_cv, lr_cv)
)

In [None]:
# Fit each pipeline on the training split. Every pipeline wraps a
# CrossValidator, so each fit runs the 3-fold search over that model's
# parameter grid. The fits run one after another; runtimes have not been
# recorded yet (the "_ mins" placeholders below are still to be filled in).

# Decision tree (6 parameter combinations) — _ mins
dt_model = dt_pipeline.fit(train_data)

# Random forest (4 parameter combinations) — _ mins
rf_model = rf_pipeline.fit(train_data)

# Linear regression (9 parameter combinations) — _ mins
lr_model = lr_pipeline.fit(train_data)

## Evaluation/Error Analysis

In [None]:
def _rmse_and_r2(predictions):
    """Return (RMSE, R2) for a predictions DataFrame, evaluated in that order."""
    rmse_value = rmse_evaluator.evaluate(predictions)
    r2_value = r2_evaluator.evaluate(predictions)
    return rmse_value, r2_value


# Score the held-out test split with each fitted model
dt_predictions = dt_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
lr_predictions = lr_model.transform(test_data)

dt_rmse, dt_r2 = _rmse_and_r2(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree R2: {dt_r2}")

rf_rmse, rf_r2 = _rmse_and_r2(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")

lr_rmse, lr_r2 = _rmse_and_r2(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Linear Regression R2: {lr_r2}")

In [None]:
# Report the hyperparameters selected by cross-validation. Each pipeline has a
# single stage (the fitted cross-validator), whose bestModel is the winning
# fitted regressor.
best_dt_model = dt_model.stages[-1].bestModel
# Use the public getters on the fitted model rather than the private
# `_java_obj` handle, matching how the random forest is queried below.
print(f"Best Decision Tree maxDepth: {best_dt_model.getMaxDepth()}")
print(f"Best Decision Tree maxBins: {best_dt_model.getMaxBins()}")


best_rf_model = rf_model.stages[-1].bestModel
# NOTE(review): getNumTrees is deliberately left without call parentheses —
# PySpark exposes it as a property on fitted tree-ensemble models; confirm
# against the installed version before changing it.
print(f"Best Random Forest numTrees: {best_rf_model.getNumTrees}")
print(f"Best Random Forest maxDepth: {best_rf_model.getMaxDepth()}")

## Feature importances

In [None]:
import pandas as pd


def _vector_slot_names(df, vector_col, n_slots):
    """Return one name per slot of the assembled vector column `vector_col`.

    The assembler's input columns cannot be used directly as labels: vector
    inputs ("scaled_numeric_features", "revenue_level_vec") each occupy several
    slots, so there are more importances than input columns. The per-slot
    names are read from the ML attribute metadata stored on the column; if
    that metadata is missing or does not cover exactly `n_slots` slots,
    positional names "<vector_col>_<i>" are returned instead.
    """
    attr_groups = df.schema[vector_col].metadata.get("ml_attr", {}).get("attrs", {})
    slots = sorted(
        (attr["idx"], attr.get("name", f"{vector_col}_{attr['idx']}"))
        for group in attr_groups.values()
        for attr in group
    )
    names = [slot_name for _, slot_name in slots]
    if len(names) != n_slots:
        names = [f"{vector_col}_{i}" for i in range(n_slots)]
    return names


best_rf_model = rf_model.stages[0].bestModel
best_dt_model = dt_model.stages[0].bestModel

dt_feature_importances = best_dt_model.featureImportances
rf_feature_importances = best_rf_model.featureImportances

# Dense arrays with one importance per vector slot
rf_importance_values = rf_feature_importances.toArray()
dt_importance_values = dt_feature_importances.toArray()

# One label per vector slot, so the labels line up with the importances
feature_names = _vector_slot_names(data, "features", len(rf_importance_values))

rf_importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": rf_importance_values
}).sort_values(by="Importance", ascending=False)

dt_importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": dt_importance_values
}).sort_values(by="Importance", ascending=False)

print(rf_importances_df)
print()
print(dt_importances_df)

In [None]:
# Get coefficients of the best linear regression found by cross-validation
best_lr_model = lr_model.stages[0].bestModel
coefficients = best_lr_model.coefficients
coefficient_values = coefficients.toArray()  # plain array, one value per vector slot

# Label every slot of the assembled "features" vector. There is one
# coefficient per slot, not per assembler input column (vector inputs occupy
# several slots), so the names are read from the ML attribute metadata stored
# on the column.
lr_attr_groups = data.schema["features"].metadata.get("ml_attr", {}).get("attrs", {})
lr_slots = sorted(
    (attr["idx"], attr.get("name", f"features_{attr['idx']}"))
    for group in lr_attr_groups.values()
    for attr in group
)
lr_feature_names = [slot_name for _, slot_name in lr_slots]
if len(lr_feature_names) != len(coefficient_values):
    # Metadata missing or incomplete: fall back to positional names
    lr_feature_names = [f"features_{i}" for i in range(len(coefficient_values))]

lr_importances_df = pd.DataFrame({
    "Feature": lr_feature_names,
    "Coefficient": coefficient_values
}).sort_values(by="Coefficient", ascending=False)

print(lr_importances_df)