In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import requests
import os

In [None]:
# Build (or reuse) the Spark session used throughout this notebook.
# Settings are applied to the builder in the order listed.
spark_settings = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.driver.memory", "9g"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.network.timeout", "600s"),
]

session_builder = SparkSession.builder.appName("Data Modelling 2")
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

## Read dataset

In [None]:
# Load the curated consumer information (parquet) into a Spark DataFrame
consumer_is_fraud = spark.read.format("parquet").load("../data/curated/all_details")

In [None]:
# Peek at the first row to sanity-check the load
consumer_is_fraud.head()


In [None]:
# Names of all columns considered for consumer-fraud modelling
all_features = [
    "order_id",
    "user_id",
    "merchant_abn",
    "order_datetime",
    "dollar_value",
    "postcode",
    "merchant_fraud",
    "consumer_fraud",
    "weekly_personal_disposable",
    "median_total_household_income",
    "median_total_family_income",
    "is_po_box",
    "avg_household_size",
    "sa2_name",
    "median_age",
    "sa2_code",
]

In [None]:

from pyspark.sql.functions import monotonically_increasing_id

In [None]:
# Total number of rows in the loaded dataset
consumer_is_fraud.count()

In [None]:
# Number of rows with a strictly positive consumer fraud value
has_positive_fraud = F.col("consumer_fraud") > 0
consumer_is_fraud.where(has_positive_fraud).count()

In [None]:
# Keep only the rows with a strictly positive consumer fraud value
positive_consumer_is_fraud = consumer_is_fraud.where(F.col("consumer_fraud") > 0)

## Feature selection

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Correlation analysis on the full dataset:
# pairwise correlation between the numerical columns, drawn as an annotated
# heatmap and saved under ../plots.

# Check numerical cols
numerical_columns = ['user_id', 'merchant_abn', 'dollar_value', 'merchant_fraud', 'consumer_fraud',
                     'weekly_personal_disposable', 'median_total_household_income',
                     'median_total_family_income', 'avg_household_size', 'median_age']

# Calculate the correlation matrix for the numerical columns.
# corr(a, b) == corr(b, a), and every DataFrame.corr call is a separate Spark
# computation, so each unordered pair is evaluated once and mirrored.
correlations = {column: {} for column in numerical_columns}
for position, col1 in enumerate(numerical_columns):
    for col2 in numerical_columns[position:]:
        corr_value = consumer_is_fraud.corr(col1, col2)
        correlations[col1][col2] = corr_value
        correlations[col2][col1] = corr_value

# Convert the nested dictionary to a Pandas DataFrame for visualization;
# .loc pins both the row and the column order to numerical_columns.
corr_df = pd.DataFrame(correlations).loc[numerical_columns, numerical_columns]

# Make sure the output folder exists so savefig does not fail on a fresh checkout
os.makedirs("../plots", exist_ok=True)

# Plot the heatmap using Seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Initial Consumer Fraud Features Correlation Heatmap')
# bbox_inches="tight" keeps the long axis labels inside the saved image
plt.savefig("../plots/Initial Consumer Fraud Features Correlation Heatmap.png", bbox_inches="tight")
plt.show()

Since the consumer fraud field is 0-inflated, the numerical features have little correlation.

Analyzing the correlation on the non-zero part only is more meaningful.

In [None]:
# Correlation analysis restricted to rows with positive consumer fraud:
# pairwise correlation between the numerical columns, drawn as an annotated
# heatmap and saved under ../plots.

# Check numerical cols
numerical_columns = ['user_id', 'merchant_abn', 'dollar_value', 'merchant_fraud', 'consumer_fraud',
                     'weekly_personal_disposable', 'median_total_household_income',
                     'median_total_family_income', 'avg_household_size', 'median_age']

# Calculate the correlation matrix for the numerical columns.
# corr(a, b) == corr(b, a), and every DataFrame.corr call is a separate Spark
# computation, so each unordered pair is evaluated once and mirrored.
correlations = {column: {} for column in numerical_columns}
for position, col1 in enumerate(numerical_columns):
    for col2 in numerical_columns[position:]:
        corr_value = positive_consumer_is_fraud.corr(col1, col2)  # non-zero consumer fraud
        correlations[col1][col2] = corr_value
        correlations[col2][col1] = corr_value

# Convert the nested dictionary to a Pandas DataFrame for visualization;
# .loc pins both the row and the column order to numerical_columns.
corr_df = pd.DataFrame(correlations).loc[numerical_columns, numerical_columns]

# Make sure the output folder exists so savefig does not fail on a fresh checkout
os.makedirs("../plots", exist_ok=True)

# Plot the heatmap using Seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Positive Consumer Fraud Features Correlation Heatmap')
# bbox_inches="tight" keeps the long axis labels inside the saved image
plt.savefig("../plots/Positive Consumer Fraud Features Correlation Heatmap.png", bbox_inches="tight")
plt.show()

'weekly_personal_disposable', 'median_total_household_income', 'median_total_family_income' and 'avg_household_size' are highly correlated, so we keep only 'median_total_household_income'.
Although merchant fraud shows a relatively higher correlation with consumer fraud, we remove it from the input features to avoid dependency between the features of the final model.

In [None]:

# Columns carried into the modelling dataset: identifiers and numerical
# inputs first, then the target column.
useful_num_features = [
    'order_id',
    'user_id',
    'dollar_value',
    'median_total_household_income',
    'median_age',
]
target_feature = ['consumer_fraud']
selected_features = [*useful_num_features, *target_feature]

We want to check the correlation between each attribute using a subsample dataset. The subsample dataset contains 30% of the 2022 data and 70% of the 2021 data as we will train model on the data in 2021 and test it on the data in 2022.

Because pyspark ml can't handle null values, we drop the rows whose external attributes contain null values.

In [None]:
# Restrict the dataset to the selected columns and preview five rows
impute_consumer_fraud_sdf = consumer_is_fraud.select(*selected_features)
impute_consumer_fraud_sdf.limit(5)

In [None]:
# Row count before removing rows with a missing household income
impute_consumer_fraud_sdf.count()

In [None]:
# Number of rows whose household income is present
income_is_present = F.col("median_total_household_income").isNotNull()
impute_consumer_fraud_sdf.where(income_is_present).count()

As expected, some consumers do not exist in the transaction dataset; we will remove those instances.

In [None]:


# Keep only the rows whose household income is present
income_is_present = F.col("median_total_household_income").isNotNull()
impute_consumer_fraud_sdf = impute_consumer_fraud_sdf.where(income_is_present)

Let's check the number of instances with non-missing fraud probability.

In [None]:
# Number of remaining rows with a strictly positive consumer fraud value
impute_consumer_fraud_sdf.where(F.col("consumer_fraud") > 0).count()

We would like to use these instances to build an imputation model for the remaining null values.

In [None]:
# Rows with a strictly positive consumer fraud value form the modelling set
model_df = impute_consumer_fraud_sdf.where(F.col("consumer_fraud") > 0)

In [None]:
# Drop every row that has a null in any column, then report what is left
final_df = model_df.na.drop(how="any")
final_df.count()

In [None]:
# Preview five rows of the cleaned modelling data
final_df.limit(5)

In [None]:
# Persist the modelling data as parquet, replacing any previous output
final_df.write.parquet('../data/curated/modelling_consumer_fraud/', mode='overwrite')

In [None]:
# Reload the persisted modelling data from parquet
model_sdf = spark.read.format("parquet").load('../data/curated/modelling_consumer_fraud')

## Train Random Forest regression

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = model_sdf.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Report the size of the training split, then the test split
for split_sdf in (trainingData, testData):
    print(split_sdf.count())

### Baseline Model

The model only predicts the mean value for all test instances.

In [None]:
# Baseline: predict one constant for every test instance.
# The constant is the mean label of the *training* split, so no test labels
# leak into the predictions that are scored on the test split.
mean_value = trainingData.agg(F.mean("consumer_fraud")).collect()[0][0]
mean_predictions_df = testData.select("consumer_fraud").withColumn("prediction", F.lit(mean_value))

mse_evaluator = RegressionEvaluator(labelCol="consumer_fraud", predictionCol="prediction", metricName="mse")
mae_evaluator = RegressionEvaluator(labelCol="consumer_fraud", predictionCol="prediction", metricName="mae")

# Get the score
mse = mse_evaluator.evaluate(mean_predictions_df)
mae = mae_evaluator.evaluate(mean_predictions_df)

print(f"Mean Square Error: {mse}")
print(f"Mean Absolute Error: {mae}")

### Random Forest

We aim to create a simple model, therefore, the model hyperparameters were chosen arbitrarily.

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Name of the assembled feature-vector column
features = 'features'

# Every training column except the label and the two identifiers is an input
non_input_cols = ('consumer_fraud', 'order_id', 'user_id')
input_cols = [name for name in trainingData.columns if name not in non_input_cols]

# Combine the input columns into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol=features)

In [None]:
# Random forest regressor settings: 5 trees, depth limit 5, fixed seed
rf_params = dict(
    featuresCol='features',
    labelCol='consumer_fraud',
    numTrees=5,
    maxDepth=5,
    seed=42,
)
rft = RandomForestRegressor(**rf_params)

In [None]:
# Chain feature assembly and the regressor, then fit on the training split
pipeline_stages = [assembler, rft]
pipeline = Pipeline(stages=pipeline_stages)
model = pipeline.fit(trainingData)

- Make predictions.

In [None]:
# Apply the fitted pipeline to the held-out test split
predictions = model.transform(testData)

In [None]:
# First five prediction rows
predictions.take(5)

- Evaluate the predictions.

In [None]:
# One evaluator per metric, both comparing "prediction" with "consumer_fraud"
evaluators = {
    metric: RegressionEvaluator(labelCol="consumer_fraud", predictionCol="prediction", metricName=metric)
    for metric in ("mse", "mae")
}
mse_evaluator = evaluators["mse"]
mae_evaluator = evaluators["mae"]

# Score the random forest predictions on the test split
mse = mse_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)

print(f"Mean Square Error: {mse}")
print(f"Mean Absolute Error: {mae}")


In [None]:
# Compare predicted and actual consumer fraud values on the test split.
result_pd = predictions.select("prediction", "consumer_fraud").toPandas()

# Label the axes and title the figure so it can be read on its own,
# matching the other plot cells in this notebook.
plt.scatter(result_pd.consumer_fraud, result_pd.prediction)
plt.xlabel("consumer_fraud")
plt.ylabel("prediction")
plt.title("Predicted vs. actual consumer fraud (test split)")
plt.show()

### Imputation for consumer fraud probability

In [None]:
# Rows whose consumer fraud value is exactly 0 are the ones to impute;
# the label column is removed before the model is applied.
# NOTE(review): unlike the training data, no dropna is applied here - confirm
# that the remaining input columns cannot contain nulls.
needs_imputation = F.col("consumer_fraud") == 0
imputation_data = impute_consumer_fraud_sdf.where(needs_imputation).drop("consumer_fraud")

In [None]:
# Apply the fitted pipeline to the rows that need imputing
imputed_sdf = model.transform(imputation_data)

In [None]:
# Summary statistics of the imputed values
imputed_sdf.describe("prediction")

We merge imputed consumer fraud probability to the main dataset.

In [None]:
# Data with a given consumer fraud probability (the modelling set)
non_missing_consumer_fraud = model_sdf.select("order_id", "consumer_fraud")
# Data with a predicted probability; rename so both frames share column names
imputed_sdf_select = imputed_sdf.select("order_id", "prediction").withColumnRenamed("prediction", "consumer_fraud")

# Combine the two datasets. unionByName matches columns by name, whereas
# union matches by position and would silently misalign the columns if the
# select order above ever changed.
full_consumer_fraud = imputed_sdf_select.unionByName(non_missing_consumer_fraud)

In [None]:
# Persist the combined result as parquet, replacing any previous output
full_consumer_fraud.write.parquet('../data/curated/predicted_consumer_fraud/', mode='overwrite')

In [None]:
# Number of rows in the combined (given + imputed) dataset
full_consumer_fraud.count()

In [None]:
# Read the saved output back and count its rows
test = spark.read.format("parquet").load('../data/curated/predicted_consumer_fraud/')
test.count()