# Modelling: merchant fraud
In this notebook, we develop models to impute the merchant fraud rate based on current transactions and consumer data (full dataset).

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import requests
import os

In [None]:
# Create a spark session
# Session-level settings, applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "10g",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.network.timeout": "600s",
}

session_builder = SparkSession.builder.appName("Data Modelling 1")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

## Read dataset

In [None]:
# Information on merchants (loaded as a Spark DataFrame)
merchant = spark.read.parquet("../data/curated/part_1/clean_merchant.parquet")

# Information on merchant's fraud probability (loaded as a pandas DataFrame).
# NOTE(review): this frame is not referenced by any later cell visible here —
# confirm it is still needed.
merchant_fraud_prob = pd.read_csv("../data/tables/part_1/merchant_fraud_probability.csv")

### Preprocessing `goods` feature

In [None]:
from pyspark.ml.feature import CountVectorizer, PCA, StopWordsRemover
from pyspark.sql.functions import lower, regexp_replace
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import monotonically_increasing_id

In [None]:
# Keep only the free-text `goods` column as input to the text pipeline below
goods = merchant.select("goods")

In [None]:
# Normalise the `goods` description: remove every character that is neither a
# word character nor whitespace, then lower-case the result.
# The pattern is a raw string so that `\w` and `\s` reach the regex engine
# verbatim instead of being parsed as (invalid) Python string escapes.
clean_goods = goods.withColumn("str_goods", lower(regexp_replace("goods", r"[^\w\s]", "")))

# Split the normalised text into tokens
tokenizer = Tokenizer(inputCol="str_goods", outputCol="tokens")
clean_goods = tokenizer.transform(clean_goods)

# Drop stop words, then keep only the cleaned token column
remover = StopWordsRemover(inputCol="tokens", outputCol="clean_goods")
clean_goods = remover.transform(clean_goods)
clean_goods = clean_goods.select('clean_goods')

In [None]:
# Turn each token list into a term-count vector stored in `features`
count_vectorizer = (
    CountVectorizer()
    .setInputCol("clean_goods")
    .setOutputCol("features")
)

model = count_vectorizer.fit(clean_goods)
result = model.transform(clean_goods)

In [None]:
# Preview up to four rows of the count-vectorised output
result.limit(4)

In [None]:
# Project the term-count vectors onto 12 principal components
pca = PCA(k=12, inputCol="features", outputCol="pca_features")

model = pca.fit(result)
pca_result = model.transform(result)

In [None]:
# Inspect the PCA vector of the first row. `pca_result` already holds the
# transformed frame, and `first()` fetches a single row instead of collecting
# every row to the driver just to read one.
pca_result.first().pca_features

In [None]:
# Sum the explained-variance vector of the fitted PCA model
# (i.e. the share of variance captured by the retained components)
model.explainedVariance.sum()

In [None]:
# Tag each row of both frames with a generated id so they can be joined back together.
# NOTE(review): monotonically_increasing_id() values depend on partitioning and row
# order, so this presumably relies on `pca_result` and `merchant` sharing the same
# partition layout and row order — TODO confirm, or carry a real key (e.g. merchant_abn)
# through the goods pipeline instead.
pca_result = pca_result.withColumn("id", monotonically_increasing_id())
merchant = merchant.withColumn("id", monotonically_increasing_id())

In [None]:
# Attach the PCA vectors to the merchant rows via the generated id, then discard the id
merchant = (
    merchant
    .join(pca_result.select('pca_features', 'id'), on='id', how='inner')
    .drop("id")
)

In [None]:
from pyspark.sql.functions import when

In [None]:
# Define the ordered categories (position in the list is the ordinal code)
ordered_categories = ['a', 'b', 'c', 'd', 'e']

# Map the categories to their corresponding ordinal codes by chaining one
# WHEN branch per category; values outside the list are left as Null.
ordinal_revenue_expr = None
for ordinal_code, category in enumerate(ordered_categories):
    is_category = F.col("revenue_level") == category
    if ordinal_revenue_expr is None:
        ordinal_revenue_expr = when(is_category, ordinal_code)
    else:
        ordinal_revenue_expr = ordinal_revenue_expr.when(is_category, ordinal_code)

merchant = merchant.withColumn("ordinal_revenue_level", ordinal_revenue_expr)

merchant = merchant.drop("revenue_level")

### Join transaction dataset

In [None]:
# Load the curated transaction dataset and preview up to five rows
temp = spark.read.parquet('../data/curated/all_details/')
temp.limit(5)

In [None]:
# Left join: every row of `temp` is kept; merchant columns are Null where no
# merchant_abn matches. The row count is displayed as the cell output.
full_dataset = temp.join(merchant, on='merchant_abn', how='left')
full_dataset.count()

In [None]:
# Select useful columns
useful_columns = [
    # transaction identifiers and values
    "order_id", "merchant_abn", "user_id", "order_datetime", "dollar_value",
    "postcode", "merchant_fraud",
    # external attributes
    "weekly_personal_disposable", "median_total_household_income",
    "median_total_family_income", "avg_household_size", "median_age", "is_po_box",
    # merchant attributes
    "ordinal_revenue_level", "take_rate", "pca_features",
]
merchant_is_fraud_model = full_dataset.select(useful_columns)

We check the number of transactions that have a known (strictly positive) merchant fraud probability.

In [None]:
# Count rows whose merchant_fraud value is strictly positive
merchant_is_fraud_model.filter(F.col('merchant_fraud') > 0).count()

In [None]:
# Preview up to five rows of the selected columns
merchant_is_fraud_model.limit(5)

## Feature selection

We want to check the correlation between each attribute using a subsample of the dataset. The cell below draws a 1% random sample (fixed seed) of all transactions and converts it to pandas. Note that this sample is not stratified by year, and the train/test split used later in this notebook is a random 80/20 split rather than a 2021/2022 split.

In [None]:
from pyspark.sql.functions import year
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Draw a reproducible 1% sample and bring it to the driver as a pandas frame
sample_merchant_fraud_df = (
    merchant_is_fraud_model
    .sample(fraction=0.01, seed=42)
    .toPandas()
)

In [None]:
# Columns whose pandas dtype is anything other than `object`
num_cols = [
    column_name
    for column_name, column_dtype in sample_merchant_fraud_df.dtypes.items()
    if column_dtype != 'object'
]

# Restrict to sampled rows with a strictly positive merchant_fraud value
has_known_fraud = sample_merchant_fraud_df['merchant_fraud'] > 0
known_fraud_sample = sample_merchant_fraud_df.loc[has_known_fraud, num_cols]

plt.figure(figsize=(10, 10))
sns.heatmap(known_fraud_sample.corr(), annot=True, fmt='.2f')

Because pyspark ml can't handle Null values, we drop external attributes that contain Null values.

In [None]:
# Columns kept for the imputation model (the external attributes are left out)
selected_features = [
    "order_id",
    "merchant_abn",
    "dollar_value",
    "merchant_fraud",
    "ordinal_revenue_level",
    "take_rate",
    "order_datetime",
    "pca_features",
]
impute_merchant_fraud_sdf = merchant_is_fraud_model.select(*selected_features)
impute_merchant_fraud_sdf.limit(5)

In [None]:
# Total number of rows before any filtering
impute_merchant_fraud_sdf.count()

In [None]:
# Number of rows with a non-null take_rate
impute_merchant_fraud_sdf.filter(F.col("take_rate").isNotNull()).count()

As expected, some transactions have no matching merchant record (their merchant attributes, such as `take_rate`, are Null after the left join), so we remove those instances.

In [None]:
# Keep only rows where take_rate is present
impute_merchant_fraud_sdf = impute_merchant_fraud_sdf.where(
    F.col("take_rate").isNotNull()
)

Let's check the number of instances with non-missing fraud probability.

In [None]:
# Count rows whose merchant_fraud is strictly positive
impute_merchant_fraud_sdf.filter(F.col('merchant_fraud') > 0).count()

We would like to use these instances to build an imputation model for the remaining null values.

In [None]:
# Rows with a strictly positive merchant_fraud form the modelling set
has_fraud_value = F.col('merchant_fraud') > 0
model_df = impute_merchant_fraud_sdf.where(has_fraud_value)

In [None]:
# Drop every row that has a Null in any column, then report the remaining row count
final_df = model_df.na.drop(how='any')
final_df.count()

In [None]:
# Preview up to five rows of the modelling set
final_df.limit(5)

In [None]:
# Save data to file (replaces any existing output at this path)
final_df.write.parquet('../data/curated/modelling/', mode='overwrite')

In [None]:
# Read the modelling data back from the parquet directory
model_sdf = spark.read.parquet('../data/curated/modelling')

## Train Random Forest regression

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Split the data into training and test sets (20% held out for testing);
# the seed makes the random split reproducible
(trainingData, testData) = model_sdf.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Report the size of each split: training first, then test
for split_sdf in (trainingData, testData):
    print(split_sdf.count())

### Baseline Model

The model only predicts the mean value for all test instances.

In [None]:
# Baseline: predict one constant for every test row. The constant is the mean
# label of the *training* split, so that no information from the held-out rows
# leaks into the prediction being scored.
mean_value = trainingData.agg(F.mean("merchant_fraud")).collect()[0][0]
mean_predictions_df = testData.select("merchant_fraud").withColumn("prediction", F.lit(mean_value))

mse_evaluator = RegressionEvaluator(labelCol="merchant_fraud", predictionCol="prediction", metricName="mse")
mae_evaluator = RegressionEvaluator(labelCol="merchant_fraud", predictionCol="prediction", metricName="mae")

# Get the score
mse = mse_evaluator.evaluate(mean_predictions_df)
mae = mae_evaluator.evaluate(mean_predictions_df)

print(f"Mean Square Error: {mse}")
print(f"Mean Absolute Error: {mae}")

### Random Forest

We aim to create a simple model; therefore, the model hyperparameters were chosen arbitrarily.

In [None]:
from pyspark.ml import Pipeline

In [None]:
features = 'features'

# Everything except the label and the identifier/timestamp columns is a model input
excluded_cols = ('merchant_fraud', 'order_id', 'order_datetime', 'merchant_abn')
input_cols = [name for name in trainingData.columns if name not in excluded_cols]

# Combine the input columns into a single vector column named by `features`
assembler = VectorAssembler(inputCols=input_cols, outputCol=features)

In [None]:
# Random forest settings, gathered in one place
rf_params = {
    "featuresCol": 'features',
    "labelCol": 'merchant_fraud',
    "numTrees": 5,
    "maxDepth": 5,
    "seed": 42,
}
rft = RandomForestRegressor(**rf_params)

In [None]:
# Assemble the feature vector, then fit the random forest on the training split
pipeline = Pipeline().setStages([assembler, rft])
model = pipeline.fit(trainingData)

- Make predictions.

In [None]:
# Apply the fitted pipeline to the held-out test split
predictions = model.transform(testData)

- Evaluate the predictions.

In [None]:
# One evaluator per metric, both comparing `prediction` against `merchant_fraud`
mse_evaluator = RegressionEvaluator(
    labelCol="merchant_fraud", predictionCol="prediction", metricName="mse"
)
mae_evaluator = RegressionEvaluator(
    labelCol="merchant_fraud", predictionCol="prediction", metricName="mae"
)

# Get the score
mse, mae = (
    evaluator.evaluate(predictions)
    for evaluator in (mse_evaluator, mae_evaluator)
)

print(f"Mean Square Error: {mse}")
print(f"Mean Absolute Error: {mae}")


In [None]:
# Bring predictions and labels to pandas for plotting
result_pd = predictions.select("prediction", "merchant_fraud").toPandas()

# Actual vs. predicted scatter, labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.scatter(result_pd.merchant_fraud, result_pd.prediction)
ax.set(
    xlabel="Actual merchant_fraud",
    ylabel="Predicted merchant_fraud",
    title="Random forest: predicted vs. actual merchant fraud (test split)",
)
plt.show()

### Imputation for merchant fraud probability

In [None]:
# Rows with merchant_fraud equal to 0 are the ones to impute; the label column is dropped
imputation_data = (
    impute_merchant_fraud_sdf
    .where(F.col('merchant_fraud') == 0)
    .drop('merchant_fraud')
)

In [None]:
# Run the fitted pipeline over the rows selected for imputation
imputed_sdf = model.transform(imputation_data)

In [None]:
# Summary statistics of the imputed values
imputed_sdf.select("prediction").describe()

We merge imputed merchant fraud probability to the main dataset.

In [None]:
# data with given merchant fraud probability
non_missing_merchant_fraud = model_sdf.select("order_id", "merchant_fraud")

# data with imputed merchant fraud probability (the prediction becomes `merchant_fraud`)
imputed_sdf_select = imputed_sdf.select(
    "order_id", F.col("prediction").alias("merchant_fraud")
)

# Combine two dataset. Columns are matched by name rather than by position, so a
# change in column order on either side cannot silently swap order_id and merchant_fraud.
full_merchant_fraud = imputed_sdf_select.unionByName(non_missing_merchant_fraud)

In [None]:
# Save data to file (replaces any existing output at this path)
full_merchant_fraud.write.parquet('../data/curated/predicted_merchant_fraud/', mode='overwrite')

In [None]:
# Read the saved output back in as a sanity check
test = spark.read.parquet('../data/curated/predicted_merchant_fraud/')

In [None]:
# Row count of the saved output
test.count()

In [None]:
# Preview up to five rows of the saved output
test.limit(5)