In [1]:
from pyspark.sql import functions as F, SparkSession
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd

import os
os.sys.path.append("../")
from scripts.consumer_model import *


In [2]:
# Create (or reuse) the Spark session used by every cell below.
# NOTE: the executor-memory key was previously misspelled ("spark.execturo.memory"),
# so Spark silently ignored the setting; it is now spelled correctly.
spark = (
    SparkSession.builder.appName("consumer model")
    .config("spark.sql.repl.eagerEval.enabled", True)      # render DataFrames eagerly in the notebook
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

24/09/28 21:36:27 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.4.51 instead (on interface en0)
24/09/28 21:36:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


In [None]:
# Load the curated inputs: three parquet tables and two CSV extracts.
consumer_info = spark.read.parquet('../data/curated/consumer_info.parquet')
transaction_records = spark.read.parquet('../data/curated/transactions.parquet')
fraudulent_consumer_rate = spark.read.parquet('../data/curated/consumer_fraud_prob.parquet')

# The CSV files carry a header row; column types are inferred by Spark.
personal_fraud = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../data/curated/personal_fraud.csv')
)
postcode_info = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../data/curated/postcode_info.csv')
)

# Discard the leading column of each CSV
# (presumably a row index written out when the file was saved -- TODO confirm).
first_personal_fraud_col, first_postcode_info_col = personal_fraud.columns[0], postcode_info.columns[0]
personal_fraud = personal_fraud.drop(first_personal_fraud_col)
postcode_info = postcode_info.drop(first_postcode_info_col)

# Prepare data frame for modelling

In [None]:
# Attach consumer details to the fraud-probability records; the inner join
# keeps only consumer_ids that appear in both tables.
fraudulent_consumer_with_info = consumer_info.join(
    fraudulent_consumer_rate,
    on="consumer_id",
    how="inner",
)


In [None]:
# Mean fraud probability aggregated per postcode and, separately, per state.
postcode_mean_fraud_prob = F.avg("fraud_probability").alias("average_fraud_prob_of_postcode")
state_mean_fraud_prob = F.avg("fraud_probability").alias("average_fraud_prob_of_state")

fraudulent_consumer_group_by_postcode = (
    fraudulent_consumer_with_info
    .groupBy("postcode")
    .agg(postcode_mean_fraud_prob)
)

fraudulent_consumer_group_by_state = (
    fraudulent_consumer_with_info
    .groupBy("state")
    .agg(state_mean_fraud_prob)
)

In [None]:
# Collapse the fraud records to one row per consumer: the mean of that
# consumer's fraud_probability values.
average_fraudulent_consumer_rate = (
    fraudulent_consumer_rate
    .groupBy("consumer_id")
    .agg(F.avg("fraud_probability").alias("average_fraud_probability"))
)

In [None]:
# Total number of fraud-probability records (rows, not distinct consumers).
fraudulent_consumer_rate.count()

In [None]:
# Number of unique consumers with a positive average fraud probability
# (original note: no consumer is lost after merging with the transaction records).
has_positive_fraud_prob = F.col("average_fraud_probability") > 0
average_fraudulent_consumer_rate.filter(has_positive_fraud_prob).count()

In [None]:
# Enrich every transaction with the details of the consumer who made it;
# transactions without a matching consumer_id are dropped by the inner join.
consumer_transaction_records = transaction_records.join(
    consumer_info,
    on="consumer_id",
    how="inner",
)

In [None]:
# Summary statistics for order value. Original observation: dollar_value has
# a very high variance and very large values.
consumer_transaction_records.select("dollar_value").summary()

In [None]:
# Per-consumer order-value profile: central tendency, range, spread and
# purchase frequency of dollar_value.
dollar_value_aggregates = [
    F.avg("dollar_value").alias("average_dollar_value"),
    F.min("dollar_value").alias("min_dollar_value"),
    F.max("dollar_value").alias("max_dollar_value"),
    F.count("dollar_value").alias("transaction_count"),
    F.stddev("dollar_value").alias("stddev_dollar_value"),
]

# state and postcode are part of the grouping key so they are carried into the result.
consumer_transaction_value_analysis = (
    consumer_transaction_records
    .groupBy("consumer_id", "state", "postcode")
    .agg(*dollar_value_aggregates)
)

In [None]:
# Preview the first three rows of the per-consumer order-value profile.
consumer_transaction_value_analysis.show(3)

In [None]:
# Build the modelling table, one row per consumer:
#   1. left-join the order-value profile onto the per-consumer fraud average,
#   2. inner-join the postcode-level average fraud probability,
#   3. inner-join the state-level average fraud probability.
fraudulent_consumer_summary = (
    average_fraudulent_consumer_rate
    .join(consumer_transaction_value_analysis, on="consumer_id", how="left")
    .join(fraudulent_consumer_group_by_postcode, on="postcode", how="inner")
    .join(fraudulent_consumer_group_by_state, on="state", how="inner")
)

In [None]:
# Row count of the joined summary table (to check how many consumers survive the joins).
fraudulent_consumer_summary.count()

### missing 2755 in postcode_info

In [None]:
# Bring in the external datasets: state-level personal fraud figures and
# postcode-level information (including income).
# The listed postcode_info columns are dropped before the join.
postcode_info = postcode_info.drop(*["state", "long", "lat", "lgacode"])

fraudulent_consumer_summary = (
    fraudulent_consumer_summary
    .join(personal_fraud, on="state", how="inner")
    .join(postcode_info, on="postcode", how="inner")
)

In [None]:
# Express spending as a proportion of income.
# NOTE(review): every income figure is multiplied by 1.5 before dividing; the
# reason for this factor is not visible here -- confirm with the author.
max_order_value = F.col("max_dollar_value")
total_order_value = F.col("average_dollar_value") * F.col("transaction_count")

proportion_features = {
    # Largest single order relative to income
    "Proportion_between_max_order_value_mean_income": max_order_value / (F.col("mean_income") * 1.5),
    "Proportion_between_max_order_value_median_income": max_order_value / (F.col("median_income") * 1.5),
    # Total spend (average order value x number of orders) relative to income
    "Proportion_between_total_order_value_mean_income": total_order_value / (F.col("mean_income") * 1.5),
    "Proportion_between_total_order_value_median_income": total_order_value / (F.col("median_income") * 1.5),
}

# Columns are appended in the dictionary's insertion order.
for proportion_name, proportion_expr in proportion_features.items():
    fraudulent_consumer_summary = fraudulent_consumer_summary.withColumn(proportion_name, proportion_expr)



In [None]:
# Rank consumers from highest to lowest average fraud probability and show the top 20.
# The window has no partitionBy, so the ranking is computed over the whole table.
window_spec = Window.orderBy(F.desc("average_fraud_probability"))
fraud_prob_rank = F.rank().over(window_spec)

summary_rank_by_fraud_prob = fraudulent_consumer_summary.withColumn("rank", fraud_prob_rank)
summary_rank_by_fraud_prob.show(20)

In [None]:
# Inspect the column names and types of the joined summary table.
fraudulent_consumer_summary.printSchema()

# Feature engineering

## Standardise features

In [None]:
# Standardise the dollar-value features with Spark ML's StandardScaler and
# append them to the summary table as `scaled_<name>` columns.
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array



# Columns to be scaled. `average_dollar_value` is scaled AND also kept in its
# original form (it is re-appended to the unscaled list below).
cols_to_scale = ["min_dollar_value", "max_dollar_value", "stddev_dollar_value","average_dollar_value"]
cols_to_keep_unscaled = [col for col in fraudulent_consumer_summary.columns if col not in cols_to_scale] + ["average_dollar_value"]

# StandardScaler works on a single vector column, so pack the inputs first.
assembler = VectorAssembler(inputCols=cols_to_scale, outputCol="features")
sdf_transformed = assembler.transform(fraudulent_consumer_summary)
# No withMean/withStd arguments are passed, so the library defaults apply.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(sdf_transformed.select("features"))
sdf_scaled = scaler_model.transform(sdf_transformed)
# Convert the scaled vector into an array column so elements can be indexed.
scaled_array_col = vector_to_array(F.col("scaledFeatures"))

# Create new columns for each scaled feature
# (element i of the array corresponds to cols_to_scale[i]).
for i, col in enumerate(cols_to_scale):
    sdf_scaled = sdf_scaled.withColumn(f"scaled_{col}", scaled_array_col[i])

# Combine original Dataframe and the scaled features
# NOTE(review): `dollar_value_df` is not referenced again in the visible cells.
dollar_value_df = fraudulent_consumer_summary.select("average_dollar_value")
# Rebinds `fraudulent_consumer_summary`: unscaled columns first, then the scaled ones.
fraudulent_consumer_summary = sdf_scaled.select(cols_to_keep_unscaled + [f"scaled_{col}" for col in cols_to_scale])


In [None]:
# Confirm the scaled_* columns are present after standardisation.
fraudulent_consumer_summary.printSchema()

## Log transformation of features

In [None]:
# Features that are log-transformed in place (the column name is kept).
cols_to_log = [
    'scaled_average_dollar_value',
    'scaled_min_dollar_value',
    'scaled_max_dollar_value',
    'scaled_stddev_dollar_value',
    "Proportion_between_max_order_value_mean_income",
    "Proportion_between_max_order_value_median_income",
    "Proportion_between_total_order_value_mean_income",
    "Proportion_between_total_order_value_median_income",
]

# The natural log is only taken for strictly positive values; anything else
# (zero, negative, or already null) becomes null.
for log_col in cols_to_log:
    current_value = F.col(log_col)
    logged_value = F.when(current_value > 0, F.log(current_value)).otherwise(None)
    fraudulent_consumer_summary = fraudulent_consumer_summary.withColumn(log_col, logged_value)

In [None]:
# Schema after the log transformation (column names are unchanged by it).
fraudulent_consumer_summary.printSchema()

# Visualisation

## Assumptions or Observations:
1. The gender plot shows that there is a similar number of male and female consumers.
2. The number of consumers varies significantly across different states.
3. Consumers make a similar number of purchases on each day of the week, whether it’s a weekday or a weekend.
4. Both fraud probability and the dollar value of an order are strongly right-skewed and should be normalized.
5. Proportion features exhibit a linear relationship with fraud probability but may need transformation to clarify this relationship.

In [None]:
# Pull only the columns needed for plotting onto the driver as a pandas frame.
visualisation_columns = [
    "average_fraud_probability", "scaled_average_dollar_value",
    "scaled_min_dollar_value", "scaled_max_dollar_value", "transaction_count",
    "median_income", "mean_income", "state", "scaled_stddev_dollar_value",
    "Proportion_between_max_order_value_mean_income",
    "Proportion_between_max_order_value_median_income",
    "Proportion_between_total_order_value_mean_income",
    "Proportion_between_total_order_value_median_income",
]
df_pandas = fraudulent_consumer_summary.select(*visualisation_columns).toPandas()

# Plot specification: title -> (column, plot kind), consumed by feature_visualisation.
histogram_plots = {
    "Dollar Value Distribution": ("scaled_average_dollar_value", "hist"),
    "Max Dollar Value Distribution": ("scaled_max_dollar_value", "hist"),
    "Min Dollar Value Distribution": ("scaled_min_dollar_value", "hist"),
    "Std Dollar Value Distribution": ("scaled_stddev_dollar_value", "hist"),
    "Fraud Probability Distribution": ("average_fraud_probability", "hist"),
    "Transaction Count Distribution": ("transaction_count", "hist"),
}
scatter_plots = {
    "Scatter 1 (Max Order Value vs Fraud Prob - Mean Income)": ("Proportion_between_max_order_value_mean_income", "scatter1"),
    "Scatter 2 (Max Order Value vs Fraud Prob - Median Income)": ("Proportion_between_max_order_value_median_income", "scatter2"),
    "Scatter 3 (Total Order Value vs Fraud Prob - Mean Income)": ("Proportion_between_total_order_value_mean_income", "scatter3"),
    "Scatter 4 (Total Order Value vs Fraud Prob - Median Income)": ("Proportion_between_total_order_value_median_income", "scatter4"),
}
# Merge order is preserved: histograms, then the state count, then the scatter plots.
plots = {
    **histogram_plots,
    "State Count": ("state", "count"),
    **scatter_plots,
}
feature_visualisation(df_pandas, plots)


In [None]:
# Correlation heatmap over the numeric features (the categorical `state`
# column is left out of this selection).
numeric_columns = [
    "average_fraud_probability", "scaled_average_dollar_value",
    "scaled_min_dollar_value", "scaled_max_dollar_value", "transaction_count",
    "median_income", "mean_income", "scaled_stddev_dollar_value",
    "Proportion_between_max_order_value_mean_income",
    "Proportion_between_max_order_value_median_income",
    "Proportion_between_total_order_value_mean_income",
    "Proportion_between_total_order_value_median_income",
]
df_pandas = fraudulent_consumer_summary.select(*numeric_columns).toPandas()
corr_matrix = df_pandas.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap of Numeric Features")
plt.show()


# Idea
1. Time Frequency feature: https://ieeexplore.ieee.org/document/9399421/

# Modelling

In [None]:
# Feature sets used in the models. Column names are spelled once and then
# combined into the full candidate list and the per-model subsets.
SCALED_AVG = "scaled_average_dollar_value"
SCALED_MIN = "scaled_min_dollar_value"
SCALED_MAX = "scaled_max_dollar_value"
SCALED_STD = "scaled_stddev_dollar_value"
TXN_COUNT = "transaction_count"
POSTCODE_FRAUD = "average_fraud_prob_of_postcode"
MAX_OVER_MEAN = "Proportion_between_max_order_value_mean_income"
MAX_OVER_MEDIAN = "Proportion_between_max_order_value_median_income"
TOTAL_OVER_MEAN = "Proportion_between_total_order_value_mean_income"
TOTAL_OVER_MEDIAN = "Proportion_between_total_order_value_median_income"

# Every candidate feature
features = [
    SCALED_AVG, SCALED_MIN, SCALED_MAX, TXN_COUNT, POSTCODE_FRAUD, SCALED_STD,
    MAX_OVER_MEAN, MAX_OVER_MEDIAN, TOTAL_OVER_MEAN, TOTAL_OVER_MEDIAN,
]

# Decision tree subset
features_dt = [SCALED_MAX, POSTCODE_FRAUD]

# Random forest subset
features_rf = [
    SCALED_MAX, POSTCODE_FRAUD, MAX_OVER_MEDIAN,
    SCALED_STD, MAX_OVER_MEAN, SCALED_AVG,
]

# Linear regression subset
features_lr = [
    SCALED_STD, MAX_OVER_MEDIAN, POSTCODE_FRAUD, MAX_OVER_MEAN,
    TOTAL_OVER_MEAN, TOTAL_OVER_MEDIAN, SCALED_AVG,
]


In [None]:


# VectorAssembler to combine each model's feature subset into a single
# "features" vector column.
assembler_dt = VectorAssembler(inputCols=features_dt, outputCol="features")
assembler_rf = VectorAssembler(inputCols=features_rf, outputCol="features")
assembler_lr = VectorAssembler(inputCols=features_lr, outputCol="features")

# Prepare the data: one assembled DataFrame per model.
data_dt = assembler_dt.transform(fraudulent_consumer_summary)
data_rf = assembler_rf.transform(fraudulent_consumer_summary)
data_lr = assembler_lr.transform(fraudulent_consumer_summary)

# 80/20 train/test split. A fixed seed is passed so the split is repeatable
# across kernel restarts (previously randomSplit was unseeded, so every run
# trained and evaluated on different rows and the recorded metrics could not
# be reproduced).
# NOTE(review): repeatability also relies on the upstream DataFrame being
# computed deterministically -- consider caching it if results still drift.
train_data_dt, test_data_dt = data_dt.randomSplit([0.8, 0.2], seed=42)
train_data_rf, test_data_rf = data_rf.randomSplit([0.8, 0.2], seed=42)
train_data_lr, test_data_lr = data_lr.randomSplit([0.8, 0.2], seed=42)


In [None]:
# Define the three regressors. All predict `average_fraud_probability` from
# the assembled "features" vector column.
dt = DecisionTreeRegressor(labelCol="average_fraud_probability", featuresCol="features")

# The random forest samples rows and features at random; pin the seed so that
# re-running the notebook grows the same forest.
rf = RandomForestRegressor(labelCol="average_fraud_probability", featuresCol="features", seed=42)

lr = LinearRegression(labelCol="average_fraud_probability", featuresCol="features")


In [None]:
# Hyperparameter grids searched by cross-validation.

# Decision tree: 3 depths x 2 bin counts = 6 candidates
dt_param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [3, 5, 7])
    .addGrid(dt.maxBins, [32, 64])
    .build()
)

# Random forest: 2 forest sizes x 2 depths = 4 candidates
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.maxDepth, [5, 7])
    .build()
)

# Linear regression: 3 regularisation strengths x 3 elastic-net mixes = 9 candidates
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)


In [None]:
# Evaluators for regression models: RMSE and R2 against the true
# average_fraud_probability.
rmse_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability",
    predictionCol="prediction",
    metricName="rmse"
)

r2_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability",
    predictionCol="prediction",
    metricName="r2"
)

# 3-fold cross-validation, selecting the candidate with the best R2.
# A fixed seed is passed so that rows are assigned to the same folds on every
# run; without it the selected hyperparameters can change between runs.
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=42
)

rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_param_grid,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=42
)

lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=r2_evaluator,
    numFolds=3,
    seed=42
)


In [None]:
# Wrap each cross-validator in a single-stage Pipeline.
dt_pipeline, rf_pipeline, lr_pipeline = (
    Pipeline(stages=[cross_validator])
    for cross_validator in (dt_cv, rf_cv, lr_cv)
)


In [None]:
# Fit model

# Runs the decision-tree cross-validation on the training split.
# 6 mins (runtime noted by the author)
dt_model = dt_pipeline.fit(train_data_dt)


In [None]:
# Runs the random-forest cross-validation on the training split.
# 7 mins (runtime noted by the author)
rf_model = rf_pipeline.fit(train_data_rf)

In [None]:
# Runs the linear-regression cross-validation on the training split.
# 4 mins (runtime noted by the author)
lr_model = lr_pipeline.fit(train_data_lr)

# Evaluation

In [None]:
# Score each fitted model on its held-out test split and report RMSE and R2.
# The trailing comments are metrics recorded by the author from an earlier run.

# Decision tree
dt_predictions = dt_model.transform(test_data_dt)
dt_rmse, dt_r2 = rmse_evaluator.evaluate(dt_predictions), r2_evaluator.evaluate(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree R2: {dt_r2}")           # RMSE: 6.321439678625029 R2: 0.5426608139433955

# Random forest
rf_predictions = rf_model.transform(test_data_rf)
rf_rmse, rf_r2 = rmse_evaluator.evaluate(rf_predictions), r2_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")            # RMSE: 6.2324836442813565 R2: 0.5554417100291846

# Linear regression
lr_predictions = lr_model.transform(test_data_lr)
lr_rmse, lr_r2 = rmse_evaluator.evaluate(lr_predictions), r2_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Linear Regression R2: {lr_r2}")  # RMSE: 7.031157543875003 R2: 0.434203734698708


`Best model hyperparameters`

In [None]:
# Report the hyperparameters of the model selected by cross-validation.
# The CrossValidatorModel is the last (and only) stage of each fitted pipeline.
best_dt_model = dt_model.stages[-1].bestModel
# Use the public getters rather than reaching into the private `_java_obj`
# handle, matching how the random forest is queried below.
print(f"Best Decision Tree maxDepth: {best_dt_model.getMaxDepth()}")
print(f"Best Decision Tree maxBins: {best_dt_model.getMaxBins()}")


best_rf_model = rf_model.stages[-1].bestModel
# `getNumTrees` is a property on PySpark tree-ensemble models, so it is
# intentionally accessed without parentheses.
print(f"Best Random Forest numTrees: {best_rf_model.getNumTrees}")
print(f"Best Random Forest maxDepth: {best_rf_model.getMaxDepth()}")


### __Feature importances__

In [None]:
# Recover each model's feature names, in vector order, from its assembler.
feature_names_dt, feature_names_rf, feature_names_lr = (
    model_assembler.getInputCols()
    for model_assembler in (assembler_dt, assembler_rf, assembler_lr)
)

`Random forest and decision tree`

In [None]:
def _importance_table(feature_names, importances):
    """Return a pandas frame of feature importances, most important first.

    feature_names: names in the same order as the model's feature vector.
    importances:   Spark vector of importances (converted with .toArray()).
    """
    table = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances.toArray()
    })
    return table.sort_values(by="Importance", ascending=False)


# Best models chosen by cross-validation (first and only pipeline stage).
best_rf_model = rf_model.stages[0].bestModel
best_dt_model = dt_model.stages[0].bestModel

dt_feature_importances = best_dt_model.featureImportances
rf_feature_importances = best_rf_model.featureImportances

rf_importances_df = _importance_table(feature_names_rf, rf_feature_importances)
dt_importances_df = _importance_table(feature_names_dt, dt_feature_importances)

print(rf_importances_df)
print()
print(dt_importances_df)

`Linear regression`

In [None]:

# Get coefficients of the best linear regression model chosen by cross-validation.
best_lr_model = lr_model.stages[0].bestModel
coefficients = best_lr_model.coefficients

# Pair each coefficient with its feature name (names come from the VectorAssembler).
# The Spark vector is converted to a NumPy array explicitly, as is done for the
# tree importances, rather than relying on pandas to coerce a Spark vector.
feature_importances = pd.DataFrame({
    "Feature": feature_names_lr,
    "Coefficient": coefficients.toArray()
}).sort_values(by="Coefficient", ascending=False)

print(feature_importances)


In [None]:
# Preview actual vs predicted fraud probability for the random forest.
# Limited to 20 rows: printing up to 10,000 rows floods the notebook output
# and bloats the saved file without adding insight.
rf_predictions.select("average_fraud_probability", "prediction").show(20)