In [2]:
from pyspark.sql import functions as F, SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd


In [3]:
# Create a Spark Session
spark = (
    SparkSession.builder.appName("consumer model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.execturo.memory", "2g")
    .getOrCreate()
)

24/09/08 21:50:37 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 100.92.15.134 instead (on interface en0)
24/09/08 21:50:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/08 21:50:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load data
consumer_info = spark.read.parquet('../data/curated/consumer_info.parquet')
transaction_records = spark.read.parquet('../data/curated/transaction_records.parquet')
fraudulent_consumer_rate = spark.read.parquet('../data/curated/consumer_fraud_rate.parquet')

                                                                                

In [5]:
fraudulent_consumer_with_info = consumer_info.join(fraudulent_consumer_rate, on="consumer_id", how="inner")


In [6]:
# Average fraud probability in each postcode or state
fraudulent_consumer_group_by_postcode = fraudulent_consumer_with_info.groupBy(["postcode"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_postcode"))

fraudulent_consumer_group_by_state = fraudulent_consumer_with_info.groupBy(["state"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_state"))

In [7]:
# Get average fraud prob for each consumer
average_fraudulent_consumer_rate = fraudulent_consumer_rate.groupBy("consumer_id").agg(F.avg("fraud_probability").alias("average_fraud_probability"))

In [8]:
fraudulent_consumer_rate.count()

34864

In [9]:
# number of unique available fraudulent consumer
average_fraudulent_consumer_rate.where(average_fraudulent_consumer_rate["average_fraud_probability"]>0).count()

                                                                                

20128

In [10]:
# Add consumer info to transaction records
consumer_transaction_records = transaction_records.join(consumer_info, on="consumer_id", how="inner")

In [11]:
# order has really high variance and value
consumer_transaction_records.select("dollar_value").summary()

24/09/08 21:50:51 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

summary,dollar_value
count,14195505.0
mean,166.22895390891753
stddev,517.8505602612809
min,9.756658099412162e-08
25%,26.128899475514228
50%,62.22903376686649
75%,150.41282294992547
max,105193.88578925544


In [12]:
# analysis order value, consider the variance of order value and purchase frequency
consumer_transaction_value_analysis =  consumer_transaction_records.groupBy("consumer_id", "state", "postcode") \
                                        .agg(
                                            F.avg("dollar_value").alias("average_dollar_value"),
                                            F.min("dollar_value").alias("min_dollar_value"),
                                            F.max("dollar_value").alias("max_dollar_value"),
                                            F.count("dollar_value").alias("transaction_count"),
                                            F.stddev("dollar_value").alias("stddev_dollar_value")
                                        )

In [13]:
# Prepare data frame for modelling
fraudulent_consumer_summary = average_fraudulent_consumer_rate \
    .join(consumer_transaction_value_analysis, on="consumer_id", how="left") \
    .join(fraudulent_consumer_group_by_postcode, on="postcode", how="inner") \
    .join(fraudulent_consumer_group_by_state, on="state", how="inner")

In [14]:
fraudulent_consumer_summary.show(3)




+-----+--------+-----------+-------------------------+--------------------+--------------------+------------------+-----------------+-------------------+------------------------------+---------------------------+
|state|postcode|consumer_id|average_fraud_probability|average_dollar_value|    min_dollar_value|  max_dollar_value|transaction_count|stddev_dollar_value|average_fraud_prob_of_postcode|average_fraud_prob_of_state|
+-----+--------+-----------+-------------------------+--------------------+--------------------+------------------+-----------------+-------------------+------------------------------+---------------------------+
|  VIC|    3171|    1463154|       11.878266056831936|  187.92238278341324|0.002626295938134...| 9383.286300772766|              578|   601.940558510518|            11.643992863529116|         15.162124631050172|
|   NT|     810|     240762|       10.591450552633916|  187.84754214087144|0.043988124325446215| 4405.580619592672|              591| 434.1325181138

                                                                                

# Idea
1. Time Frequency feature: https://ieeexplore.ieee.org/document/9399421/

# Modelling

In [15]:
# List of features to be used in the model
features = [
    "average_dollar_value", "min_dollar_value", "max_dollar_value", 
    "transaction_count", "stddev_dollar_value", 
    "average_fraud_prob_of_postcode", "average_fraud_prob_of_state"
]

# VectorAssembler to combine the features into a single vector
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Prepare the data
data = assembler.transform(fraudulent_consumer_summary)

train_data, test_data = data.randomSplit([0.8, 0.2])


In [16]:
train_data.select("features").show(1, truncate=False)



+-----------------------------------------------------------------------------------------------------------------------+
|features                                                                                                               |
+-----------------------------------------------------------------------------------------------------------------------+
|[141.6439163612012,0.19560924922318007,3565.3935880300764,594.0,253.34065624200048,17.56001508791763,14.40842251077964]|
+-----------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



                                                                                

In [17]:
# Define model regressor
dt = DecisionTreeRegressor(labelCol="average_fraud_probability", featuresCol="features")

rf = RandomForestRegressor(labelCol="average_fraud_probability", featuresCol="features")

lr = LinearRegression(labelCol="average_fraud_probability", featuresCol="features")


In [18]:
# Parameter grid
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [3, 5, 7]) \
    .addGrid(dt.maxBins, [32, 64]) \
    .build()


rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 7]) \
    .build()

lr_param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()


In [19]:
# Evaluator for regression models
rmse_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability", 
    predictionCol="prediction", 
    metricName="rmse"  
)

r2_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability",
    predictionCol="prediction",
    metricName="r2" 
)

# Cross-validation 
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)

rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)

lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=r2_evaluator,
    numFolds=3  # Use 3 folds for cross-validation
)


In [20]:
# Pipeline 
dt_pipeline = Pipeline(stages=[dt_cv])

rf_pipeline = Pipeline(stages=[rf_cv])

lr_pipeline = Pipeline(stages=[lr_cv])


In [34]:
# Fit model

# 6 mins
#dt_model = dt_pipeline.fit(train_data)

# 7 mins
#rf_model = rf_pipeline.fit(train_data)

# 4 mins
lr_model = lr_pipeline.fit(train_data)


24/09/08 22:15:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/09/08 22:15:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/09/08 22:15:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

# Evaluation

In [None]:
# Make predictions on the test data
dt_predictions = dt_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
lr_predictions = lr_model.transform(test_data)


dt_rmse = rmse_evaluator.evaluate(dt_predictions)
dt_r2 = r2_evaluator.evaluate(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree R2: {dt_r2}")           # RMSE: 6.93 R2: 0.45

rf_rmse = rmse_evaluator.evaluate(rf_predictions)
rf_r2 = r2_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")            # RMSE: 6.438397547267348 R2: 0.544903685111079

lr_rmse = rmse_evaluator.evaluate(lr_predictions)
lr_r2 = r2_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Linear Regression R2: {lr_r2}")  # RMSE: 7.547075797936904 R2: 0.400604603315338


`Best model hyperparameters`

In [None]:
# best_dt_model = dt_model.stages[-1].bestModel
# print(f"Best Decision Tree maxDepth: {best_dt_model._java_obj.getMaxDepth()}")
# print(f"Best Decision Tree maxBins: {best_dt_model._java_obj.getMaxBins()}")


# best_rf_model = rf_model.stages[-1].bestModel
# print(f"Best Random Forest numTrees: {best_rf_model.getNumTrees}")
# print(f"Best Random Forest maxDepth: {best_rf_model.getMaxDepth()}")


### __Feature importances__

In [24]:
feature_names = assembler.getInputCols()

`Random forest and decision tree`

In [33]:
best_rf_model = rf_model.stages[0].bestModel
best_dt_model = dt_model.stages[0].bestModel

dt_feature_importances = best_dt_model.featureImportances
rf_feature_importances = best_rf_model.featureImportances

rf_importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": rf_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

dt_importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": dt_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

print(rf_importances_df)
print()
print(dt_importances_df)

                          Feature  Importance
2                max_dollar_value    0.400778
5  average_fraud_prob_of_postcode    0.300999
4             stddev_dollar_value    0.161478
0            average_dollar_value    0.085434
3               transaction_count    0.022092
1                min_dollar_value    0.020101
6     average_fraud_prob_of_state    0.009117

                          Feature  Importance
2                max_dollar_value    0.665321
5  average_fraud_prob_of_postcode    0.330762
0            average_dollar_value    0.002912
3               transaction_count    0.001005
1                min_dollar_value    0.000000
4             stddev_dollar_value    0.000000
6     average_fraud_prob_of_state    0.000000


`Linear regression`

In [35]:

# Get coefficients 
best_lr_model = lr_model.stages[0].bestModel
coefficients = best_lr_model.coefficients
# Get feature names from the VectorAssembler
feature_importances = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", ascending=False)

print(feature_importances)


                          Feature  Coefficient
5  average_fraud_prob_of_postcode     0.665090
6     average_fraud_prob_of_state     0.076113
2                max_dollar_value     0.000872
4             stddev_dollar_value     0.000000
3               transaction_count    -0.012840
0            average_dollar_value    -0.042757
1                min_dollar_value    -0.043693
