In [1]:
from pyspark.sql import functions as F, SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd


In [2]:
# Create a Spark Session
spark = (
    SparkSession.builder.appName("consumer model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.execturo.memory", "2g")
    .getOrCreate()
)

24/09/10 14:12:51 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.13.18.159 instead (on interface en0)
24/09/10 14:12:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/10 14:12:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load data
consumer_info = spark.read.parquet('../data/curated/consumer_info.parquet')
transaction_records = spark.read.parquet('../data/curated/transactions.parquet')
fraudulent_consumer_rate = spark.read.parquet('../data/curated/consumer_fraud_prob.parquet')

                                                                                

In [38]:
fraudulent_consumer_rate

order_datetime,fraud_probability,consumer_id
2022-02-20,9.80543113652096,1195503
2021-08-30,9.599513915425788,179208
2021-09-25,10.069850934775245,179208
2021-11-03,8.300636455314633,1194530
2021-10-09,9.63330241109042,154128
2022-02-08,9.02022421158597,712975
2021-10-04,10.868364868449886,712975
2022-01-11,27.496186536467164,712975
2021-12-12,10.459280127078758,407340
2021-11-17,8.531261989227714,650435


In [36]:
transaction_records.join(fraudulent_consumer_rate, on=["order_datetime", "consumer_id"], how="inner")

order_datetime,consumer_id,merchant_abn,dollar_value,order_id,fraud_probability
2021-11-26,838663,43725628716,2684.361417897887,52133cb6-4950-44c...,9.241197906604826
2021-11-26,838663,45629217853,0.1960776257498843,e5b1880d-fb88-4fd...,9.241197906604826
2021-11-26,838663,73499119023,9.260984119033456,a576a634-dcd0-445...,9.241197906604826
2021-11-26,640907,30623214058,111.91165017514544,362fc544-e9c0-449...,15.645332445256162
2021-11-26,640907,79417999332,54.89560911946132,3c246238-a15f-429...,15.645332445256162
2021-11-26,640907,18305506006,5.198633525613379,e4cb215d-26d8-4e4...,15.645332445256162
2021-11-26,640907,96244711717,52.48791119040359,3a3c80b1-be38-447...,15.645332445256162
2021-11-26,640907,34967436738,4298.699244530939,2630e79b-6148-4b3...,15.645332445256162
2021-11-26,640907,16248082282,361.5545523024534,8f448a5a-fe7c-4fc...,15.645332445256162
2021-11-26,192322,45629217853,26.34821606933028,99c767d9-811b-4cd...,19.10895425125948


In [4]:
fraudulent_consumer_with_info = consumer_info.join(fraudulent_consumer_rate, on="consumer_id", how="inner")


In [5]:
# Average fraud probability in each postcode or state
fraudulent_consumer_group_by_postcode = fraudulent_consumer_with_info.groupBy(["postcode"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_postcode"))

fraudulent_consumer_group_by_state = fraudulent_consumer_with_info.groupBy(["state"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_state"))

In [6]:
# Get average fraud prob for each consumer
average_fraudulent_consumer_rate = fraudulent_consumer_rate.groupBy("consumer_id").agg(F.avg("fraud_probability").alias("average_fraud_probability"))

In [7]:
fraudulent_consumer_rate.count()

34864

In [8]:
# number of unique available fraudulent consumer
average_fraudulent_consumer_rate.where(average_fraudulent_consumer_rate["average_fraud_probability"]>0).count()

                                                                                

20128

In [9]:
# Add consumer info to transaction records
consumer_transaction_records = transaction_records.join(consumer_info, on="consumer_id", how="inner")

In [10]:
# order has really high variance and value
consumer_transaction_records.select("dollar_value").summary()

24/09/10 14:13:05 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

summary,dollar_value
count,12561377.0
mean,166.33982036554346
stddev,520.3624254515656
min,9.756658099412162e-08
25%,26.131201796296963
50%,62.239336842753886
75%,150.40441997997308
max,105193.88578925544


In [11]:
# analysis order value, consider the variance of order value and purchase frequency
consumer_transaction_value_analysis =  consumer_transaction_records.groupBy("consumer_id", "state", "postcode") \
                                        .agg(
                                            F.avg("dollar_value").alias("average_dollar_value"),
                                            F.min("dollar_value").alias("min_dollar_value"),
                                            F.max("dollar_value").alias("max_dollar_value"),
                                            F.count("dollar_value").alias("transaction_count"),
                                            F.stddev("dollar_value").alias("stddev_dollar_value")
                                        )

In [12]:
# Prepare data frame for modelling
fraudulent_consumer_summary = average_fraudulent_consumer_rate \
    .join(consumer_transaction_value_analysis, on="consumer_id", how="left") \
    .join(fraudulent_consumer_group_by_postcode, on="postcode", how="inner") \
    .join(fraudulent_consumer_group_by_state, on="state", how="inner")

In [13]:
fraudulent_consumer_summary.show(3)




+-----+--------+-----------+-------------------------+--------------------+--------------------+------------------+-----------------+-------------------+------------------------------+---------------------------+
|state|postcode|consumer_id|average_fraud_probability|average_dollar_value|    min_dollar_value|  max_dollar_value|transaction_count|stddev_dollar_value|average_fraud_prob_of_postcode|average_fraud_prob_of_state|
+-----+--------+-----------+-------------------------+--------------------+--------------------+------------------+-----------------+-------------------+------------------------------+---------------------------+
|  VIC|    3171|    1463154|       11.878266056831936|   195.7567758780678|0.002626295938134...| 9383.286300772766|              508|  639.0572766022099|            11.643992863529116|         15.162124631050172|
|   NT|     810|     240762|       10.591450552633916|  191.08335987971577|0.043988124325446215| 4405.580619592672|              537|  446.069816106

                                                                                

# Idea
1. Time Frequency feature: https://ieeexplore.ieee.org/document/9399421/

# Modelling

In [14]:
# List of features to be used in the model
features = [
    "average_dollar_value", "min_dollar_value", "max_dollar_value", 
    "transaction_count", "stddev_dollar_value", 
    "average_fraud_prob_of_postcode", "average_fraud_prob_of_state"
]

# VectorAssembler to combine the features into a single vector
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Prepare the data
data = assembler.transform(fraudulent_consumer_summary)

train_data, test_data = data.randomSplit([0.8, 0.2])


In [15]:
train_data.select("features").show(1, truncate=False)



+----------------------------------------------------------------------------------------------------------------------+
|features                                                                                                              |
+----------------------------------------------------------------------------------------------------------------------+
|[176.18461160918955,0.4200595653096262,8936.232778956677,515.0,511.0334385468561,13.054126033171633,14.40842251077964]|
+----------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



                                                                                

In [16]:
# Define model regressor
dt = DecisionTreeRegressor(labelCol="average_fraud_probability", featuresCol="features")

rf = RandomForestRegressor(labelCol="average_fraud_probability", featuresCol="features")

lr = LinearRegression(labelCol="average_fraud_probability", featuresCol="features")


In [17]:
# Parameter grid
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [3, 5, 7]) \
    .addGrid(dt.maxBins, [32, 64]) \
    .build()


rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 7]) \
    .build()

lr_param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()


In [18]:
# Evaluator for regression models
rmse_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability", 
    predictionCol="prediction", 
    metricName="rmse"  
)

r2_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability",
    predictionCol="prediction",
    metricName="r2" 
)

# Cross-validation 
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)

rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)

lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=r2_evaluator,
    numFolds=3  # Use 3 folds for cross-validation
)


In [19]:
# Pipeline 
dt_pipeline = Pipeline(stages=[dt_cv])

rf_pipeline = Pipeline(stages=[rf_cv])

lr_pipeline = Pipeline(stages=[lr_cv])


In [20]:
# Fit model

# 6 mins
dt_model = dt_pipeline.fit(train_data)

# 7 mins
rf_model = rf_pipeline.fit(train_data)

# 4 mins
lr_model = lr_pipeline.fit(train_data)


24/09/10 14:23:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/09/10 14:23:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/09/10 14:23:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

# Evaluation

In [21]:
# Make predictions on the test data
dt_predictions = dt_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
lr_predictions = lr_model.transform(test_data)


dt_rmse = rmse_evaluator.evaluate(dt_predictions)
dt_r2 = r2_evaluator.evaluate(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree R2: {dt_r2}")           # RMSE: 6.321439678625029 R2: 0.5426608139433955

rf_rmse = rmse_evaluator.evaluate(rf_predictions)
rf_r2 = r2_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")            # RMSE: 6.2324836442813565 R2: 0.5554417100291846

lr_rmse = rmse_evaluator.evaluate(lr_predictions)
lr_r2 = r2_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Linear Regression R2: {lr_r2}")  # RMSE: 7.031157543875003 R2: 0.434203734698708


                                                                                

Decision Tree RMSE: 6.321439678625029
Decision Tree R2: 0.5426608139433955


                                                                                

Random Forest RMSE: 6.2324836442813565
Random Forest R2: 0.5554417100291846




Linear Regression RMSE: 7.031157543875003
Linear Regression R2: 0.434203734698708


                                                                                

`Best model hyperparameters`

In [26]:
best_dt_model = dt_model.stages[-1].bestModel
print(f"Best Decision Tree maxDepth: {best_dt_model._java_obj.getMaxDepth()}")
print(f"Best Decision Tree maxBins: {best_dt_model._java_obj.getMaxBins()}")


best_rf_model = rf_model.stages[-1].bestModel
print(f"Best Random Forest numTrees: {best_rf_model.getNumTrees}")
print(f"Best Random Forest maxDepth: {best_rf_model.getMaxDepth()}")


Best Decision Tree maxDepth: 5
Best Decision Tree maxBins: 64
Best Random Forest numTrees: 20
Best Random Forest maxDepth: 7


### __Feature importances__

In [23]:
feature_names = assembler.getInputCols()

`Random forest and decision tree`

In [24]:
best_rf_model = rf_model.stages[0].bestModel
best_dt_model = dt_model.stages[0].bestModel

dt_feature_importances = best_dt_model.featureImportances
rf_feature_importances = best_rf_model.featureImportances

rf_importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": rf_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

dt_importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": dt_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

print(rf_importances_df)
print()
print(dt_importances_df)

                          Feature  Importance
2                max_dollar_value    0.383732
5  average_fraud_prob_of_postcode    0.249865
4             stddev_dollar_value    0.226143
0            average_dollar_value    0.090765
3               transaction_count    0.021227
1                min_dollar_value    0.019103
6     average_fraud_prob_of_state    0.009165

                          Feature  Importance
2                max_dollar_value    0.703590
5  average_fraud_prob_of_postcode    0.279638
0            average_dollar_value    0.015683
4             stddev_dollar_value    0.000978
1                min_dollar_value    0.000112
3               transaction_count    0.000000
6     average_fraud_prob_of_state    0.000000


`Linear regression`

In [25]:

# Get coefficients 
best_lr_model = lr_model.stages[0].bestModel
coefficients = best_lr_model.coefficients
# Get feature names from the VectorAssembler
feature_importances = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", ascending=False)

print(feature_importances)


                          Feature  Coefficient
5  average_fraud_prob_of_postcode     0.603199
1                min_dollar_value     0.273245
6     average_fraud_prob_of_state     0.153361
2                max_dollar_value     0.000983
4             stddev_dollar_value     0.000000
3               transaction_count    -0.017589
0            average_dollar_value    -0.049920


In [30]:
rf_predictions.select("average_fraud_probability", "prediction").show(10000)

                                                                                

+-------------------------+------------------+
|average_fraud_probability|        prediction|
+-------------------------+------------------+
|        9.473181948042622| 12.18927935112915|
|        11.49513732973274| 11.71641928784909|
|        33.18630107727179|19.557708030362992|
|        12.57865153563013|11.832442387136975|
|       12.568961988201659|14.622638075995198|
|       10.656490538693927|12.998952135472658|
|       13.258235584580708|11.334913934976008|
|       12.959974799338081| 11.77849476654217|
|       14.822301415521576|12.938976006917695|
|        25.06049099242739|26.780554346300942|
|        17.55587770553267| 16.71016275274032|
|        19.56544083296044|18.393791634933184|
|        9.607005444110502|12.734369975319323|
|        16.91097618557946|16.870205970511165|
|       11.658766388823688| 14.89245274764572|
|        9.640300666594097|10.832808800836863|
|       25.054252963067885|22.402784140970795|
|       16.600460417885063|14.362034268140428|
|        9.20