In [1]:
from pyspark.sql import functions as F, SparkSession
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd



In [2]:
# Create a Spark Session
spark = (
    SparkSession.builder.appName("consumer model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.execturo.memory", "2g")
    .getOrCreate()
)

24/09/27 16:22:02 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.4.51 instead (on interface en0)
24/09/27 16:22:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/27 16:22:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load data
consumer_info = spark.read.parquet('../data/curated/consumer_info.parquet')
transaction_records = spark.read.parquet('../data/curated/transactions.parquet')
fraudulent_consumer_rate = spark.read.parquet('../data/curated/consumer_fraud_prob.parquet')
personal_fraud = spark.read.csv('../data/curated/personal_fraud.csv', header=True, inferSchema=True)
postcode_info = spark.read.csv('../data/curated/postcode_info.csv', header=True, inferSchema=True)

personal_fraud = personal_fraud.drop(personal_fraud.columns[0])
postcode_info = postcode_info.drop(postcode_info.columns[0])

24/09/27 16:22:24 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

# Prepare data frame for modelling

In [4]:
fraudulent_consumer_with_info = consumer_info.join(fraudulent_consumer_rate, on="consumer_id", how="inner")


In [5]:
# Average fraud probability in each postcode or state
fraudulent_consumer_group_by_postcode = fraudulent_consumer_with_info.groupBy(["postcode"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_postcode"))

fraudulent_consumer_group_by_state = fraudulent_consumer_with_info.groupBy(["state"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_state"))

In [6]:
# Get average fraud prob for each consumer
average_fraudulent_consumer_rate = fraudulent_consumer_rate.groupBy("consumer_id").agg(F.avg("fraud_probability").alias("average_fraud_probability"))

In [7]:
fraudulent_consumer_rate.count()

                                                                                

34864

In [8]:
# number of unique available fraudulent consumer (does not lost any consumer after merging with transaction records)
average_fraudulent_consumer_rate.where(average_fraudulent_consumer_rate["average_fraud_probability"]>0).count()

                                                                                

20128

In [9]:
# Add consumer info to transaction records
consumer_transaction_records = transaction_records.join(consumer_info, on="consumer_id", how="inner")

In [10]:
# order has really high variance and value
consumer_transaction_records.select("dollar_value").summary()

                                                                                

summary,dollar_value
count,12561377.0
mean,166.33982036554346
stddev,520.3624254515656
min,9.756658099412162e-08
25%,26.131201796296963
50%,62.239336842753886
75%,150.40441997997308
max,105193.88578925544


In [11]:
# analysis order value, consider the variance of order value and purchase frequency
consumer_transaction_value_analysis =  consumer_transaction_records.groupBy("consumer_id", "state", "postcode") \
                                        .agg(
                                            F.avg("dollar_value").alias("average_dollar_value"),
                                            F.min("dollar_value").alias("min_dollar_value"),
                                            F.max("dollar_value").alias("max_dollar_value"),
                                            F.count("dollar_value").alias("transaction_count"),
                                            F.stddev("dollar_value").alias("stddev_dollar_value")
                                        )

In [12]:
consumer_transaction_value_analysis.show(3)



+-----------+-----+--------+--------------------+--------------------+-----------------+-----------------+-------------------+
|consumer_id|state|postcode|average_dollar_value|    min_dollar_value| max_dollar_value|transaction_count|stddev_dollar_value|
+-----------+-----+--------+--------------------+--------------------+-----------------+-----------------+-------------------+
|    1144223|  QLD|    4184|    139.415319286921| 0.18241961005131369|2570.932085986033|              517| 231.43064462344782|
|    1463154|  VIC|    3171|   195.7567758780678|0.002626295938134...|9383.286300772766|              508|  639.0572766022099|
|     240762|   NT|     810|  191.08335987971577|0.043988124325446215|4405.580619592672|              537|  446.0698161068607|
+-----------+-----+--------+--------------------+--------------------+-----------------+-----------------+-------------------+
only showing top 3 rows



                                                                                

In [13]:
fraudulent_consumer_summary = average_fraudulent_consumer_rate \
    .join(consumer_transaction_value_analysis, on="consumer_id", how="left") \
    .join(fraudulent_consumer_group_by_postcode, on="postcode", how="inner") \
    .join(fraudulent_consumer_group_by_state, on="state", how="inner")

In [14]:
fraudulent_consumer_summary.count()

                                                                                

20128

### missing 2755 in postcode_info

In [15]:
fraudulent_consumer_summary

                                                                                

state,postcode,consumer_id,average_fraud_probability,average_dollar_value,min_dollar_value,max_dollar_value,transaction_count,stddev_dollar_value,average_fraud_prob_of_postcode,average_fraud_prob_of_state
VIC,3171,1463154,11.878266056831936,195.7567758780678,0.002626295938134...,9383.286300772766,508,639.0572766022099,11.643992863529116,15.162124631050172
NT,810,240762,10.591450552633916,191.08335987971577,0.043988124325446215,4405.580619592672,537,446.0698161068607,10.85295043657941,15.30870879989026
SA,5271,658654,9.979160224757958,157.68630577817763,0.0515263765371069,3672.1862187126712,521,316.4936962413073,16.08544883927027,15.14459123894584
SA,5038,342179,31.92187884980637,158.60145937267436,0.056428449086918686,7924.754329522795,544,450.2467178984997,20.021300006227385,15.14459123894584
NSW,1010,109502,24.54433270717597,155.82556821287568,0.39656267322700633,6080.485779200803,515,388.95545648565377,15.574006413021298,15.133761731460218
NSW,1350,776997,11.75529448002846,187.9270526979037,0.04273264314482915,5483.98493938382,528,467.82653534103486,12.85967507383038,15.133761731460218
VIC,3544,200478,16.164962962897075,143.84227065063257,0.06316666178035278,3834.018657300936,514,271.04404381934063,16.824533417323416,15.162124631050172
SA,5700,1388083,20.1500926310324,176.53490633747415,0.13371196377850894,5334.163999949978,503,407.67597148519087,16.43775655390785,15.14459123894584
NSW,2320,1230206,11.164098609212452,130.42526403234535,0.12886570901148534,2231.049365950385,512,229.13408810294564,14.590538272727914,15.133761731460218
VIC,3188,129565,11.506772272004188,149.0788468074076,0.13578697041492802,2608.142748449603,520,255.5595765689018,22.052019308484137,15.162124631050172


In [16]:
# Get infomation about personal fraud and income from external dataset
postcode_info = postcode_info.drop("state", "long", "lat", "lgacode")
fraudulent_consumer_summary = fraudulent_consumer_summary.join(personal_fraud, on="state", how="inner")
fraudulent_consumer_summary = fraudulent_consumer_summary.join(postcode_info, on="postcode", how="inner")

In [17]:
# Get proportion of the money used to purchase item with respect to income
# average income
fraudulent_consumer_summary = fraudulent_consumer_summary.withColumn("Proportion_between_max_order_value_mean_income", F.col("max_dollar_value") / (F.col("mean_income") * 1.5) )
fraudulent_consumer_summary = fraudulent_consumer_summary.withColumn("Proportion_between_max_order_value_median_income", F.col("max_dollar_value") / (F.col("median_income") * 1.5))

# Total income
fraudulent_consumer_summary = fraudulent_consumer_summary.withColumn("Proportion_between_total_order_value_mean_income", F.col("average_dollar_value") * F.col("transaction_count") / (F.col("mean_income") * 1.5))
fraudulent_consumer_summary = fraudulent_consumer_summary.withColumn("Proportion_between_total_order_value_median_income", F.col("average_dollar_value") * F.col("transaction_count") / (F.col("median_income") * 1.5))



In [18]:
window_spec = Window.orderBy(F.col("average_fraud_probability").desc())
summary_rank_by_fraud_prob = fraudulent_consumer_summary.withColumn("rank", F.rank().over(window_spec))
summary_rank_by_fraud_prob.show(20)

24/09/27 16:24:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/27 16:24:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/27 16:24:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/27 16:24:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/27 16:24:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/27 16:24:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/27 1

+--------+-----+-----------+-------------------------+--------------------+--------------------+------------------+-----------------+-------------------+------------------------------+---------------------------+------------------+-----------+----------+-------------+-----------+----------------------------------------------+------------------------------------------------+------------------------------------------------+--------------------------------------------------+----+
|postcode|state|consumer_id|average_fraud_probability|average_dollar_value|    min_dollar_value|  max_dollar_value|transaction_count|stddev_dollar_value|average_fraud_prob_of_postcode|average_fraud_prob_of_state|victimisation_rate|rse_percent|median_age|median_income|mean_income|Proportion_between_max_order_value_mean_income|Proportion_between_max_order_value_median_income|Proportion_between_total_order_value_mean_income|Proportion_between_total_order_value_median_income|rank|
+--------+-----+-----------+--------

                                                                                

In [19]:
fraudulent_consumer_summary.printSchema()

root
 |-- postcode: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- average_fraud_probability: double (nullable = true)
 |-- average_dollar_value: double (nullable = true)
 |-- min_dollar_value: double (nullable = true)
 |-- max_dollar_value: double (nullable = true)
 |-- transaction_count: long (nullable = true)
 |-- stddev_dollar_value: double (nullable = true)
 |-- average_fraud_prob_of_postcode: double (nullable = true)
 |-- average_fraud_prob_of_state: double (nullable = true)
 |-- victimisation_rate: double (nullable = true)
 |-- rse_percent: double (nullable = true)
 |-- median_age: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- mean_income: double (nullable = true)
 |-- Proportion_between_max_order_value_mean_income: double (nullable = true)
 |-- Proportion_between_max_order_value_median_income: double (nullable = true)
 |-- Proportion_between_total_order_value_mean_income: double (nullable

# Idea
1. Time Frequency feature: https://ieeexplore.ieee.org/document/9399421/

# Modelling

In [20]:
# List of features to be used in the model
features = [
    "average_dollar_value", "min_dollar_value", "max_dollar_value", 
    "transaction_count", "stddev_dollar_value", 
    "average_fraud_prob_of_postcode", "average_fraud_prob_of_state",
    "victimisation_rate", "rse_percent", "Proportion_between_max_order_value_mean_income",
    "Proportion_between_max_order_value_median_income","Proportion_between_total_order_value_mean_income",
    "Proportion_between_total_order_value_median_income"
]

features_dt = [
    "average_dollar_value", "min_dollar_value", "max_dollar_value",
    "average_fraud_prob_of_postcode",
    "Proportion_between_max_order_value_mean_income",
    "Proportion_between_max_order_value_median_income"
]

features_rf = [
    "max_dollar_value", "average_fraud_prob_of_postcode", "average_dollar_value", 
    "rse_percent"
]

features_lr = [
    "average_dollar_value", "min_dollar_value",
    "average_fraud_prob_of_postcode", "average_fraud_prob_of_state",
    "Proportion_between_max_order_value_mean_income",
    "Proportion_between_max_order_value_median_income","Proportion_between_total_order_value_mean_income",
    "Proportion_between_total_order_value_median_income"
]

In [21]:


# VectorAssembler to combine the features into a single vector
assembler_dt = VectorAssembler(inputCols=features_dt, outputCol="features")
assembler_rf = VectorAssembler(inputCols=features_rf, outputCol="features")
assembler_lr = VectorAssembler(inputCols=features_lr, outputCol="features")

# Prepare the data
data_dt = assembler_dt.transform(fraudulent_consumer_summary)
data_rf = assembler_rf.transform(fraudulent_consumer_summary)
data_lr = assembler_lr.transform(fraudulent_consumer_summary)

train_data_dt, test_data_dt = data_dt.randomSplit([0.8, 0.2])
train_data_rf, test_data_rf = data_rf.randomSplit([0.8, 0.2])
train_data_lr, test_data_lr = data_lr.randomSplit([0.8, 0.2])


In [22]:
# Define model regressor
dt = DecisionTreeRegressor(labelCol="average_fraud_probability", featuresCol="features")

rf = RandomForestRegressor(labelCol="average_fraud_probability", featuresCol="features")

lr = LinearRegression(labelCol="average_fraud_probability", featuresCol="features")


In [23]:
# Parameter grid
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [3, 5, 7]) \
    .addGrid(dt.maxBins, [32, 64]) \
    .build()


rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 7]) \
    .build()

lr_param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()


In [24]:
# Evaluator for regression models
rmse_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability", 
    predictionCol="prediction", 
    metricName="rmse"  
)

r2_evaluator = RegressionEvaluator(
    labelCol="average_fraud_probability",
    predictionCol="prediction",
    metricName="r2" 
)

# Cross-validation 
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)

rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)

lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=r2_evaluator,
    numFolds=3
)


In [25]:
# Pipeline 
dt_pipeline = Pipeline(stages=[dt_cv])

rf_pipeline = Pipeline(stages=[rf_cv])

lr_pipeline = Pipeline(stages=[lr_cv])


In [None]:
# Fit model

# 6 mins
dt_model = dt_pipeline.fit(train_data_dt)

# 7 mins
rf_model = rf_pipeline.fit(train_data_rf)

# 4 mins
lr_model = lr_pipeline.fit(train_data_lr)


                                                                                

KeyboardInterrupt: 



# Evaluation

In [None]:
# Make predictions on the test data
dt_predictions = dt_model.transform(test_data_dt)
rf_predictions = rf_model.transform(test_data_rf)
lr_predictions = lr_model.transform(test_data_lr)


dt_rmse = rmse_evaluator.evaluate(dt_predictions)
dt_r2 = r2_evaluator.evaluate(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree R2: {dt_r2}")           # RMSE: 6.321439678625029 R2: 0.5426608139433955

rf_rmse = rmse_evaluator.evaluate(rf_predictions)
rf_r2 = r2_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")            # RMSE: 6.2324836442813565 R2: 0.5554417100291846

lr_rmse = rmse_evaluator.evaluate(lr_predictions)
lr_r2 = r2_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Linear Regression R2: {lr_r2}")  # RMSE: 7.031157543875003 R2: 0.434203734698708


                                                                                

Decision Tree RMSE: 6.247875238734437
Decision Tree R2: 0.5488288087235342


                                                                                

Random Forest RMSE: 6.411117021996146
Random Forest R2: 0.5529071262780814




Linear Regression RMSE: 7.146811158435934
Linear Regression R2: 0.448033609548396


                                                                                

`Best model hyperparameters`

In [None]:
best_dt_model = dt_model.stages[-1].bestModel
print(f"Best Decision Tree maxDepth: {best_dt_model._java_obj.getMaxDepth()}")
print(f"Best Decision Tree maxBins: {best_dt_model._java_obj.getMaxBins()}")


best_rf_model = rf_model.stages[-1].bestModel
print(f"Best Random Forest numTrees: {best_rf_model.getNumTrees}")
print(f"Best Random Forest maxDepth: {best_rf_model.getMaxDepth()}")


Best Decision Tree maxDepth: 5
Best Decision Tree maxBins: 64
Best Random Forest numTrees: 10
Best Random Forest maxDepth: 7


### __Feature importances__

In [None]:
feature_names_dt = assembler_dt.getInputCols()
feature_names_rf = assembler_rf.getInputCols()
feature_names_lr = assembler_lr.getInputCols()

`Random forest and decision tree`

In [None]:
best_rf_model = rf_model.stages[0].bestModel
best_dt_model = dt_model.stages[0].bestModel

dt_feature_importances = best_dt_model.featureImportances
rf_feature_importances = best_rf_model.featureImportances

rf_importances_df = pd.DataFrame({
    "Feature": feature_names_rf,
    "Importance": rf_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

dt_importances_df = pd.DataFrame({
    "Feature": feature_names_dt,
    "Importance": dt_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

print(rf_importances_df)
print()
print(dt_importances_df)

                          Feature  Importance
0                max_dollar_value    0.549148
1  average_fraud_prob_of_postcode    0.276619
2            average_dollar_value    0.155635
3                     rse_percent    0.018598

                                            Feature  Importance
2                                  max_dollar_value    0.693190
3                    average_fraud_prob_of_postcode    0.288326
0                              average_dollar_value    0.017747
1                                  min_dollar_value    0.000737
4    Proportion_between_max_order_value_mean_income    0.000000
5  Proportion_between_max_order_value_median_income    0.000000


`Linear regression`

In [None]:

# Get coefficients 
best_lr_model = lr_model.stages[0].bestModel
coefficients = best_lr_model.coefficients
# Get feature names from the VectorAssembler
feature_importances = pd.DataFrame({
    "Feature": feature_names_lr,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", ascending=False)

print(feature_importances)


                                             Feature  Coefficient
5   Proportion_between_max_order_value_median_income    84.721262
6   Proportion_between_total_order_value_mean_income     2.153166
2                     average_fraud_prob_of_postcode     0.619940
3                        average_fraud_prob_of_state     0.313974
1                                   min_dollar_value     0.295003
0                               average_dollar_value    -0.000177
7  Proportion_between_total_order_value_median_in...    -8.183813
4     Proportion_between_max_order_value_mean_income   -13.668576


In [None]:
rf_predictions.select("average_fraud_probability", "prediction").show(10000)

                                                                                

+-------------------------+------------------+
|average_fraud_probability|        prediction|
+-------------------------+------------------+
|       10.544532437176228|11.264940871373579|
|       10.591450552633916| 11.11954388779451|
|          82.743857764735| 41.45412446937404|
|       16.588065892224208|19.446713399041688|
|       13.474639804806248|17.898593601286173|
|         8.92951589282141| 10.65606572167064|
|        24.54433270717597|15.607045452146503|
|        35.19345663362354|19.792240211573272|
|       13.869132048530782| 10.98289900317772|
|        19.56544083296044| 19.26605539086835|
|       16.924789438758555|14.152065873320561|
|        10.15725716782445|20.768654232985664|
|       16.433381707369506| 14.94984869163919|
|       11.642548275664767|13.850041983690602|
|       19.532316718092023|11.912036747502926|
|         41.1727464447242|25.042902299415353|
|        8.921087412661878|11.856844190493206|
|       16.873473555315883| 16.76752684067496|
|        13.5

24/09/18 01:47:37 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 300677 ms exceeds timeout 120000 ms
24/09/18 01:47:37 WARN SparkContext: Killing executors is not supported by current scheduler.
24/09/18 01:47:43 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$