In [1]:
import os
os.sys.path.append("../")
from scripts.consumer_transaction_model import *

In [2]:
os.environ['PYSPARK_PYTHON'] = "/usr/local/bin/python3.11"
os.environ['PYSPARK_DRIVER_PYTHON'] = "/usr/local/bin/python3.11"

In [3]:
# Create a Spark Session
spark = (
    SparkSession.builder.appName("consumer transaction model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.execturo.memory", "2g")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/09/30 18:59:14 WARN Utils: Your hostname, Skye-Ngu resolves to a loopback address: 127.0.1.1; using 172.17.250.30 instead (on interface eth0)
24/09/30 18:59:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/30 18:59:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Loading the necessary data
consumer_info = spark.read.parquet('../data/curated/consumer_info.parquet')

transaction_records = spark.read.parquet('../data/curated/transactions.parquet')
transaction_records = transaction_records.drop("name") # drop name so it doesn't conflict when merge with consumer

fraudulent_consumer_rate = spark.read.parquet('../data/curated/consumer_fp.parquet')

personal_fraud = spark.read.csv('../data/curated/personal_fraud.csv', header=True, inferSchema=True)
postcode_info = spark.read.csv('../data/curated/postcode_info.csv', header=True, inferSchema=True)

personal_fraud = personal_fraud.drop(personal_fraud.columns[0])
postcode_info = postcode_info.drop(postcode_info.columns[0])

                                                                                

# Feature Engineering

We will do some feature engineering and data aggregation as we believe that the current data we have on each transactions isn't enough for us to accurately predict the fraud probability.

In [5]:
# Add consumer info to transaction records
transaction_fraudulent_consumer_with_info = transaction_records.join(consumer_info, on="consumer_id", how="inner")
transaction_fraudulent_consumer_with_info = transaction_fraudulent_consumer_with_info.drop(
    "merchant_abn", "merchant_fp", "category", "revenue_level", "take_rate"
)
transaction_fraudulent_consumer_with_info.limit(5)

                                                                                

consumer_id,order_datetime,dollar_value,order_id,consumer_fp,name,gender,state,postcode
148519,2021-08-20,49.86466328799216,df791327-3a6f-40b...,,Noah Cooper,Male,WA,6900
910763,2021-08-20,84.58217700485652,28e3bacf-3a2e-44c...,,David Campbell,Undisclosed,SA,5240
434658,2021-08-15,44.57740577259972,fd3dfa2c-837a-41e...,,Amanda Martinez,Female,VIC,3390
1432260,2021-08-15,80.83810604716545,4ca8e17d-3169-499...,,Alexis Obrien,Female,SA,5642
1048583,2021-07-15,19.841302487451244,b106a1c6-7e2c-4ba...,,David Padilla DDS,Male,ACT,2905


Each consumer's area of living will have their associated fraud probability. The state and postcode average fraud probability will be useful features.

In [6]:
# Average fraud probability in each postcode or state
consumer_info_with_fp = consumer_info.join(fraudulent_consumer_rate, on = 'consumer_id', how = 'inner')

fraudulent_consumer_group_by_postcode = consumer_info_with_fp.groupBy(["postcode"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_postcode"))

fraudulent_consumer_group_by_state = consumer_info_with_fp.groupBy(["state"]).agg(F.avg("fraud_probability").alias("average_fraud_prob_of_state"))

We believe that consumer with fluctutating buying behaviour, i.e high standard deviation accross all order, are potentially commiting fraud.

In [7]:
# analysis order value, consider the variance of order value and purchase frequency
consumer_transaction_value_analysis =  transaction_fraudulent_consumer_with_info.groupBy("consumer_id", "state", "postcode") \
                                        .agg(
                                            F.avg("dollar_value").alias("average_dollar_value"),
                                            F.min("dollar_value").alias("min_dollar_value"),
                                            F.max("dollar_value").alias("max_dollar_value"),
                                            F.count("dollar_value").alias("transaction_count"),
                                            F.stddev("dollar_value").alias("stddev_dollar_value")
                                        )

# consumer_transaction_value_analysis.limit(10)

Consumer with high standard deviation in the dollar value of their transactions may be suspicious as that mean their shopping habid varies a lot.

In [8]:
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_with_info \
    .join(consumer_transaction_value_analysis, on=["consumer_id", "state","postcode"], how="left") \
    .join(fraudulent_consumer_group_by_postcode, on="postcode", how="inner") \
    .join(fraudulent_consumer_group_by_state, on="state", how="inner")

Since we have data on personal fraud rate and income from each postcode, we can use it to help predicting consumer fraud probability. We will also create a feature that calculate the proportion of the mean/median income of the consumer's respective location that is used for making transactions. We think that it is unreasonable for a person to spend more than 70% of their annual salary on purchasing items as that would mean they wouldn't have enough money for other neccessity.

In [9]:
# Get infomation on personal fraud and income from external dataset
postcode_info = postcode_info.drop("state", "long", "lat", "lgacode")
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.join(personal_fraud, on="state", how="inner")
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.join(postcode_info, on="postcode", how="inner")

In [10]:
# Get proportion of the money used to purchase item with respect to income (one and a half year)
# average income
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("Proportion_between_max_order_value_mean_income", F.col("max_dollar_value") / (F.col("mean_income") * 1.5) )
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("Proportion_between_max_order_value_median_income", F.col("max_dollar_value") / (F.col("median_income") * 1.5))

# Total income
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("Proportion_between_total_order_value_mean_income", F.col("average_dollar_value") * F.col("transaction_count") / (F.col("mean_income") * 1.5))
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("Proportion_between_total_order_value_median_income", F.col("average_dollar_value") * F.col("transaction_count") / (F.col("median_income") * 1.5))


As prediting consumer's fraud probability on a transactional level is our main goal, we suspect that there is a temporal relationship between fraud probability and the month, date of purchase. Thus, we will split the `order_datetime` column into month, day (Monday - Sunday).

We also introduce a feature that indicates the number of order that a customer made in the previous week. Hence, we will train our model on 2021-03-07, which is 6 days before the first date of entry.

In [11]:
# Convert 'order_datetime' from string to date format
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("order_datetime", F.to_date("order_datetime", "yyyy-MM-dd"))
cutoff_date = "2021-03-07"
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.filter(F.col("order_datetime") >= F.lit(cutoff_date))

# Add a new column 'transaction_count_last_n_days' that counts the transactions within n days before each transaction
window_spec = Window.partitionBy("consumer_id").orderBy(F.col("order_datetime").cast("long")) \
    .rangeBetween(-7 * 86400, 0)  # 7 days in seconds (86400 seconds = 1 day)

transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("transaction_count_last_7_days", F.count("order_datetime").over(window_spec))

In [12]:
# Return the corresponding day of the week for the given date in the DataFrame.
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("day_of_week", F.dayofweek("order_datetime"))
transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("is_weekend", F.when((F.col("day_of_week") == 7) | (F.col("day_of_week") == 1), 1).otherwise(0))

transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary.withColumn("month", F.month("order_datetime"))


In [13]:
monthly_summary = transaction_fraudulent_consumer_summary.withColumn("month", F.month("order_datetime")) \
    .groupBy("month") \
    .agg(
        F.count("*").alias("count"),
        F.avg("consumer_fp").alias("average_fraud_probability")
    )

monthly_summary.show()



+-----+-------+-------------------------+
|month|  count|average_fraud_probability|
+-----+-------+-------------------------+
|   12| 910143|        14.34681743793918|
|    1| 525394|       14.656880079210088|
|    6|1312987|       13.901543295083838|
|    3|1051393|       14.737159007907232|
|    5|1329977|        14.76406188605546|
|    9| 644670|       14.601646302577457|
|    4|1174513|       14.588978233806921|
|    8|1365693|       14.731618998459677|
|    7|1375524|       14.511252271710648|
|   10| 693724|       14.602753834274731|
|   11| 942185|       14.623580500547867|
|    2| 504446|       14.772366739280775|
+-----+-------+-------------------------+



                                                                                

We can see that the average fraud probability of each month doesn't vary a lot. This may suggest that there is little to no temporal relationship between consumer's fraud probability and month. Though, during our model fitting, we will stil include this feature in the model and will check the feature importance.

During our preliminary analysis, we found out that the distribution of the dollar value for the transactions are heavily right-skewed even after a log-transformation. Thus, we will do a log-transformation on the feature `dollar_value` as well as any other features that are related to it, and then normalise for better comparision.

In [14]:
# # Applying log-transformation

# cols_to_log = ['dollar_value', 'average_dollar_value', 'min_dollar_value',
#                 'max_dollar_value', 'stddev_dollar_value',
#                 "Proportion_between_max_order_value_mean_income",
#                 "Proportion_between_max_order_value_median_income",
#                 "Proportion_between_total_order_value_mean_income",
#                 "Proportion_between_total_order_value_median_income"
#                 ] 


# for col in cols_to_log:
#     transaction_fraudulent_consumer_summary = transaction_fraudulent_consumer_summary \
#         .withColumn(f'{col}', F.when(transaction_fraudulent_consumer_summary[col] > 0, F.log(transaction_fraudulent_consumer_summary[col])).otherwise(None))

In [15]:
# # Standardising/ Normalising feature
# cols_to_scale = ["dollar_value", "min_dollar_value", "max_dollar_value", "stddev_dollar_value","average_dollar_value"]
# cols_to_keep_unscaled = [col for col in transaction_fraudulent_consumer_summary.columns if col not in cols_to_scale] + ["dollar_value"]

# assembler = VectorAssembler(inputCols=cols_to_scale, outputCol="features")
# sdf_transformed = assembler.transform(transaction_fraudulent_consumer_summary)
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
# scaler_model = scaler.fit(sdf_transformed.select("features"))
# sdf_scaled = scaler_model.transform(sdf_transformed)
# scaled_array_col = vector_to_array(F.col("scaledFeatures"))

# # Create new columns for each scaled feature
# for i, col in enumerate(cols_to_scale):
#     sdf_scaled = sdf_scaled.withColumn(f"scaled_{col}", scaled_array_col[i])

# # Combine original Dataframe and the scaled features
# dollar_value_df = transaction_fraudulent_consumer_summary.select("dollar_value")
# transaction_fraudulent_consumer_summary = sdf_scaled.select(cols_to_keep_unscaled + [f"scaled_{col}" for col in cols_to_scale])


# Visualisation

## Assumptions or Observations:
1. The gender plot shows that there is a similar number of male and female consumers.
2. The number of consumers varies significantly across different states.
3. Consumers make a similar number of purchases on each day of the week, whether it’s a weekday or a weekend.
4. Both fraud probability and the dollar value of an order are strongly right-skewed and should be normalized.
5. Proportion features exhibit a linear relationship with fraud probability but may need transformation to clarify this relationship.

In [16]:
# # Convert relevant columns to Pandas
# df_pandas = transaction_fraudulent_consumer_summary.select(
#     "dollar_value", "scaled_dollar_value", "consumer_fp", "scaled_average_dollar_value", 
#     "scaled_min_dollar_value", "scaled_max_dollar_value", "transaction_count", 
#     "median_income", "mean_income", "state", "gender", "scaled_stddev_dollar_value",
#     "day_of_week", "is_weekend", "Proportion_between_max_order_value_mean_income",
#     "Proportion_between_max_order_value_median_income", 
#     "Proportion_between_total_order_value_mean_income", 
#     "Proportion_between_total_order_value_median_income"
# ).toPandas()

# # Define plots in a dictionary for looping
# plots = {
#     "Dollar Value Distribution": ("dollar_value", "hist"),
#     "Scaled Dollar Value Distribution": ("scaled_dollar_value", "hist"),
#     "Max Dollar Value Distribution": ("scaled_max_dollar_value", "hist"),
#     "Min Dollar Value Distribution": ("scaled_min_dollar_value", "hist"),
#     "Std Dollar Value Distribution": ("scaled_stddev_dollar_value", "hist"),
#     "Average Dollar Value Distribution": ("scaled_average_dollar_value", "hist"),
#     "Fraud Probability Distribution": ("consumer_fp", "hist"),
#     "Transaction Count Distribution": ("transaction_count", "hist"),
#     "Gender Count": ("gender", "count"),
#     "State Count": ("state", "count"),
#     "Day of Week Count": ("day_of_week", "count"),
#     "Is Weekend Count": ("is_weekend", "count"),
#     "Scatter 1 (Max Order Value vs Fraud Prob - Mean Income)": ("Proportion_between_max_order_value_mean_income", "scatter1"),
#     "Scatter 2 (Max Order Value vs Fraud Prob - Median Income)": ("Proportion_between_max_order_value_median_income", "scatter2"),
#     "Scatter 3 (Total Order Value vs Fraud Prob - Mean Income)": ("Proportion_between_total_order_value_mean_income", "scatter3"),
#     "Scatter 4 (Total Order Value vs Fraud Prob - Median Income)": ("Proportion_between_total_order_value_median_income", "scatter4")
# }
# feature_visualisation(df_pandas, plots)


In [17]:
# df_pandas = transaction_fraudulent_consumer_summary.select(
#     "scaled_dollar_value", "fraud_probability", "scaled_average_dollar_value", 
#     "scaled_min_dollar_value", "scaled_max_dollar_value", "transaction_count", 
#     "median_income", "mean_income", "Proportion_between_max_order_value_mean_income",
#     "Proportion_between_max_order_value_median_income", 
#     "Proportion_between_total_order_value_mean_income", 
#     "Proportion_between_total_order_value_median_income"
# ).toPandas()
# corr_matrix = df_pandas.corr()
# plt.figure(figsize=(12, 8))
# sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Correlation Heatmap of Numeric Features")
# plt.show()


# Idea
1. Time Frequency feature: https://ieeexplore.ieee.org/document/9399421/

# Modelling

First, let's split our train data and the data that we want to predict.

In [29]:
train_data = transaction_fraudulent_consumer_summary.filter(F.col("consumer_fp").isNotNull())
predict_data = transaction_fraudulent_consumer_summary.filter(F.col("consumer_fp").isNull())

We will be using 2 regression model, one is Random Forest Regression (RFR) and the other is Linear Regression (LR). We will use LR as the baseline model to compare with RFR.

In [35]:
# List of feature to be used in the model
features_rf = ["norm_dollar_value", "norm_max_dollar_value","average_fraud_prob_of_postcode", "norm_stddev_dollar_value", "Proportion_between_max_order_value_median_income",
               "Proportion_between_max_order_value_mean_income", "transaction_count_last_7_days", "month_index", "weekday_index", "is_weekend_vector"]

features_lr = ['norm_dollar_value', 'norm_average_dollar_value', 'norm_stddev_dollar_value', 'average_fraud_prob_of_postcode', 'Proportion_between_total_order_value_mean_income',
               'Proportion_between_max_order_value_median_income', 'Proportion_between_max_order_value_mean_income', 'month_index', 'weekday_index', 'is_weekend_vector',
               'transaction_count_last_7_days']

In [36]:
assembled_train_data_rf, _ = assemble_data(train_data, features_rf)
assembled_train_data_lr, _ = assemble_data(train_data, features_lr)

                                                                                

In [37]:
train_set_rf, validate_set_rf = assembled_train_data_rf.randomSplit([0.8, 0.2], seed=123)
train_set_lr, validate_set_lr = assembled_train_data_lr.randomSplit([0.8, 0.2], seed=123)

In [38]:
# Parameter grid
rf_paramGrid = ParamGridBuilder() \
    .addGrid(RandomForestRegressor(labelCol='consumer_fp', featuresCol='features').numTrees, [10, 20, 40]) \
    .addGrid(RandomForestRegressor(labelCol='consumer_fp', featuresCol='features').maxDepth, [5, 10, 12]) \
    .build()

rf_evaluator = RegressionEvaluator(labelCol="consumer_fp", predictionCol="prediction")

rf_crossval = CrossValidator(estimator=RandomForestRegressor(labelCol='consumer_fp', featuresCol='features'),
                          estimatorParamMaps=rf_paramGrid,
                          evaluator=rf_evaluator,
                          numFolds=2)

rf_model = rf_crossval.fit(train_set_rf)
rf_predictions = rf_model.transform(validate_set_rf)
rf_rmse = rf_evaluator.evaluate(rf_predictions)

                                                                                

In [39]:
rf_best_model = rf_model.bestModel
print(f"Best Model RMSE on test data = {rf_rmse}")
print(f"Best number of trees: {rf_best_model.getNumTrees}") 
print(f"Best max depth: {rf_best_model.getMaxDepth()}") 
print(f"Best max bins: {rf_best_model.getMaxBins()}")

Best Model RMSE on test data = 6.677996490397115
Best number of trees: 20
Best max depth: 5
Best max bins: 32


In [42]:
lr_paramGrid = ParamGridBuilder() \
    .addGrid(LinearRegression(labelCol="consumer_fp", featuresCol="features").regParam, [0.01, 0.1, 1.0]) \
    .addGrid(LinearRegression(labelCol="consumer_fp", featuresCol="features").elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()
    
lr_evaluator = RegressionEvaluator(labelCol="consumer_fp", predictionCol="prediction")

lr_crossval = CrossValidator(estimator=LinearRegression(labelCol="consumer_fp", featuresCol="features"),
                          estimatorParamMaps=lr_paramGrid,
                          evaluator=lr_evaluator,
                          numFolds=2)

lr_model = lr_crossval.fit(train_set_lr)
lr_predictions = lr_model.transform(validate_set_lr)
lr_rmse = lr_evaluator.evaluate(lr_predictions)

24/09/30 19:51:21 WARN Instrumentation: [34346f3a] regParam is zero, which might cause numerical instability and overfitting.
24/09/30 19:51:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/09/30 19:51:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/09/30 19:51:43 WARN Instrumentation: [e6538f95] regParam is zero, which might cause numerical instability and overfitting.
24/09/30 19:51:45 WARN Instrumentation: [7f050de7] regParam is zero, which might cause numerical instability and overfitting.
24/09/30 19:51:46 WARN Instrumentation: [55286e50] regParam is zero, which might cause numerical instability and overfitting.
24/09/30 19:51:47 WARN Instrumentation: [17043593] regParam is zero, which might cause numerical instability and overfitting.
24/09/30 19:51:49 WARN Instrumentation: [27dced35] regParam is zero, which might cause numerical instability and overfitting.
24/09/30 19:51:50 WARN Ins

In [44]:
lr_best_model = lr_model.bestModel
print(f"Best Model RMSE on test data = {lr_rmse}")
# print(f"Best number of trees: {lr_best_model.getNumTrees}") 
# print(f"Best max depth: {lr_best_model.getMaxDepth()}") 
# print(f"Best max bins: {lr_best_model.getMaxBins()}")

Best Model RMSE on test data = 7.6987052052664655


24/09/30 20:25:34 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1497592 ms exceeds timeout 120000 ms
24/09/30 20:25:34 WARN SparkContext: Killing executors is not supported by current scheduler.
24/09/30 20:25:40 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

# Evaluation

In [33]:
rf_predictions = rf_model.transform(validate_set_rf)
rf_rmse = rmse_evaluator.evaluate(rf_predictions)
rf_r2 = r2_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")  



Random Forest RMSE: 6.534080453225833
Random Forest R2: 0.471538481981881


                                                                                

In [None]:
# Make predictions on the test data
dt_predictions = dt_model.transform(test_data_dt)
rf_predictions = rf_model.transform(test_data_rf)
lr_predictions = lr_model.transform(test_data_lr)


dt_rmse = rmse_evaluator.evaluate(dt_predictions)
dt_r2 = r2_evaluator.evaluate(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")
print(f"Decision Tree R2: {dt_r2}")         

rf_rmse = rmse_evaluator.evaluate(rf_predictions)
rf_r2 = r2_evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")
print(f"Random Forest R2: {rf_r2}")           

lr_rmse = rmse_evaluator.evaluate(lr_predictions)
lr_r2 = r2_evaluator.evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")
print(f"Linear Regression R2: {lr_r2}")  



`Best model hyperparameters`

In [35]:
# best_dt_model = dt_model.stages[-1].bestModel
# print(f"Best Decision Tree maxDepth: {best_dt_model._java_obj.getMaxDepth()}")
# print(f"Best Decision Tree maxBins: {best_dt_model._java_obj.getMaxBins()}")


best_rf_model = rf_model.stages[-1].bestModel
print(f"Best Random Forest numTrees: {best_rf_model.getNumTrees}")
print(f"Best Random Forest maxDepth: {best_rf_model.getMaxDepth()}")


Best Random Forest numTrees: 20
Best Random Forest maxDepth: 7


### __Feature importances__

In [36]:
# feature_names_dt = assembler_dt.getInputCols()
feature_names_rf = assembler_rf.getInputCols()
# feature_names_lr = assembler_lr.getInputCols()

`Random forest and decision tree`

In [39]:
best_rf_model = rf_model.stages[0].bestModel
# best_dt_model = dt_model.stages[0].bestModel

# dt_feature_importances = best_dt_model.featureImportances
rf_feature_importances = best_rf_model.featureImportances

rf_importances_df = pd.DataFrame({
    "Feature": feature_names_rf,
    "Importance": rf_feature_importances.toArray()
}).sort_values(by="Importance", ascending=False)

# dt_importances_df = pd.DataFrame({
#     "Feature": feature_names_dt,
#     "Importance": dt_feature_importances.toArray()
# }).sort_values(by="Importance", ascending=False)

print(rf_importances_df)
# print()
# print(dt_importances_df)

                                            Feature  Importance
0                               scaled_dollar_value    0.499953
1                           scaled_max_dollar_value    0.136403
2                    average_fraud_prob_of_postcode    0.110235
4  Proportion_between_max_order_value_median_income    0.070937
3                        scaled_stddev_dollar_value    0.070598
5                                             month    0.053135
7    Proportion_between_max_order_value_mean_income    0.051437
6                     transaction_count_last_7_days    0.007302


`Linear regression`

In [None]:

# Get coefficients 
best_lr_model = lr_model.stages[0].bestModel
coefficients = best_lr_model.coefficients
# Get feature names from the VectorAssembler
feature_importances = pd.DataFrame({
    "Feature": feature_names_lr,
    "Coefficient": coefficients
}).sort_values(by="Coefficient", ascending=False)

print(feature_importances)
