# Imputation

This notebook imputes the missing values in the consumer and merchant fraud probabilities.

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 

In [2]:
spark = (
    SparkSession.builder
    .appName("Imputation")
    # memory for the driver and executor JVMs
    .config("spark.driver.memory", "4G")
    .config("spark.executor.memory", "4G")
    # notebook display: eager evaluation of DataFrames and wider string output for wide schemas
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.debug.maxToStringFields", 200)
    # parquet metadata caching and a fixed session time zone
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

24/09/10 22:50:42 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/10 22:50:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/10 22:50:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/10 22:50:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/10 22:50:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# Read datasets

In [3]:
# Load the curated consumer and merchant tables (parquet) ...
consumer_full = spark.read.parquet('../data/curated/consumer_full')
merchant_full_expanded = spark.read.parquet('../data/curated/merchant_full_expanded')

# ... and the merchant fraud probability table (CSV with a header row, column types inferred)
merchant_fraud = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/tables/tables 1/merchant_fraud_probability.csv')
)

                                                                                

In [4]:
# preview the first 5 consumer rows (long cell values are truncated, as by default)
consumer_full.show(n=5, truncate=True)

+--------+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+-----------------+--------------------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+-----+-----------------+
|postcode|user_id|order_datetime|merchant_abn|      dollar_value|            order_id|consumer_id|total_transaction_count|total_dollar_value|fraud_probability|            SA2_name|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|state|unemployment_rate|
+--------+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+-----------

                                                                                

In [5]:
# preview the first 5 merchant rows (long cell values are truncated, as by default)
merchant_full_expanded.show(n=5, truncate=True)

+------------+------------------+---------+-----------------+---------------+---------------+---------------+---------------+---------------+----------------+---------------+------------+--------------+-------------------+----------------+---------------+---------------+---------------+-----------------+---------------+----------------+-------------------+-------------------+------------+----------------+-----------------+-------------+-------------+-------------+----------------+---------------+-------------+--------------+-----------------+---------------+--------------------+--------------------+--------------+--------------+-----------------+-------------------+--------------+-------------------+--------------+--------------+---------------------+--------------+------------------+---------------+-------------------+-----------------+--------------+---------------+-------------------+-------------+--------------+----------+-------------------+--------------+-----------------------+-

# Consumer fraud probability imputation

# Merchant fraud probability imputation

### Impute NULLs for merchants that already have a fraud probability

In [6]:
# Average the known 'fraud_probability' values per 'merchant_abn',
# giving one row per merchant with its mean fraud probability.
merchant_fraud_avg_df = (
    merchant_fraud
    .groupBy('merchant_abn')
    .agg(avg('fraud_probability').alias('avg_fraud_probability'))
)
merchant_fraud_avg_df.show(5)


+------------+---------------------+
|merchant_abn|avg_fraud_probability|
+------------+---------------------+
| 99989036621|    18.21089142894488|
| 90568944804|    30.72298492113958|
| 29674997261|    44.43787807900268|
| 27093785141|    28.88064813052203|
| 19492220327|   31.958306675667547|
+------------+---------------------+
only showing top 5 rows



In [7]:
# keep only the join key and the per-merchant average
merchant_fraud_avg_df = merchant_fraud_avg_df.select('merchant_abn', 'avg_fraud_probability')

imputed_merchant = (
    merchant_full_expanded
    # left join: every row of merchant_full_expanded is kept, with the
    # merchant's average attached where one exists
    .join(merchant_fraud_avg_df, on='merchant_abn', how='left')
    # where fraud_probability is NULL, fall back to the merchant's average
    # (stays NULL when the merchant has no average either)
    .withColumn(
        'fraud_probability',
        coalesce(col('fraud_probability'), col('avg_fraud_probability')),
    )
    # the helper column is no longer needed once the fallback is applied
    .drop('avg_fraud_probability')
)
imputed_merchant.show(5)

+------------+------------------+---------+-----------------+---------------+---------------+---------------+---------------+---------------+----------------+---------------+------------+--------------+-------------------+----------------+---------------+---------------+---------------+-----------------+---------------+----------------+-------------------+-------------------+------------+----------------+-----------------+-------------+-------------+-------------+----------------+---------------+-------------+--------------+-----------------+---------------+--------------------+--------------------+--------------+--------------+-----------------+-------------------+--------------+-------------------+--------------+--------------+---------------------+--------------+------------------+---------------+-------------------+-----------------+--------------+---------------+-------------------+-------------+--------------+----------+-------------------+--------------+-----------------------+-

In [8]:
# total number of rows in imputed_merchant (shown as the cell output)
imputed_merchant.count()

13614675

In [9]:
# Map each column name to an aggregate expression counting the rows where that column is NULL.
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the wildcard import),
# not the Python builtin.
null_count_dict = dict(
    (name, sum(col(name).isNull().cast("int")).alias(name))
    for name in imputed_merchant.columns
)

# evaluate all the NULL counters together in a single aggregation
null_counts_df = imputed_merchant.agg(*null_count_dict.values())
null_counts_df.show()



+------------+------------+---------+-----------------+---------------+---------------+---------------+---------------+---------------+----------------+---------------+------------+--------------+-------------------+----------------+---------------+---------------+---------------+-----------------+---------------+----------------+-------------------+-------------------+------------+----------------+-----------------+-------------+-------------+-------------+----------------+---------------+-------------+--------------+-----------------+---------------+--------------------+--------------------+--------------+--------------+-----------------+-------------------+--------------+-------------------+--------------+--------------+---------------------+--------------+------------------+---------------+-------------------+-----------------+--------------+---------------+-------------------+-------------+--------------+----------+-------------------+--------------+-----------------------+-------

                                                                                

This means that 13,031,129 of the 13,614,675 rows still have a missing merchant fraud probability, which needs to be imputed by machine learning models.

### Impute NULLs for the merchants without fraud probability