# Transaction Analysis

This notebook flags fraud for merchants, consumers, and transactions, and then removes outliers.

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
from pyspark.sql.functions import when, col, count

In [2]:
# Session-level Spark settings, applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.debug.maxToStringFields": 3000,
    "spark.network.timeout": "300s",
    "spark.driver.maxResultSize": "4g",
    "spark.rpc.askTimeout": "300s",
    "spark.driver.memory": "8G",
    "spark.executor.memory": "8G",
}

session_builder = SparkSession.builder.appName("Imputation")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Attach to an existing session if one is running, otherwise start a new one.
spark = session_builder.getOrCreate()

24/09/19 22:50:58 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/19 22:50:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/19 22:50:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read datasets

In [3]:
# Load the curated consumer and merchant datasets from their parquet directories.
consumer_input_path = '../data/curated/final_imputed_consumer'
merchant_input_path = '../data/curated/final_imputed_merchant'

final_imputed_consumer = spark.read.parquet(consumer_input_path)
final_imputed_merchant = spark.read.parquet(merchant_input_path)

                                                                                

# Denoting fraud for consumer dataset

In [4]:
# Flag a consumer as fraudulent (1) when `fraud_probability` is at least 50, otherwise 0,
# then rename the raw score so it is clearly the consumer-side probability.
consumer_flag = when(col('fraud_probability') >= 50, 1).otherwise(0)
final_imputed_consumer = (
    final_imputed_consumer
    .withColumn('consumer_is_fraud', consumer_flag)
    .withColumnRenamed('fraud_probability', 'consumer_fraud_probability')
)

# Preview the score next to the derived flag.
final_imputed_consumer.select('consumer_fraud_probability', 'consumer_is_fraud').show()

                                                                                

+--------------------------+-----------------+
|consumer_fraud_probability|consumer_is_fraud|
+--------------------------+-----------------+
|        11.998913422103769|                0|
|        11.897039565334778|                0|
|        15.238617000084226|                0|
|        11.897039565334778|                0|
|        16.511154537793757|                0|
|        11.897039565334778|                0|
|        16.511154537793757|                0|
|        16.511154537793757|                0|
|        10.825871482844985|                0|
|        10.825871482844985|                0|
|        12.135274548815593|                0|
|         8.848401761937838|                0|
|        12.135274548815593|                0|
|         8.848401761937838|                0|
|        12.604746749130138|                0|
|         17.26103891349819|                0|
|        12.604746749130138|                0|
|        14.647115130997538|                0|
|        12.6

In [5]:
# Write the flagged consumer data to parquet, replacing any previous output, then preview it.
consumer_output_path = '../data/curated/full_consumer_with_is_fraud'
final_imputed_consumer.write.mode('overwrite').parquet(consumer_output_path)
final_imputed_consumer.show(n=5)

                                                                                

+-------+--------+--------------+-----------+------------------+-----+------------+------------------+--------------------+--------------------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+--------+---------+--------+---------+--------+---------+---------+---------+------------------+-------------+-----------+---------------+-------------------+-----------------+
|user_id|postcode|order_datetime|consumer_id|              name|state|merchant_abn|      dollar_value|            order_id|consumer_fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|state_NT|state_ACT|state_SA|state_TAS|state_WA|state_QLD|state_VIC|state_NSW|gender_Undisclos

# Denoting fraud for merchant dataset

In [6]:
# Flag a merchant as fraudulent (1) when `fraud_probability` is at least 50, otherwise 0,
# then rename the raw score so it is clearly the merchant-side probability.
merchant_flag = when(col('fraud_probability') >= 50, 1).otherwise(0)
final_imputed_merchant = (
    final_imputed_merchant
    .withColumn('merchant_is_fraud', merchant_flag)
    .withColumnRenamed('fraud_probability', 'merchant_fraud_probability')
)

# Preview the merchant id, its score, and the derived flag.
merchant_preview_cols = ['merchant_abn', 'merchant_fraud_probability', 'merchant_is_fraud']
final_imputed_merchant.select(*merchant_preview_cols).show()

+------------+--------------------------+-----------------+
|merchant_abn|merchant_fraud_probability|merchant_is_fraud|
+------------+--------------------------+-----------------+
| 98861937890|         29.28372080466999|                0|
| 23338656015|        41.172592553465414|                0|
| 80324045558|         49.99330766222468|                0|
| 76626119831|         46.97530600317222|                0|
| 46607130088|         30.30653692499646|                0|
| 24015173965|        28.504726875407385|                0|
| 78798828265|        28.503484412491503|                0|
| 24852446429|         61.90818841415942|                1|
| 64203420245|        31.330827619210968|                0|
| 32560075533|         30.38971777629593|                0|
| 90543168331|         66.64232799590874|                1|
| 91338194166|        30.716051747341343|                0|
| 22033359776|         53.62916380315849|                1|
| 70172340121|         70.63456170501259

Drop repeated columns:

In [7]:
# Remove the three columns in a single call.
# NOTE(review): presumably these duplicate columns carried by the consumer dataset
# (the later join would otherwise yield them twice) - confirm against the schemas.
final_imputed_merchant = final_imputed_merchant.drop(
    'order_datetime', 'order_timestamp', 'dollar_value'
)

In [8]:
# Write the flagged merchant data to parquet, replacing any previous output, then preview it.
merchant_output_path = '../data/curated/full_merchant_with_is_fraud'
final_imputed_merchant.write.mode('overwrite').parquet(merchant_output_path)
final_imputed_merchant.show(n=5)

                                                                                

+------------+--------------------+---------+--------------------------+-------------------+------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+---------

# Merge the full consumer and merchant dataset

In [9]:
# Inner-join the merchant and consumer records on the shared `merchant_abn` and `order_id` keys.
join_keys = ['merchant_abn', 'order_id']
merchant_side = final_imputed_merchant.alias('merchant')
consumer_side = final_imputed_consumer.alias('consumer')

full_transaction = merchant_side.join(consumer_side, on=join_keys, how='inner')
full_transaction.show(n=5)

[Stage 12:>                                                         (0 + 1) / 1]

+------------+--------------------+---------+--------------------------+-------------------+-------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+--------

                                                                                

# Denote fraud for each transaction

Rule: if either `consumer_is_fraud` or `merchant_is_fraud` is 1, then this transaction is determined as fraud (denoted as 1).

In [10]:
# A transaction is flagged (1) when the consumer flag or the merchant flag is set, otherwise 0.
consumer_flagged = col('consumer_is_fraud') == 1
merchant_flagged = col('merchant_is_fraud') == 1
transaction_flag = when(consumer_flagged | merchant_flagged, 1).otherwise(0)

full_transaction = full_transaction.withColumn('transaction_is_fraud', transaction_flag)

# Preview the two input flags next to the combined one.
flag_cols = ['consumer_is_fraud', 'merchant_is_fraud', 'transaction_is_fraud']
full_transaction.select(*flag_cols).show()

[Stage 17:>                                                         (0 + 1) / 1]

+-----------------+-----------------+--------------------+
|consumer_is_fraud|merchant_is_fraud|transaction_is_fraud|
+-----------------+-----------------+--------------------+
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   1|
|                0|                1|                   

                                                                                

Drop unused columns:

In [11]:
# Columns that are not used downstream; drop them from the merged dataset.
cols_to_remove = ['name', 'state']
full_transaction = full_transaction.drop(*cols_to_remove)

Check the shape of the dataset:

In [12]:
# Dataset shape: the row count runs a Spark job, the column count is read from the schema.
num_rows, num_columns = full_transaction.count(), len(full_transaction.columns)

print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

                                                                                

Number of rows: 11372745
Number of columns: 127


Confirm there are no nulls:

In [13]:
# Build one aggregate expression per column: the isNull flag cast to int and summed
# gives that column's null count. `sum` here resolves to the Spark aggregate brought in
# by the wildcard import of pyspark.sql.functions, not Python's built-in.
null_count_dict = {}
for col_name in full_transaction.columns:
    null_flag = col(col_name).isNull().cast("int")
    null_count_dict[col_name] = sum(null_flag).alias(col_name)

# Evaluate every null count in a single aggregation pass and display the one-row result.
null_counts_df = full_transaction.agg(*null_count_dict.values())
null_counts_df.show()



+------------+--------+---------+--------------------------+-------------------+------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+-------------------+-------

                                                                                

# Outlier detection

In [14]:
# Preview the first five rows of the merged dataset before looking for outliers.
full_transaction.show(n=5)

[Stage 40:>                                                         (0 + 1) / 1]

+------------+--------------------+---------+--------------------------+-------------------+-------------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+--------

                                                                                

### Remove the rows that have fraud probabilities greater than 100:

In [15]:
# Summary statistics (count, mean, stddev, min, max) for the two fraud probabilities.
probability_cols = ['merchant_fraud_probability', 'consumer_fraud_probability']
fraud_stats = full_transaction.select(*probability_cols).describe()
fraud_stats.show()

merchant_prob = col('merchant_fraud_probability')
consumer_prob = col('consumer_fraud_probability')

# A probability above 100 is out of range: collect rows where either score exceeds it
# and report how many there are.
prob_outliers = full_transaction.filter((merchant_prob > 100) | (consumer_prob > 100))
prob_outliers_count = prob_outliers.count()
print(f"Number of detected outliers: {prob_outliers_count}")

# Keep only the rows where both probabilities are at most 100.
full_transaction = full_transaction.filter((merchant_prob <= 100) & (consumer_prob <= 100))

                                                                                

+-------+--------------------------+--------------------------+
|summary|merchant_fraud_probability|consumer_fraud_probability|
+-------+--------------------------+--------------------------+
|  count|                  11372745|                  11372745|
|   mean|         41.55225719700489|         15.72477093610061|
| stddev|        16.276589720288985|         8.856604238613759|
|    min|        16.717814356370614|         8.287143531552802|
|    max|        100.23516261557917|          97.6298077657765|
+-------+--------------------------+--------------------------+





Number of detected outliers: 2511


                                                                                

### Remove the rows where `dollar_value` is less than 0.5 or `average_population` is less than 100

In [16]:
# Count how many rows each rule will remove. The two counts are taken independently,
# so a row that breaks both rules appears in both.
too_small_value = col("dollar_value") < 0.5
too_small_population = col("average_population") < 100

dollar_value_count = full_transaction.filter(too_small_value).count()
avg_population_count = full_transaction.filter(too_small_population).count()

print(f"Number of rows where dollar_value < 0.5: {dollar_value_count}")
print(f"Number of rows where avg_population < 100: {avg_population_count}")

# Keep only rows that satisfy both minimums.
full_transaction = full_transaction.filter(
    (col("dollar_value") >= 0.5) & (col("average_population") >= 100)
)

                                                                                

Number of rows where dollar_value < 0.5: 45003
Number of rows where avg_population < 100: 20849


### Remove the rows where any of the four features `avg_tot_fam_inc_weekly`, `avg_num_psns_per_bedroom`, `avg_tot_hhd_inc_weekly`, `avg_household_size` is less than or equal to 0

In [17]:
# Diagnostics for the four features that the next cell requires to be strictly positive.
# A row is removed there as soon as ANY one of the features is <= 0, so both counts
# below use OR: one for negative values, one for exact zeros.

# count the rows where any of the specified columns is less than 0
less_than_zero_count = full_transaction.filter(
    (col("avg_tot_fam_inc_weekly") < 0) |
    (col("avg_num_psns_per_bedroom") < 0) |
    (col("avg_tot_hhd_inc_weekly") < 0) |
    (col("avg_household_size") < 0)
).count()

# count the rows where any of the specified columns is equal to 0
equal_to_zero_count = full_transaction.filter(
    (col("avg_tot_fam_inc_weekly") == 0) |
    (col("avg_num_psns_per_bedroom") == 0) |
    (col("avg_tot_hhd_inc_weekly") == 0) |
    (col("avg_household_size") == 0)
).count()

print(f"Number of rows where any of the specified columns are less than 0: {less_than_zero_count}")
print(f"Number of rows where any of the specified columns are equal to 0: {equal_to_zero_count}")




Number of rows where all specified columns are greater than 0: 0
Number of rows where any of the specified columns are equal to 0: 0


                                                                                

In [18]:
# Keep only rows where every one of these four features is strictly greater than 0.
# Applying the filters one after another is equivalent to AND-ing the conditions.
for positive_feature in (
    "avg_tot_fam_inc_weekly",
    "avg_num_psns_per_bedroom",
    "avg_tot_hhd_inc_weekly",
    "avg_household_size",
):
    full_transaction = full_transaction.filter(col(positive_feature) > 0)

In [19]:
# Numeric columns to inspect for remaining outliers after the filters above.
numeric_cols_to_check = [
    'transaction_revenue',
    'BNPL_revenue',
    'dollar_value',
    'average_population',
    'avg_age_persons',
    'avg_mortgage_repay_monthly',
    'avg_tot_prsnl_inc_weekly',
    'avg_rent_weekly',
    'avg_tot_fam_inc_weekly',
    'avg_num_psns_per_bedroom',
    'avg_tot_hhd_inc_weekly',
    'avg_household_size',
    'avg_unemployment_rate',
]
numerical_outlier_check = full_transaction.select(*numeric_cols_to_check)

# Show summary statistics for the selected columns.
numerical_outlier_check.describe().show()



+-------+-------------------+--------------------+------------------+------------------+-----------------+--------------------------+------------------------+------------------+----------------------+------------------------+----------------------+------------------+---------------------+
|summary|transaction_revenue|        BNPL_revenue|      dollar_value|average_population|  avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|   avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+-------+-------------------+--------------------+------------------+------------------+-----------------+--------------------------+------------------------+------------------+----------------------+------------------------+----------------------+------------------+---------------------+
|  count|           11304463|            11304463|          11304463|          11304463|         11304463|                  113044

                                                                                

Check the shape of the dataset:

In [20]:
# Shape after outlier removal. `num_rows` is reused by the percentage cells further down.
num_rows = full_transaction.count()
num_columns = len(full_transaction.columns)

print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")



Number of rows: 11304463
Number of columns: 127


                                                                                

Confirm there are no nulls:

In [21]:
# Map each column name to an aggregate that sums its isNull flag (cast to int), i.e. its
# null count. `sum` resolves to the Spark aggregate from the wildcard import of
# pyspark.sql.functions, not Python's built-in.
null_count_dict = dict(
    (col_name, sum(col(col_name).isNull().cast("int")).alias(col_name))
    for col_name in full_transaction.columns
)

# Run all the null-count aggregates in one pass and display the one-row result.
null_count_exprs = list(null_count_dict.values())
null_counts_df = full_transaction.agg(*null_count_exprs)
null_counts_df.show()



+------------+--------+---------+--------------------------+-------------------+------------+---------------+---------------+---------------+---------------+---------------+----------------+------------+-------------------+--------------+--------------+---------------+--------------+---------------+---------------+-----------------+---------------+----------------+----------------+-------------------+------------+----------------+------------------+-------------+-------------+---------------+---------------+-------------+--------------+--------------------+--------------+-------------------+-----------------+--------------+-----------------+--------------+-------------------+--------------+---------------+---------------+-------------+--------------+-------------------+--------------+--------------+----------------+-------------+-------------+---------------+---------------+----------------+----------------+---------------+------------------+----------------+-------------------+-------

                                                                                

Count the rows deleted:

In [22]:
# Row counts copied from the two "Check the shape of the dataset" outputs above
# (before and after outlier removal).
# NOTE: these are hard-coded, so update them by hand if the input data or filters change.
rows_before_filtering = 11372745
rows_after_filtering = 11304463

# Absolute and relative number of rows removed by the outlier filters.
count_deleted = rows_before_filtering - rows_after_filtering
percentage_deleted = (count_deleted / rows_before_filtering) * 100

print(f"Number of rows deleted: {count_deleted}")
print(f"Percentage of rows deleted (%): {percentage_deleted}")

Number of rows deleted: 68282
Percentage of rows deleted (%): 0.600400343100984


In [27]:
# Write the cleaned transaction dataset to parquet, replacing any previous output.
full_transaction_output_path = '../data/curated/full_transaction'
full_transaction.write.mode('overwrite').parquet(full_transaction_output_path)

                                                                                

# Compute the percentage of is_fraud

In [24]:
# Share of rows per `consumer_is_fraud` value, as a percentage of `num_rows`
# (the row count taken after outlier removal).
fraud_group = (
    full_transaction
    .groupBy('consumer_is_fraud')
    .count()
    .withColumn("percentage", (col("count") / num_rows) * 100)
)
fraud_group.show()



+-----------------+--------+-----------------+
|consumer_is_fraud|   count|       percentage|
+-----------------+--------+-----------------+
|                1|  188173|1.664590348077569|
|                0|11116290|98.33540965192243|
+-----------------+--------+-----------------+



                                                                                

In [25]:
# Share of rows per `merchant_is_fraud` value, as a percentage of `num_rows`
# (the row count taken after outlier removal).
fraud_group = (
    full_transaction
    .groupBy('merchant_is_fraud')
    .count()
    .withColumn("percentage", (col("count") / num_rows) * 100)
)
fraud_group.show()



+-----------------+-------+------------------+
|merchant_is_fraud|  count|        percentage|
+-----------------+-------+------------------+
|                1|3079796|27.244071655593018|
|                0|8224667| 72.75592834440698|
+-----------------+-------+------------------+



                                                                                

In [26]:
# Share of rows per `transaction_is_fraud` value, as a percentage of `num_rows`
# (the row count taken after outlier removal).
fraud_group = (
    full_transaction
    .groupBy('transaction_is_fraud')
    .count()
    .withColumn("percentage", (col("count") / num_rows) * 100)
)
fraud_group.show()



+--------------------+-------+-----------------+
|transaction_is_fraud|  count|       percentage|
+--------------------+-------+-----------------+
|                   1|3216170|28.45044474912254|
|                   0|8088293|71.54955525087746|
+--------------------+-------+-----------------+



                                                                                