# Feature Engineering

This notebook is used to merge and aggregate datasets.

---

In [75]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
from pyspark.sql.functions import col, count, when, isnull
import pandas as pd
import geopandas as gpd

In [76]:
# Session settings applied to the builder before the session is created (or reused).
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.driver.memory": "4G",
    "spark.executor.memory": "4G",
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.sql.debug.maxToStringFields": 200,
}

session_builder = SparkSession.builder.appName("Feature_Engineering")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

# Read datasets

In [77]:
# Load the curated inputs from ../data/curated.
# Every Spark table is read exactly once (the second, redundant read of
# `transaction` from the same directory has been removed).
consumer_detail = spark.read.parquet('../data/curated/consumer_detail')
transaction = spark.read.parquet('../data/curated/transaction')
merchant = spark.read.parquet('../data/curated/merchant')
merchant_detail = spark.read.parquet('../data/curated/merchant_detail')
merged_external = spark.read.parquet('../data/curated/merged_external')
active_consumer_with_transaction = spark.read.parquet('../data/curated/active_consumer_with_transaction')

# The postcode -> SA2 lookup carries geometry, so it is read with geopandas rather than Spark.
postcode_sa2_geo = gpd.read_file('../data/curated/postcode_sa2_geo.shp')

### Convert data types

In [78]:
# Heading / DataFrame pairs for the Spark inputs whose schema is printed.
schema_report = (
    ('consumer_detail:', consumer_detail),
    ('\ntransaction:', transaction),
    ('\nmerchant:', merchant),
)
for heading, frame in schema_report:
    print(heading)
    frame.printSchema()

# The geo lookup is a (geo)pandas object, so report its dtypes instead of a Spark schema.
print('\npostcode_sa2_geo:')
print(postcode_sa2_geo.dtypes)

consumer_detail:
root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)


transaction:
root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)


merchant:
root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: double (nullable = true)
 |-- name: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)
 |-- processed_category: string (nullable = true)


postcode_sa2_geo:
SA2_name        object
SA2_MAINCO       int64
postcode         int64
state           object
geometry      geometry
dtype: object


In [79]:
# convert the postcode column of the consumer_detail dataset from string to integer,
# matching the integer postcode column of postcode_sa2_geo that it is later merged with
consumer_detail = consumer_detail.withColumn('postcode', col('postcode').cast('int'))
consumer_detail.printSchema()

root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



### Display dataset

In [80]:
# preview the first five rows of consumer_detail
consumer_detail.show(5)

+-----------+-------+-----------------+-----+--------+------+
|consumer_id|user_id|             name|state|postcode|gender|
+-----------+-------+-----------------+-----+--------+------+
|     870353| 213579|    Charles Davis|   SA|    5261|  Male|
|     923963| 213580|Jacqueline Nelson|  QLD|    4744|Female|
|      93016| 213581|    Carolyn Smith|  QLD|    4454|Female|
|      61324| 213582|      Denise Rush|   WA|    6705|Female|
|     823311| 213583|  Nathan Williams|  NSW|    2145|  Male|
+-----------+-------+-----------------+-----+--------+------+
only showing top 5 rows



In [81]:
# number of rows in consumer_detail
consumer_detail.count()

499999

In [82]:
# preview the first five rows of merchant_detail
merchant_detail.show(5)

+--------------------+------------+-------------+---------+--------------------+
|                name|merchant_abn|revenue_level|take_rate|  processed_category|
+--------------------+------------+-------------+---------+--------------------+
|fusce aliquet lim...| 17189523131|            c|     1.59|writing paper sup...|
|    cum sociis corp.| 22528859307|            a|      6.4|shop jewelry repa...|
|morbi vehicula li...| 32413511882|            b|     3.35|vehicle new part ...|
|vel nisl incorpor...| 32897338221|            a|      6.2|souvenir card sho...|
|    egestas sed inc.| 34082818630|            a|     6.86|digital book musi...|
+--------------------+------------+-------------+---------+--------------------+
only showing top 5 rows



In [83]:
# number of rows in merchant_detail
merchant_detail.count()

4026

In [84]:
# preview the merchant table: the merchant_detail columns plus order_datetime and fraud_probability
merchant.show(5)

+------------+--------------+-----------------+--------------------+-------------+---------+--------------------+
|merchant_abn|order_datetime|fraud_probability|                name|revenue_level|take_rate|  processed_category|
+------------+--------------+-----------------+--------------------+-------------+---------+--------------------+
| 17189523131|          NULL|             NULL|fusce aliquet lim...|            c|     1.59|writing paper sup...|
| 22528859307|          NULL|             NULL|    cum sociis corp.|            a|      6.4|shop jewelry repa...|
| 32413511882|          NULL|             NULL|morbi vehicula li...|            b|     3.35|vehicle new part ...|
| 32897338221|          NULL|             NULL|vel nisl incorpor...|            a|      6.2|souvenir card sho...|
| 34082818630|          NULL|             NULL|    egestas sed inc.|            a|     6.86|digital book musi...|
+------------+--------------+-----------------+--------------------+-------------+------

In [85]:
# number of rows in merchant
merchant.count()

4073

In [86]:
# preview the first five rows of transaction
transaction.show(5)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  14935| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|
|      1| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|
|  14936| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|
|      1| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|
|  14936| 31101120643|25.228114942417797|8e301c0f-06ab-45c...|    2021-11-26|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



In [87]:
# number of rows in transaction
transaction.count()

13614675

# Compute total transaction count and value for each merchant

In [88]:
# group by merchant_abn and aggregate the count of orders and total dollar_value
# NOTE: `sum` below is pyspark.sql.functions.sum (from the wildcard import), not the Python builtin
merchant_total_count_dollar = transaction.groupBy("merchant_abn").agg(
    count("order_id").alias("transaction_count"),  # counts the rows with a non-null order_id for each merchant
    sum("dollar_value").alias("total_dollar_value")
)

merchant_total_count_dollar.show(5)



+------------+-----------------+------------------+
|merchant_abn|transaction_count|total_dollar_value|
+------------+-----------------+------------------+
| 83412691377|            14288| 498536.5816973135|
| 38700038932|             7132| 9546185.360697312|
| 35344855546|             1522|134737.25046268434|
| 15613631617|             1785| 543030.5313328261|
| 19839532017|              726|          113982.0|
+------------+-----------------+------------------+
only showing top 5 rows



                                                                                

In [89]:
# save the per-merchant totals as parquet, replacing any previous output at this path
merchant_total_count_dollar.write.parquet('../data/curated/merchant_total_count_dollar', mode='overwrite')

                                                                                

# Merge consumer and transaction by `user_id`

In [90]:
# Attach consumer details to every transaction.
# An inner join on user_id keeps only transactions whose user_id exists in
# consumer_detail. The join is defined once (the duplicated, identical second
# assignment has been removed).
consumer_transaction = consumer_detail.join(
    transaction,
    on=['user_id'],
    how='inner'
)

consumer_transaction.show(5)

+-------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+--------------+
|user_id|consumer_id|            name|state|postcode|gender|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+--------------+
|  14935|    1059280|   Cameron Adams|  QLD|    4563|  Male| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|
|      1|    1195503|Yolanda Williams|   WA|    6935|Female| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|
|  14936|     986886|     Maria Riley|   SA|    5157|Female| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|
|      1|    1195503|Yolanda Williams|   WA|    6935|Female| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|
|  14936|     986886|     Maria Riley|   SA|    5157|Female| 31101120643|25.228114942417797|8e301

In [91]:
# number of rows after the join (compare with the transaction row count above)
consumer_transaction.count()

13614675

In [92]:
# save the joined consumer/transaction table as parquet, replacing any previous output at this path
consumer_transaction.write.parquet('../data/curated/consumer_transaction', mode='overwrite')

                                                                                

# Compute total transaction count and value for each consumer

In [93]:
# list the columns available for the per-consumer aggregation below
consumer_transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [94]:
# Per-consumer totals: number of orders and summed dollar value.
totals_per_consumer = consumer_transaction.groupBy("consumer_id").agg(
    count("order_id").alias("total_transaction_count"),
    sum("dollar_value").alias("total_dollar_value"),
)

# Distinct (consumer_id, user_id, postcode) combinations, used to carry the
# identifying columns alongside the totals.
extra_columns_df = consumer_transaction.select("consumer_id", "user_id", "postcode").distinct()

# Left join so every aggregated consumer row is kept.
consumer_total_count_dollar = totals_per_consumer.join(
    extra_columns_df,
    on="consumer_id",
    how="left",
)

consumer_total_count_dollar.show(5)




+-----------+-----------------------+------------------+-------+--------+
|consumer_id|total_transaction_count|total_dollar_value|user_id|postcode|
+-----------+-----------------------+------------------+-------+--------+
|    1144223|                    543| 77840.35704704828|  14971|    4184|
|     283969|                    581|100112.93516413833|  15024|    2228|
|    1050479|                    564|100977.04005104418|  15031|    5502|
|    1174371|                    562| 87222.61085702278|     95|    2352|
|     921339|                    591|134278.40943370626|    152|    2161|
+-----------+-----------------------+------------------+-------+--------+
only showing top 5 rows



                                                                                

In [95]:
# save the per-consumer totals as parquet, replacing any previous output at this path
consumer_total_count_dollar.write.parquet('../data/curated/consumer_total_count_dollar', mode='overwrite')

                                                                                

# Add geometry information to previous table

In [96]:
# collect the per-consumer totals to the driver as a pandas DataFrame
consumer_total_count_dollar_df = consumer_total_count_dollar.toPandas()
# inner merge on postcode attaches SA2_name, state and geometry; consumers whose
# postcode is absent from postcode_sa2_geo are dropped
# NOTE(review): a postcode can match several SA2 rows, in which case the consumer's
# totals are repeated once per SA2 — confirm this is acceptable before summing by SA2_name
consumer_total_count_dollar_geo = consumer_total_count_dollar_df.merge(
    postcode_sa2_geo,
    on='postcode', how='inner')
# the SA2 code is not needed downstream; SA2_name is kept as the region identifier
consumer_total_count_dollar_geo = consumer_total_count_dollar_geo.drop('SA2_MAINCO', axis=1)
consumer_total_count_dollar_geo.head()

                                                                                

Unnamed: 0,consumer_id,total_transaction_count,total_dollar_value,user_id,postcode,SA2_name,state,geometry
0,1144223,543,77840.357047,14971,4184,North Stradbroke Island,QLD,"MULTIPOLYGON (((153.3841 -27.51581, 153.38428 ..."
1,1144223,543,77840.357047,14971,4184,Southern Moreton Bay Islands,QLD,"MULTIPOLYGON (((153.31943 -27.68148, 153.31947..."
2,242288,558,87245.973644,20696,4184,North Stradbroke Island,QLD,"MULTIPOLYGON (((153.3841 -27.51581, 153.38428 ..."
3,242288,558,87245.973644,20696,4184,Southern Moreton Bay Islands,QLD,"MULTIPOLYGON (((153.31943 -27.68148, 153.31947..."
4,453040,586,83765.845889,3783,4184,North Stradbroke Island,QLD,"MULTIPOLYGON (((153.3841 -27.51581, 153.38428 ..."


In [97]:
# (rows, columns) after the merge with the postcode/SA2 lookup
consumer_total_count_dollar_geo.shape

(37159, 8)

In [98]:
# How each column is collapsed when rolling consumer rows up to one row per SA2_name:
# the two totals are summed; postcode, state and geometry take the first value seen.
sa2_aggregations = {
    'total_transaction_count': 'sum',
    'total_dollar_value': 'sum',
    'postcode': 'first',
    'state': 'first',
    'geometry': 'first',
}

consumer_total_count_dollar_bySA2_geo = (
    consumer_total_count_dollar_geo
    .groupby('SA2_name', as_index=False)
    .agg(sa2_aggregations)
)
consumer_total_count_dollar_bySA2_geo.head()

Unnamed: 0,SA2_name,total_transaction_count,total_dollar_value,postcode,state,geometry
0,ACT - South West,13006,2002136.0,2902,ACT,"POLYGON ((148.88381 -35.26411, 148.94988 -35.2..."
1,APY Lands,9080,1400538.0,872,NT,"POLYGON ((129.00186 -26.72252, 129.00186 -26.7..."
2,Abbotsford,3369,527862.2,3067,VIC,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8..."
3,Aberfoyle Park,3959,624726.6,5159,SA,"POLYGON ((138.58963 -35.06584, 138.58993 -35.0..."
4,Acacia Gardens,4526,736920.1,2763,NSW,"POLYGON ((150.91593 -33.72971, 150.91661 -33.7..."


In [99]:
# (rows, columns) of the SA2-level table: one row per SA2_name
consumer_total_count_dollar_bySA2_geo.shape

(2311, 6)

In [100]:
# save the SA2-level table (including the geometry column) as a CSV file, without the index
consumer_total_count_dollar_bySA2_geo.to_csv('../data/curated/consumer_total_count_dollar_bySA2_geo.csv', index=False)

In [101]:
# drop the geometry column before handing the table to Spark
consumer_total_count_dollar_bySA2_df = consumer_total_count_dollar_bySA2_geo.drop(columns=['geometry'])

# convert the pandas DataFrame to a Spark DataFrame
consumer_total_count_dollar_bySA2 = spark.createDataFrame(consumer_total_count_dollar_bySA2_df)

# save the Spark DataFrame as a Parquet file, replacing any previous output at this path
consumer_total_count_dollar_bySA2.write.parquet('../data/curated/consumer_total_count_dollar_bySA2', mode='overwrite')

# Aggregate the external dataset by postcode

In [102]:
# preview the SA2-level external dataset
merged_external.show(5)

+--------------------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+--------+-----+-----------------+
|            SA2_name|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|postcode|state|unemployment_rate|
+--------------------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+--------+-----+-----------------+
|Lambton - New Lam...|           17374.0|                40|                         2119|                        907|               390|                     2484|            

In [103]:
# Aggregate by the `postcode` column: sum the population, average every other
# numerical column, and take the first value for the categorical columns.
# NOTE: `sum`, `avg` and `first` are the pyspark.sql.functions versions (wildcard import).
# NOTE(review): `first` returns whichever row Spark encounters first in each group, so the
# SA2_name/state kept for a postcode spanning several SA2s may vary between runs — confirm this is acceptable.
merged_external_by_postcode = merged_external.groupBy("postcode").agg(
    first("SA2_name").alias("SA2_name"),
    sum("average_population").alias("average_population"),
    avg("Median_age_persons").alias("avg_age_persons"),
    avg("Median_mortgage_repay_monthly").alias("avg_mortgage_repay_monthly"),
    avg("Median_tot_prsnl_inc_weekly").alias("avg_tot_prsnl_inc_weekly"),
    avg("Median_rent_weekly").alias("avg_rent_weekly"),
    avg("Median_tot_fam_inc_weekly").alias("avg_tot_fam_inc_weekly"),
    avg("Average_num_psns_per_bedroom").alias("avg_num_psns_per_bedroom"),
    avg("Median_tot_hhd_inc_weekly").alias("avg_tot_hhd_inc_weekly"),
    avg("Average_household_size").alias("avg_household_size"),
    first("state").alias("state"),
    avg("unemployment_rate").alias("avg_unemployment_rate")
)

merged_external_by_postcode.show(5)

+--------+--------------+------------------+------------------+--------------------------+------------------------+------------------+----------------------+------------------------+----------------------+------------------+-----+---------------------+
|postcode|      SA2_name|average_population|   avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|   avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|state|avg_unemployment_rate|
+--------+--------------+------------------+------------------+--------------------------+------------------------+------------------+----------------------+------------------------+----------------------+------------------+-----+---------------------+
|     800|   Darwin City|           15079.0|              33.0|                    1781.0|                  1236.0|             450.0|                2403.0|                     1.0|                2151.0|               2.0|   NT|   3.349999

In [104]:
# number of distinct postcodes in the aggregated external dataset (one row per postcode)
merged_external_by_postcode.count()

2643

In [105]:
# save the postcode-level external dataset as parquet, replacing any previous output at this path
merged_external_by_postcode.write.parquet('../data/curated/merged_external_by_postcode', mode='overwrite')

# Combine consumer details with all external datasets

In [106]:
# row count of consumer_transaction, as a baseline for the joins below
consumer_transaction.count()

13614675

In [107]:
# Keep only the join keys and the fraud probability from the active-consumer table.
consumer_fraud = active_consumer_with_transaction.select(
    'user_id',
    'order_datetime',
    'fraud_probability',
)
consumer_fraud.show(5)

+-------+--------------+------------------+
|user_id|order_datetime| fraud_probability|
+-------+--------------+------------------+
|      2|    2021-08-30| 9.599513915425788|
|      4|    2021-10-09| 9.633302411090419|
|      5|    2021-10-04|10.868364868449886|
|      5|    2022-02-08|  9.02022421158597|
|      8|          NULL|              NULL|
+-------+--------------+------------------+
only showing top 5 rows



In [108]:
# number of distinct user_id values in consumer_fraud
consumer_fraud.select('user_id').distinct().count()

24081

In [109]:
# Attach the consumer fraud probability to every consumer transaction record.
# The left join keeps all transactions; those without a fraud record for the same
# user and date get a NULL fraud_probability.
fraud_join_keys = ['user_id', 'order_datetime']
consumer_df = consumer_transaction.join(consumer_fraud, on=fraud_join_keys, how='left')
consumer_df.show(5)

+-------+--------------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+-----------------+
|user_id|order_datetime|consumer_id|            name|state|postcode|gender|merchant_abn|      dollar_value|            order_id|fraud_probability|
+-------+--------------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+-----------------+
|  14935|    2021-11-26|    1059280|   Cameron Adams|  QLD|    4563|  Male| 79417999332|136.06570809815838|23acbb7b-cf98-458...|             NULL|
|      1|    2021-11-26|    1195503|Yolanda Williams|   WA|    6935|Female| 46451548968| 72.61581642788431|76bab304-fa2d-400...|             NULL|
|  14936|    2021-11-26|     986886|     Maria Riley|   SA|    5157|Female| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|             NULL|
|      1|    2021-11-26|    1195503|Yolanda Williams|   WA|    6935|Female| 49167531725| 51.58228625503599|7080c274-17

In [110]:
# row count after the left join with consumer_fraud
consumer_df.count()

                                                                                

13614675

In [111]:
# number of distinct user_id values in consumer_df
consumer_df.select('user_id').distinct().count()

                                                                                

24081

In [112]:
# Count the total rows and the rows with a missing fraud_probability in a single pass,
# so the denominator always reflects the current data instead of a hard-coded row count.
fraud_null_summary = consumer_df.agg(
    count("*").alias("total_rows"),
    count(when(col("fraud_probability").isNull(), 1)).alias("missing_rows"),
).first()
total_consumer_rows = fraud_null_summary["total_rows"]
missing_fraud_prob_count = fraud_null_summary["missing_rows"]

# Guard against an empty table to avoid a division by zero.
missing_fraud_prob_share = missing_fraud_prob_count / total_consumer_rows if total_consumer_rows else 0.0

print(f"Number of missing fraud_probability values: {missing_fraud_prob_count}")
# Format as a real percentage; the previous output printed a fraction under a "Percentage" label.
print(f"Percentage of missing fraud_probability values: {missing_fraud_prob_share:.2%}")

Number of missing fraud_probability values: 13543038
Percentage of missing fraud_probability values: 0.9947382511885153


                                                                                

In [113]:
# Postcode-level external features without the descriptive columns that are not
# carried into the consumer table (`state` already exists on the consumer side).
external_features = merged_external_by_postcode.drop('state', 'SA2_name')

# Left join so every consumer transaction is kept, even when its postcode has no external data.
consumer_full = consumer_df.join(external_features, on="postcode", how="left")
consumer_full.show(5)

+--------+-------+--------------+-----------+----------------+-----+------+------------+------------------+--------------------+-----------------+------------------+------------------+--------------------------+------------------------+-----------------+----------------------+------------------------+----------------------+------------------+---------------------+
|postcode|user_id|order_datetime|consumer_id|            name|state|gender|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|   avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|  avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+--------+-------+--------------+-----------+----------------+-----+------+------------+------------------+--------------------+-----------------+------------------+------------------+--------------------------+------------------------+-----------------+------------

In [114]:
# row count after the left join with the external features
consumer_full.count()

                                                                                

13614675

In [115]:
# count the number of missing values in each column:
# when(...) yields a non-null value only for NULL cells, and count() counts non-null values
missing_values_df = consumer_full.select([
    count(when(col(c).isNull(), c)).alias(c) for c in consumer_full.columns
])
missing_values_df.show()

                                                                                

+--------+-------+--------------+-----------+----+-----+------+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+
|postcode|user_id|order_datetime|consumer_id|name|state|gender|merchant_abn|dollar_value|order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+--------+-------+--------------+-----------+----+-----+------+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+
|   

The above result means there are 2,241,930 records that do not have related external dataset information, so we drop them.

Find the rows that need to be dropped:

In [116]:
# list of columns to check for NULLs (all columns contributed by the external dataset)
columns_to_check = [
    'average_population',
    'avg_age_persons',
    'avg_mortgage_repay_monthly',
    'avg_tot_prsnl_inc_weekly',
    'avg_rent_weekly',
    'avg_tot_fam_inc_weekly',
    'avg_num_psns_per_bedroom',
    'avg_tot_hhd_inc_weekly',
    'avg_household_size',
    'avg_unemployment_rate'
]

# Build a condition matching rows where ANY of the listed columns is NULL.
# The cleaned table is produced with `dropna(subset=columns_to_check)`, whose default
# is how='any'. Using OR here makes `consumer_full_dropped` the exact complement of
# that cleaned table, which matters because its order_ids are later used to remove
# the same transactions from the merchant table. (The previous AND condition only
# matched rows where every column was NULL.)
condition = col(columns_to_check[0]).isNull()
for c in columns_to_check[1:]:
    condition = condition | col(c).isNull()

# filter the DataFrame to show the rows that will be dropped
consumer_full_dropped = consumer_full.filter(condition)
consumer_full_dropped.show(5)

+--------+-------+--------------+-----------+----------------+-----+------+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+
|postcode|user_id|order_datetime|consumer_id|            name|state|gender|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+--------+-------+--------------+-----------+----------------+-----+------+------------+------------------+--------------------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+----

In [117]:
# number of rows selected for removal
consumer_full_dropped.count()

                                                                                

2241930

The final consumer table (with consumer transaction records and details, and external datasets):

In [118]:
# drop every row that has a NULL in any of the external-feature columns
# (dropna defaults to how='any' when only `subset` is given)
consumer_full_cleaned = consumer_full.dropna(subset=columns_to_check)
consumer_full_cleaned.show(5)

+--------+-------+--------------+-----------+-------------+-----+------+------------+------------------+--------------------+-----------------+------------------+------------------+--------------------------+------------------------+-----------------+----------------------+------------------------+----------------------+------------------+---------------------+
|postcode|user_id|order_datetime|consumer_id|         name|state|gender|merchant_abn|      dollar_value|            order_id|fraud_probability|average_population|   avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|  avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+--------+-------+--------------+-----------+-------------+-----+------+------------+------------------+--------------------+-----------------+------------------+------------------+--------------------------+------------------------+-----------------+---------------------

In [119]:
# number of rows remaining after the NULL rows were dropped
consumer_full_cleaned.count()

                                                                                

11372745

In [120]:
# Derive the figures from the DataFrames instead of hard-coding row counts, so the
# report stays correct when the input data changes. Each count() triggers a Spark job.
rows_before_cleaning = consumer_full.count()
rows_after_cleaning = consumer_full_cleaned.count()
rows_deleted = rows_before_cleaning - rows_after_cleaning

# Guard against an empty table to avoid a division by zero.
rows_deleted_share = rows_deleted / rows_before_cleaning if rows_before_cleaning else 0.0

print('Number of rows deleted:', rows_deleted)
# Format as a real percentage; the previous output printed a fraction under a "Percentage" label.
print('Percentage of rows deleted:', f'{rows_deleted_share:.2%}')

Number of rows deleted: 2241930
Percentage of rows deleted: 0.16467010780646618


In [121]:
# find the minimum and maximum values of the order_datetime column
# (`min` / `max` are the pyspark.sql.functions aggregates from the wildcard import, not the Python builtins)
consumer_full_cleaned.select(min("order_datetime").alias("min_date"), max("order_datetime").alias("max_date")).show()

+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2021-02-28|2022-10-26|
+----------+----------+



                                                                                

Compute the number of missing values again. The only missing values that remain are in the consumer fraud probability column:

In [122]:
# Map each column name to an aggregate expression that counts its NULLs:
# isNull() cast to int is 1 for NULL cells and 0 otherwise, so the sum is the NULL count.
null_count_dict = {}
for col_name in consumer_full_cleaned.columns:
    null_flag = col(col_name).isNull().cast("int")
    null_count_dict[col_name] = sum(null_flag).alias(col_name)

# Evaluate all per-column NULL counts in a single aggregation.
null_counts_df = consumer_full_cleaned.agg(*null_count_dict.values())
null_counts_df.show()



+--------+-------+--------------+-----------+----+-----+------+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+
|postcode|user_id|order_datetime|consumer_id|name|state|gender|merchant_abn|dollar_value|order_id|fraud_probability|average_population|avg_age_persons|avg_mortgage_repay_monthly|avg_tot_prsnl_inc_weekly|avg_rent_weekly|avg_tot_fam_inc_weekly|avg_num_psns_per_bedroom|avg_tot_hhd_inc_weekly|avg_household_size|avg_unemployment_rate|
+--------+-------+--------------+-----------+----+-----+------+------------+------------+--------+-----------------+------------------+---------------+--------------------------+------------------------+---------------+----------------------+------------------------+----------------------+------------------+---------------------+
|   

                                                                                

In [123]:
# save the cleaned consumer table as parquet (note: the output directory is named `consumer_full`),
# replacing any previous output at this path
consumer_full_cleaned.write.parquet('../data/curated/consumer_full', mode='overwrite')

                                                                                

# Combine merchant details with their fraud probability

In [124]:
# Attach merchant details to every transaction; the inner join keeps only
# transactions whose merchant_abn exists in merchant_detail.
merchant_df = transaction.join(merchant_detail, on=['merchant_abn'], how='inner')
merchant_df.show(5)

+------------+-------+------------------+--------------------+--------------+--------------------+-------------+---------+--------------------+
|merchant_abn|user_id|      dollar_value|            order_id|order_datetime|                name|revenue_level|take_rate|  processed_category|
+------------+-------+------------------+--------------------+--------------+--------------------+-------------+---------+--------------------+
| 79417999332|  14935|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|phasellus at company|            b|     4.95|souvenir card sho...|
| 46451548968|      1| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|tempus eu ligula ...|            a|     6.04|and beauty health...|
| 89518629617|  14936|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|vulputate velit e...|            c|     3.09|and shop awning tent|
| 49167531725|      1| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|     felis institute|            a|     6.42|digital book m

In [125]:
# row count after joining transactions with merchant details
merchant_df.count()

13614675

In [126]:
# Merchant fraud probability keyed by merchant and order date.
merchant_select = merchant.select('merchant_abn', 'order_datetime', 'fraud_probability')

# Left join so every transaction is kept; those without a fraud record for the same
# merchant and date get a NULL fraud_probability.
merchant_fraud_keys = ['merchant_abn', 'order_datetime']
merchant_full = merchant_df.join(merchant_select, on=merchant_fraud_keys, how='left')
merchant_full.show(5)

+------------+--------------+-------+------------------+--------------------+--------------------+-------------+---------+--------------------+-----------------+
|merchant_abn|order_datetime|user_id|      dollar_value|            order_id|                name|revenue_level|take_rate|  processed_category|fraud_probability|
+------------+--------------+-------+------------------+--------------------+--------------------+-------------+---------+--------------------+-----------------+
| 79417999332|    2021-11-26|  14935|136.06570809815838|23acbb7b-cf98-458...|phasellus at company|            b|     4.95|souvenir card sho...|             NULL|
| 46451548968|    2021-11-26|      1| 72.61581642788431|76bab304-fa2d-400...|tempus eu ligula ...|            a|     6.04|and beauty health...|             NULL|
| 89518629617|    2021-11-26|  14936|3.0783487174439297|a2ae446a-2959-41c...|vulputate velit e...|            c|     3.09|and shop awning tent|             NULL|
| 49167531725|    2021-11-26

In [127]:
# row count after the left join with the merchant fraud probability
merchant_full.count()

                                                                                

13614675

Drop the rows that were removed from `consumer_full`:

In [128]:
# extract the distinct order_id values of the rows removed from the consumer table
order_ids_df = consumer_full_dropped.select('order_id').distinct()

# filter merchant_full with a left anti join: keep only the rows whose order_id
# does NOT appear in order_ids_df
# NOTE(review): this assumes order_id identifies a single transaction — confirm it is unique
merchant_filtered = merchant_full.join(
    order_ids_df,
    on='order_id',
    how='left_anti'
)
merchant_filtered.count()

                                                                                

11372745

In [129]:
# find the minimum and maximum values of the order_datetime column
# (`min` / `max` are the pyspark.sql.functions aggregates from the wildcard import, not the Python builtins)
merchant_filtered.select(min("order_datetime").alias("min_date"), max("order_datetime").alias("max_date")).show()



+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2021-02-28|2022-10-26|
+----------+----------+



                                                                                

Compute the revenue for each transaction:

In [130]:
# transaction_revenue: the part of dollar_value that is left after deducting
# the take_rate percentage (multiply first, then divide by 100)
remaining_percentage = 100 - col("take_rate")
merchant_filtered = merchant_filtered.withColumn(
    "transaction_revenue",
    (col("dollar_value") * remaining_percentage) / 100,
)
merchant_filtered.show(5)



+--------------------+------------+--------------+-------+-----------------+--------------------+-------------+---------+--------------------+-----------------+-------------------+
|            order_id|merchant_abn|order_datetime|user_id|     dollar_value|                name|revenue_level|take_rate|  processed_category|fraud_probability|transaction_revenue|
+--------------------+------------+--------------+-------+-----------------+--------------------+-------------+---------+--------------------+-----------------+-------------------+
|0000141d-a22c-4fc...| 49322182190|    2021-10-11|  22405|77.78061012666785|gravida mauris in...|            a|     6.35|shop jewelry repa...|             NULL|  72.84154257046237|
|0000157a-5ae1-4dd...| 11121775571|    2021-11-09|  17063|95.72482420618412|egestas nunc asso...|            a|     6.58|digital book musi...|             NULL|  89.42612902064333|
|0000288a-40dc-4d7...| 15061716544|    2021-06-25|   6681|85.93800442618074|tempus scelerisqu..

                                                                                

Compute the revenue for BNPL:

In [131]:
# BNPL_revenue: the take_rate percentage of dollar_value
# (multiply first, then divide by 100)
take_rate_amount = col("dollar_value") * col("take_rate")
merchant_filtered = merchant_filtered.withColumn("BNPL_revenue", take_rate_amount / 100)
merchant_filtered.show(5)



+--------------------+------------+--------------+-------+-----------------+--------------------+-------------+---------+--------------------+-----------------+-------------------+------------------+
|            order_id|merchant_abn|order_datetime|user_id|     dollar_value|                name|revenue_level|take_rate|  processed_category|fraud_probability|transaction_revenue|      BNPL_revenue|
+--------------------+------------+--------------+-------+-----------------+--------------------+-------------+---------+--------------------+-----------------+-------------------+------------------+
|0000141d-a22c-4fc...| 49322182190|    2021-10-11|  22405|77.78061012666785|gravida mauris in...|            a|     6.35|shop jewelry repa...|             NULL|  72.84154257046237| 4.939068668866039|
|0000157a-5ae1-4dd...| 11121775571|    2021-11-09|  17063|95.72482420618412|egestas nunc asso...|            a|     6.58|digital book musi...|             NULL|  89.42612902064333| 6.298693359734671|


                                                                                

Compute the number of missing values in each column; the only column with missing values is the merchant fraud probability:

In [132]:
# map each column name to an aggregate expression that sums a 0/1 null
# indicator, i.e. counts the NULLs in that column
null_count_dict = {}
for col_name in merchant_filtered.columns:
    null_flag = col(col_name).isNull().cast("int")
    null_count_dict[col_name] = sum(null_flag).alias(col_name)

# evaluate every per-column null count in a single agg() call
null_counts_df = merchant_filtered.agg(*null_count_dict.values())
null_counts_df.show()



+--------+------------+--------------+-------+------------+----+-------------+---------+------------------+-----------------+-------------------+------------+
|order_id|merchant_abn|order_datetime|user_id|dollar_value|name|revenue_level|take_rate|processed_category|fraud_probability|transaction_revenue|BNPL_revenue|
+--------+------------+--------------+-------+------------+----+-------------+---------+------------------+-----------------+-------------------+------------+
|       0|           0|             0|      0|           0|   0|            0|        0|                 0|         11369426|                  0|           0|
+--------+------------+--------------+-------+------------+----+-------------+---------+------------------+-----------------+-------------------+------------+



                                                                                

In [133]:
# persist the result as parquet, replacing any existing output at this path
merchant_filtered.write.mode('overwrite').parquet('../data/curated/merchant_full')

                                                                                