# Feature Engineering

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
from pyspark.sql.functions import col, count, when, isnull
import pandas as pd
import geopandas as gpd

In [2]:
spark = (
    SparkSession.builder.appName("Feature_Engineering")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.driver.memory","4G")
    .config("spark.executor.memory","4G")
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

24/09/10 22:33:49 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/10 22:33:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/10 22:33:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/10 22:33:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/10 22:33:50 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/09/10 22:33:50 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


# Read datasets

In [3]:
consumer_detail  = spark.read.parquet('../data/curated/consumer_detail')
transaction = spark.read.parquet('../data/curated/transaction')
merchant = spark.read.parquet('../data/curated/merchant')
merchant_detail= spark.read.parquet('../data/curated/merchant_detail')
postcode_sa2_geo = gpd.read_file('../data/curated/postcode_sa2_geo.shp')
merged_external = spark.read.parquet('../data/curated/merged_external')
active_consumer_with_transaction  = spark.read.parquet('../data/curated/active_consumer_with_transaction')
transaction = spark.read.parquet('../data/curated/transaction/')
consumer_total_count_dollar = spark.read.parquet('../data/curated/consumer_total_count_dollar/')

                                                                                

### Convert data types

In [4]:
print('consumer_detail:')
consumer_detail.printSchema()

print('\ntransaction:')
transaction.printSchema()

print('\nmerchant:')
merchant.printSchema()

print('\npostcode_sa2_geo:')
print(postcode_sa2_geo.dtypes)

consumer_detail:
root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- gender: string (nullable = true)


transaction:
root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)


merchant:
root
 |-- merchant_abn: long (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- fraud_probability: double (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: float (nullable = true)


postcode_sa2_geo:
SA2_name        object
SA2_MAINCO       int64
postcode         int64
state           object
geometry      geometry
dtype: object


In [5]:
# convert the data type of postcode column in consumer_transaction dataset 
consumer_detail = consumer_detail.withColumn('postcode', col('postcode').cast('int'))
consumer_detail.printSchema()

root
 |-- consumer_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)



### Display dataset

In [6]:
consumer_detail.show(5)

                                                                                

+-----------+-------+-----------------+-----+--------+------+
|consumer_id|user_id|             name|state|postcode|gender|
+-----------+-------+-----------------+-----+--------+------+
|     870353| 213579|    Charles Davis|   SA|    5261|  Male|
|     923963| 213580|Jacqueline Nelson|  QLD|    4744|Female|
|      93016| 213581|    Carolyn Smith|  QLD|    4454|Female|
|      61324| 213582|      Denise Rush|   WA|    6705|Female|
|     823311| 213583|  Nathan Williams|  NSW|    2145|  Male|
+-----------+-------+-----------------+-----+--------+------+
only showing top 5 rows



In [7]:
consumer_detail.count()

499999

In [8]:
merchant_detail.show(5)

+--------------------+------------+--------------------+-------------+---------+
|                name|merchant_abn|            category|revenue_level|take_rate|
+--------------------+------------+--------------------+-------------+---------+
|       felis limited| 10023283211|furniture, home f...|            e|     0.18|
|arcu ac orci corp...| 10142254217|cable, satellite,...|            b|     4.22|
|    nunc sed company| 10165489824|jewelry, watch, c...|            b|      4.4|
|ultricies digniss...| 10187291046|watch, clock, and...|            b|     3.29|
| enim condimentum pc| 10192359162|music shops - mus...|            a|     6.33|
+--------------------+------------+--------------------+-------------+---------+
only showing top 5 rows



In [9]:
merchant_detail.count()

4026

In [10]:
merchant.show(5)

+------------+--------------+-----------------+--------------------+--------------------+-------------+---------+
|merchant_abn|order_datetime|fraud_probability|                name|            category|revenue_level|take_rate|
+------------+--------------+-----------------+--------------------+--------------------+-------------+---------+
| 10023283211|          NULL|             NULL|       felis limited|furniture, home f...|            e|     0.18|
| 10142254217|          NULL|             NULL|arcu ac orci corp...|cable, satellite,...|            b|     4.22|
| 10165489824|          NULL|             NULL|    nunc sed company|jewelry, watch, c...|            b|      4.4|
| 10187291046|          NULL|             NULL|ultricies digniss...|watch, clock, and...|            b|     3.29|
| 10192359162|          NULL|             NULL| enim condimentum pc|music shops - mus...|            a|     6.33|
+------------+--------------+-----------------+--------------------+--------------------

In [11]:
merchant.count()

4073

In [12]:
transaction.show(5)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  14935| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|
|      1| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|
|  14936| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|
|      1| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|
|  14936| 31101120643|25.228114942417797|8e301c0f-06ab-45c...|    2021-11-26|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



In [13]:
transaction.count()

13614675

# Compute total transaction count and value for each merchant

In [14]:
# group by merchant_abn and aggregate the count of orders and total dollar_value
transaction_detail = transaction.groupBy("merchant_abn").agg(
    count("order_id").alias("transaction_count"),  # assuming tip count is equivalent to the number of orders
    sum("dollar_value").alias("total_dollar_value")
)

transaction_detail.show(5)



+------------+-----------------+------------------+
|merchant_abn|transaction_count|total_dollar_value|
+------------+-----------------+------------------+
| 83412691377|            14288| 498536.5816973135|
| 38700038932|             7132| 9546185.360697312|
| 35344855546|             1522|134737.25046268434|
| 15613631617|             1785| 543030.5313328261|
| 19839532017|              726|          113982.0|
+------------+-----------------+------------------+
only showing top 5 rows



                                                                                

In [15]:
# save the parquet file
transaction_detail.write.parquet('../data/curated/transactions_count_dollar', mode='overwrite')

                                                                                

# Merge consumer and transaction by `user_id`

In [16]:
consumer_transaction = consumer_detail.join(
    transaction,
    on=['user_id'],
    how='inner'
)

consumer_transaction = consumer_detail.join(
    transaction,
    on=['user_id'],
    how='inner'
)

consumer_transaction.show(5)

+-------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+--------------+
|user_id|consumer_id|            name|state|postcode|gender|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+-----------+----------------+-----+--------+------+------------+------------------+--------------------+--------------+
|  14935|    1059280|   Cameron Adams|  QLD|    4563|  Male| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|
|      1|    1195503|Yolanda Williams|   WA|    6935|Female| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|
|  14936|     986886|     Maria Riley|   SA|    5157|Female| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|
|      1|    1195503|Yolanda Williams|   WA|    6935|Female| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|
|  14936|     986886|     Maria Riley|   SA|    5157|Female| 31101120643|25.228114942417797|8e301

In [17]:
consumer_transaction.count()

13614675

In [18]:
# save the parquet file
consumer_transaction.write.parquet('../data/curated/consumer_transaction', mode='overwrite')

                                                                                

# Compute total transaction count and value for each consumer

In [19]:
consumer_transaction.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)



In [20]:
# calculate the total number of transactions and total transaction amount for each consumer
consumer_total_count_dollar = (consumer_transaction
                               .groupBy("consumer_id")
                               .agg(
                                   count("order_id").alias("total_transaction_count"),
                                   sum("dollar_value").alias("total_dollar_value")
                                   ))

extra_columns_df = (consumer_transaction
                    .select("consumer_id", "user_id", "postcode")
                    .distinct())

consumer_total_count_dollar = (consumer_total_count_dollar
                               .join(extra_columns_df, on="consumer_id", how="left"))

consumer_total_count_dollar.show(5)


                                                                                

+-----------+-----------------------+------------------+-------+--------+
|consumer_id|total_transaction_count|total_dollar_value|user_id|postcode|
+-----------+-----------------------+------------------+-------+--------+
|    1144223|                    543| 77840.35704704828|  14971|    4184|
|     283969|                    581|100112.93516413833|  15024|    2228|
|    1050479|                    564|100977.04005104418|  15031|    5502|
|    1174371|                    562| 87222.61085702278|     95|    2352|
|     921339|                    591|134278.40943370626|    152|    2161|
+-----------+-----------------------+------------------+-------+--------+
only showing top 5 rows



In [21]:
# save as a parquet file
consumer_total_count_dollar.write.parquet('../data/curated/consumer_total_count_dollar', mode='overwrite')

                                                                                

# Add geometry information to previous table

In [22]:
consumer_total_count_dollar_df = consumer_total_count_dollar.toPandas()
consumer_total_count_dollar_geo = consumer_total_count_dollar_df.merge(
    postcode_sa2_geo,
    on='postcode', how='inner')
consumer_total_count_dollar_geo = consumer_total_count_dollar_geo.drop('SA2_MAINCO', axis=1)
consumer_total_count_dollar_geo.head()

                                                                                

Unnamed: 0,consumer_id,total_transaction_count,total_dollar_value,user_id,postcode,SA2_name,state,geometry
0,1144223,543,77840.357047,14971,4184,North Stradbroke Island,QLD,"MULTIPOLYGON (((153.3841 -27.51581, 153.38428 ..."
1,1144223,543,77840.357047,14971,4184,Southern Moreton Bay Islands,QLD,"MULTIPOLYGON (((153.31943 -27.68148, 153.31947..."
2,242288,558,87245.973644,20696,4184,North Stradbroke Island,QLD,"MULTIPOLYGON (((153.3841 -27.51581, 153.38428 ..."
3,242288,558,87245.973644,20696,4184,Southern Moreton Bay Islands,QLD,"MULTIPOLYGON (((153.31943 -27.68148, 153.31947..."
4,453040,586,83765.845889,3783,4184,North Stradbroke Island,QLD,"MULTIPOLYGON (((153.3841 -27.51581, 153.38428 ..."


In [23]:
consumer_total_count_dollar_geo.shape

(37159, 8)

In [24]:
consumer_total_count_dollar_bySA2_geo = consumer_total_count_dollar_geo.groupby('SA2_name', as_index=False).agg({
    'total_transaction_count': 'sum',
    'total_dollar_value': 'sum',
    'postcode': 'first',                # keep the first postcode of each SA2_name
    'state': 'first',
    'geometry': 'first'     
})
consumer_total_count_dollar_bySA2_geo.head()

Unnamed: 0,SA2_name,total_transaction_count,total_dollar_value,postcode,state,geometry
0,ACT - South West,13006,2002136.0,2902,ACT,"POLYGON ((148.88381 -35.26411, 148.94988 -35.2..."
1,APY Lands,9080,1400538.0,872,NT,"POLYGON ((129.00186 -26.72252, 129.00186 -26.7..."
2,Abbotsford,3369,527862.2,3067,VIC,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8..."
3,Aberfoyle Park,3959,624726.6,5159,SA,"POLYGON ((138.58963 -35.06584, 138.58993 -35.0..."
4,Acacia Gardens,4526,736920.1,2763,NSW,"POLYGON ((150.91593 -33.72971, 150.91661 -33.7..."


In [25]:
consumer_total_count_dollar_bySA2_geo.shape

(2311, 6)

In [26]:
# save the DataFrame as a CSV file
consumer_total_count_dollar_bySA2_geo.to_csv('../data/curated/consumer_total_count_dollar_bySA2_geo.csv', index=False)

In [27]:
# drop the geometry column
consumer_total_count_dollar_bySA2_df = consumer_total_count_dollar_bySA2_geo.drop(columns=['geometry'])

# convert the DataFrame as a Parquet file
consumer_total_count_dollar_bySA2 = spark.createDataFrame(consumer_total_count_dollar_bySA2_df)

# save the DataFrame as a Parquet file
consumer_total_count_dollar_bySA2.write.parquet('../data/curated/consumer_total_count_dollar_bySA2', mode='overwrite')

                                                                                

# Combine consumer details with all external datasets

In [28]:
# merge consumer detail with their transaction records
consumer_df = transaction.join(
    consumer_total_count_dollar,
    on=['user_id'],
    how='inner'
)
consumer_df.show(5)

                                                                                

+-------+------------+------------------+--------------------+--------------+-----------+-----------------------+------------------+--------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|consumer_id|total_transaction_count|total_dollar_value|postcode|
+-------+------------+------------------+--------------------+--------------+-----------+-----------------------+------------------+--------+
|     26| 43186523025| 48.78591885045311|38a3e0bd-d40d-494...|    2021-11-26|    1421465|                    563| 83854.12041701685|    3469|
|     26| 45629217853|171.26544320591705|cb90fda0-bd8d-4e1...|    2021-11-26|    1421465|                    563| 83854.12041701685|    3469|
|     29| 96880556465|10.398714717204657|2ff31f41-242a-43d...|    2021-11-26|    1134079|                    579|103585.52165326523|    4157|
|  15057| 82065156333| 35.74275633819245|cdd177be-8ace-4b4...|    2021-11-26|      66483|                    567| 97979.05980279764|    4613|
|  151

In [29]:
consumer_df.count()

                                                                                

13614675

In [30]:
# extract the fraud probability for the consumers who have transaction records
consumer_fraud = active_consumer_with_transaction.select(['user_id', 'order_datetime', 
                                                          'fraud_probability'])
consumer_fraud.show(5)

+-------+--------------+------------------+
|user_id|order_datetime| fraud_probability|
+-------+--------------+------------------+
|      1|    2022-02-20| 9.805431136520959|
|      2|    2021-08-30| 9.599513915425788|
|      2|    2021-09-25|10.069850934775245|
|      3|    2021-11-03| 8.300636455314633|
|      4|    2021-10-09| 9.633302411090419|
+-------+--------------+------------------+
only showing top 5 rows



In [31]:
# merge consumer fraud probabillity with full consumer transaction records
consumer_df2 = consumer_df.join(
    consumer_fraud,
    on=['user_id', 'order_datetime'],
    how='left'
)
consumer_df2.show(5)

                                                                                

+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+--------+-----------------+
|user_id|order_datetime|merchant_abn|      dollar_value|            order_id|consumer_id|total_transaction_count|total_dollar_value|postcode|fraud_probability|
+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+--------+-----------------+
|     26|    2021-11-26| 43186523025| 48.78591885045311|38a3e0bd-d40d-494...|    1421465|                    563| 83854.12041701685|    3469|             NULL|
|     26|    2021-11-26| 45629217853|171.26544320591705|cb90fda0-bd8d-4e1...|    1421465|                    563| 83854.12041701685|    3469|             NULL|
|     29|    2021-11-26| 96880556465|10.398714717204657|2ff31f41-242a-43d...|    1134079|                    579|103585.52165326523|    4157|             NULL|
|  15057|    2021-11-26| 82065156333| 35

In [32]:
consumer_df2.count()

                                                                                

13614854

In [33]:
# count the number of missing values ​​in the fraud_probability column
missing_fraud_prob_count = consumer_df2.filter(col('fraud_probability').isNull()).count()
print(f"Number of missing fraud_probability values: {missing_fraud_prob_count}")
print(f"Percentage of missing fraud_probability values: {missing_fraud_prob_count/13614854}")

                                                                                

Number of missing fraud_probability values: 13543038
Percentage of missing fraud_probability values: 0.9947251729618254


In [34]:
# merge full consumer transactions records with all external datasets
consumer_full = consumer_df2.join(
    merged_external,
    on="postcode",
    how="left"
)
consumer_full.show(5)

                                                                                

+--------+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+-----------------+--------------------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+-----+-----------------+
|postcode|user_id|order_datetime|merchant_abn|      dollar_value|            order_id|consumer_id|total_transaction_count|total_dollar_value|fraud_probability|            SA2_name|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|state|unemployment_rate|
+--------+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+-----------

In [35]:
consumer_full.count()

                                                                                

23819790

In [36]:
# count the number of missing values ​​in each column
missing_values_df = consumer_full.select([
    count(when(col(c).isNull(), c)).alias(c) for c in consumer_full.columns
])
missing_values_df.show()



+--------+-------+--------------+------------+------------+--------+-----------+-----------------------+------------------+-----------------+--------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+-------+-----------------+
|postcode|user_id|order_datetime|merchant_abn|dollar_value|order_id|consumer_id|total_transaction_count|total_dollar_value|fraud_probability|SA2_name|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|  state|unemployment_rate|
+--------+-------+--------------+------------+------------+--------+-----------+-----------------------+------------------+-----------------+--------+------------------+------------------+--------------------

                                                                                

The final consumer table (with consumer transaction records and details, and external datasets):

In [37]:
# drop the rows with NULLs
columns_to_check = [
    'SA2_name',
    'average_population',
    'Median_age_persons',
    'Median_mortgage_repay_monthly',
    'Median_tot_prsnl_inc_weekly',
    'Median_rent_weekly',
    'Median_tot_fam_inc_weekly',
    'Average_num_psns_per_bedroom',
    'Median_tot_hhd_inc_weekly',
    'Average_household_size',
    'state',
    'unemployment_rate'
]

consumer_full_cleaned = consumer_full.dropna(subset=columns_to_check)
consumer_full_cleaned.show(5)

                                                                                

+--------+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+-----------------+--------------------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+-----+-----------------+
|postcode|user_id|order_datetime|merchant_abn|      dollar_value|            order_id|consumer_id|total_transaction_count|total_dollar_value|fraud_probability|            SA2_name|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|state|unemployment_rate|
+--------+-------+--------------+------------+------------------+--------------------+-----------+-----------------------+------------------+-----------

In [38]:
consumer_full_cleaned.count()

                                                                                

21577841

In [39]:
print(f'Number of rows deleted:', 23819790-21577841)
print(f'Percentage of rows deleted:', (23819790-21577841)/23819790)

Number of rows deleted: 2241949
Percentage of rows deleted: 0.0941212747887366


In [48]:
# find the maximum and minimum values ​​of the order_datetime column
consumer_full_cleaned.select(min("order_datetime").alias("min_date"), max("order_datetime").alias("max_date")).show()



+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2021-02-28|2022-10-26|
+----------+----------+



                                                                                

Compute the number of missing values again, this is the number of missing values for consumer fraud probability:

In [40]:
# create a dictionary with column names and their respective null counts
null_count_dict = {col_name: sum(col(col_name).isNull().cast("int")).alias(col_name) for col_name in consumer_full_cleaned.columns}

# use agg() to calculate null counts for each column
null_counts_df = consumer_full_cleaned.agg(*null_count_dict.values())
null_counts_df.show()



+--------+-------+--------------+------------+------------+--------+-----------+-----------------------+------------------+-----------------+--------+------------------+------------------+-----------------------------+---------------------------+------------------+-------------------------+----------------------------+-------------------------+----------------------+-----+-----------------+
|postcode|user_id|order_datetime|merchant_abn|dollar_value|order_id|consumer_id|total_transaction_count|total_dollar_value|fraud_probability|SA2_name|average_population|Median_age_persons|Median_mortgage_repay_monthly|Median_tot_prsnl_inc_weekly|Median_rent_weekly|Median_tot_fam_inc_weekly|Average_num_psns_per_bedroom|Median_tot_hhd_inc_weekly|Average_household_size|state|unemployment_rate|
+--------+-------+--------------+------------+------------+--------+-----------+-----------------------+------------------+-----------------+--------+------------------+------------------+------------------------

                                                                                

In [41]:
# save as a parquet file
consumer_full_cleaned.write.parquet('../data/curated/consumer_full', mode='overwrite')

                                                                                

# Combine merchant details with their fraud probability

In [42]:
merchant_df= transaction.join(
    merchant_detail,
    on=['merchant_abn'],
    how='inner'
)
merchant_df.show(5)

+------------+-------+------------------+--------------------+--------------+--------------------+--------------------+-------------+---------+
|merchant_abn|user_id|      dollar_value|            order_id|order_datetime|                name|            category|revenue_level|take_rate|
+------------+-------+------------------+--------------------+--------------+--------------------+--------------------+-------------+---------+
| 79417999332|  14935|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|phasellus at company|gift, card, novel...|            b|     4.95|
| 46451548968|      1| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|tempus eu ligula ...|health and beauty...|            a|     6.04|
| 89518629617|  14936|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|vulputate velit e...|tent  and awning ...|            c|     3.09|
| 49167531725|      1| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|     felis institute|digital goods: bo...|            a|   

In [43]:
merchant_df.count()

13614675

In [44]:
merchant_select = merchant.select(['merchant_abn', 'order_datetime', 'fraud_probability'])
merchant_full= merchant_df.join(
    merchant_select,
    on=['merchant_abn', 'order_datetime'],
    how='left'
)
merchant_full.show(5)

+------------+--------------+-------+------------------+--------------------+--------------------+--------------------+-------------+---------+-----------------+
|merchant_abn|order_datetime|user_id|      dollar_value|            order_id|                name|            category|revenue_level|take_rate|fraud_probability|
+------------+--------------+-------+------------------+--------------------+--------------------+--------------------+-------------+---------+-----------------+
| 79417999332|    2021-11-26|  14935|136.06570809815838|23acbb7b-cf98-458...|phasellus at company|gift, card, novel...|            b|     4.95|             NULL|
| 46451548968|    2021-11-26|      1| 72.61581642788431|76bab304-fa2d-400...|tempus eu ligula ...|health and beauty...|            a|     6.04|             NULL|
| 89518629617|    2021-11-26|  14936|3.0783487174439297|a2ae446a-2959-41c...|vulputate velit e...|tent  and awning ...|            c|     3.09|             NULL|
| 49167531725|    2021-11-26

In [45]:
merchant_full.count()

                                                                                

13614675

In [49]:
# find the maximum and minimum values ​​of the order_datetime column
merchant_full.select(min("order_datetime").alias("min_date"), max("order_datetime").alias("max_date")).show()

+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2021-02-28|2022-10-26|
+----------+----------+



Compute the number of missing values again, this is the number of missing values for merchant fraud probability:

In [46]:
# create a dictionary with column names and their respective null counts
null_count_dict = {col_name: sum(col(col_name).isNull().cast("int")).alias(col_name) for col_name in merchant_full.columns}

# use agg() to calculate null counts for each column
null_counts_df = merchant_full.agg(*null_count_dict.values())
null_counts_df.show()



+------------+--------------+-------+------------+--------+----+--------+-------------+---------+-----------------+
|merchant_abn|order_datetime|user_id|dollar_value|order_id|name|category|revenue_level|take_rate|fraud_probability|
+------------+--------------+-------+------------+--------+----+--------+-------------+---------+-----------------+
|           0|             0|      0|           0|       0|   0|       0|            0|        0|         13610672|
+------------+--------------+-------+------------+--------+----+--------+-------------+---------+-----------------+



                                                                                

In [47]:
# save as a parquet file
merchant_full.write.parquet('../data/curated/merchant_full', mode='overwrite')

                                                                                