In [18]:
# Loading library
import os
os.sys.path.append("../")
from scripts.etl_pipeline import *

In [19]:
# Create a Spark Session
spark = (
    SparkSession.builder.appName("ETL Pipeline")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Extract

As we can't use `urlretrieve` to get the data from Canvas, please download it to your local machine and move it `data/tables`. Then run the code below to unzip the files

In [20]:
# Assign data path
raw_path = "../data"

# Unzip files (Only run once)
"""
for file in os.listdir(f"{raw_path}/tables"):
    if file == ".gitkeep":
        continue
    with zipfile.ZipFile(f"{raw_path}/tables/{file}", "r") as zip_ref:
        zip_ref.extractall(f"{raw_path}/")
    os.remove(f"{raw_path}/tables/{file}")
"""

'\nfor file in os.listdir(f"{raw_path}/tables"):\n    if file == ".gitkeep":\n        continue\n    with zipfile.ZipFile(f"{raw_path}/tables/{file}", "r") as zip_ref:\n        zip_ref.extractall(f"{raw_path}/")\n    os.remove(f"{raw_path}/tables/{file}")\n'

# Transform

The system use `user_id` as a key for identifying customer in transactions record and fraud probability tables. However, they also have a key-value map of `user_id` and `consumer_id`. We will use `consumer_id` as the only ID for customer. Thus, we will map `user_id` from each table to `consumer_id` and drop the former.


In [21]:
# Load consumer user details -> a key:value map for user_id to consumer_id
consumer_user_map = spark.read.parquet(f"{raw_path}/tables/consumer_user_details.parquet")
consumer_user_map.limit(5) # Preview

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [22]:
# Load consumer fraud rate dataset
consumer_fraud_rate = spark.read.csv(f"{raw_path}/tables/consumer_fraud_probability.csv", header=True, inferSchema=True)
consumer_fraud_rate.show(5)
get_dataset_count(consumer_fraud_rate)

# Replace all user_id with unique consumer_id
consumer_fraud_rate = replace_id(consumer_user_map, consumer_fraud_rate)
consumer_fraud_rate.show(5)

# Check to make sure no rows were lost on the inner join
get_dataset_count(consumer_fraud_rate)

+-------+--------------+-----------------+
|user_id|order_datetime|fraud_probability|
+-------+--------------+-----------------+
|   6228|    2021-12-19| 97.6298077657765|
|  21419|    2021-12-10|99.24738020302328|
|   5606|    2021-10-17|84.05825045251777|
|   3101|    2021-04-17|91.42192091901347|
|  22239|    2021-10-19|94.70342477508035|
+-------+--------------+-----------------+
only showing top 5 rows

The dataset count is  34864
+--------------+------------------+-----------+
|order_datetime| fraud_probability|consumer_id|
+--------------+------------------+-----------+
|    2022-02-20| 9.805431136520959|    1195503|
|    2021-08-30| 9.599513915425788|     179208|
|    2021-09-25|10.069850934775245|     179208|
|    2021-11-03| 8.300636455314633|    1194530|
|    2021-10-09| 9.633302411090419|     154128|
+--------------+------------------+-----------+
only showing top 5 rows

The dataset count is  34864


We see that there is no change in the number of entries upon an inner join.

In [23]:
# Load all the transaction data 
transaction_p1 = spark.read.parquet(f"{raw_path}/tables/transactions_20210228_20210827_snapshot")
transaction_p2 = spark.read.parquet(f"{raw_path}/tables/transactions_20210828_20220227_snapshot")
transaction_p3 = spark.read.parquet(f"{raw_path}/tables/transactions_20220228_20220828_snapshot")

# Combine the datasets
transaction_records = reduce(DataFrame.unionAll, [transaction_p1, transaction_p2, transaction_p3])
get_dataset_count(transaction_records)

# Replace user_id with consumer_id after combining
transaction_records = replace_id(consumer_user_map, transaction_records)

# Check to make sure no rows were lost on the inner join
get_dataset_count(transaction_records)

The dataset count is  14195505
The dataset count is  14195505


## Cleaning

Now that replacing `user_id` to `consumer_id` is done, load all other data and clean them. We start off with the merchant fraud probability.

In [24]:
# Load consumer fraud rate dataset
merchant_fraud_rate = spark.read.csv(f"{raw_path}/tables/merchant_fraud_probability.csv", header=True, inferSchema=True)
merchant_fraud_rate.show(5)
get_dataset_count(merchant_fraud_rate)

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 19492220327|    2021-11-28|44.403658647495355|
| 31334588839|    2021-10-02| 42.75530083865367|
| 19492220327|    2021-12-22|38.867790051131095|
| 82999039227|    2021-12-19|  94.1347004808891|
| 90918180829|    2021-09-02| 43.32551731714902|
+------------+--------------+------------------+
only showing top 5 rows

The dataset count is  114


Cleaning `tbl_merchants.parquet`. The feature `tags` is a string that represents either a tuple or a list, containing 3 elements:
* Items that are being sold
* Revenue levels
* Commission rate

Each elements either a list, a tuple, or a combination of both (e.g starts with `[` and ends with `)` and vice versa). These inconsistencies are mostly due to human errors. Thus, we need to take into account these consistent when splitting the values of the feature `tags` into separate columns

In [25]:
# Load merchant's info
merchant_info = spark.read.parquet(f"{raw_path}/tables/tbl_merchants.parquet")
merchant_info.show(5, truncate=False)

# Clean the data
merchant_info = clean_merchant_details(merchant_info)
merchant_info.show(5, truncate=False)

+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+
|name                                |tags                                                                                                             |merchant_abn|
+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+
|Felis Limited                       |((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))|10023283211 |
|Arcu Ac Orci Corporation            |([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])                        |10142254217 |
|Nunc Sed Company                    |([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])                                          |10165489824 |
|Ult

The data on consumer's basic information is a single column that contains the consumer's name, address, state, postcode, gender, and their unqiue consumer ID, each separated by "`|`". Thus, we will need to split these into individual columns. Based on the `README.md` for the data, we will only keep the consumer's name, state, postcode, gender, and consumer ID as the addresses are fake.

In [26]:
# Load consumer info's
consumer_info = spark.read.csv(f"{raw_path}/tables/tbl_consumer.csv", header=True, inferSchema=True)
consumer_info.show(5, truncate=False)

# Clean the data
consumer_info = clean_consumer_details(consumer_info)
consumer_info.show(5, truncate=False)

+---------------------------------------------------------------------+
|name|address|state|postcode|gender|consumer_id                       |
+---------------------------------------------------------------------+
|Yolanda Williams|413 Haney Gardens Apt. 742|WA|6935|Female|1195503   |
|Mary Smith|3764 Amber Oval|NSW|2782|Female|179208                    |
|Jill Jones MD|40693 Henry Greens|NT|862|Female|1194530               |
|Lindsay Jimenez|00653 Davenport Crossroad|NSW|2780|Female|154128     |
|Rebecca Blanchard|9271 Michael Manors Suite 651|WA|6355|Female|712975|
+---------------------------------------------------------------------+
only showing top 5 rows

Before: 
The dataset count is  499999
After: 
The dataset count is  499999
+-----------------+-----------+------+-----+--------+
|name             |consumer_id|gender|state|postcode|
+-----------------+-----------+------+-----+--------+
|Yolanda Williams |1195503    |Female|WA   |6935    |
|Mary Smith       |179208     |Femal

We will also need to ensure that the datetime of all dataset with such column is within the specified range (labeled on the name of the intial downloaded file)

In [27]:
transaction_records = ensure_datetime_range(transaction_records, "2021-02-28", "2022-08-28")

                                                                                

Starting entries: 14195505 
Final entries: 12561377
Net change (%): 11.51 


In [28]:
merchant_fraud_rate = ensure_datetime_range(merchant_fraud_rate, "2021-02-08", "2022-08-28")

Starting entries: 114 
Final entries: 114
Net change (%): 0.0 


In [29]:
consumer_fraud_rate = ensure_datetime_range(consumer_fraud_rate, "2021-02-08", "2022-08-28")

Starting entries: 34864 
Final entries: 34864
Net change (%): 0.0 


Next, we check for any existing null values across all dataset.

In [30]:
calculate_missing_values(consumer_fraud_rate)
calculate_missing_values(merchant_fraud_rate)
calculate_missing_values(consumer_info)
calculate_missing_values(merchant_info)
calculate_missing_values(transaction_records)

+----------------------------+-------------------------------+-------------------------+
|order_datetime_missing_count|fraud_probability_missing_count|consumer_id_missing_count|
+----------------------------+-------------------------------+-------------------------+
|                           0|                              0|                        0|
+----------------------------+-------------------------------+-------------------------+

+--------------------------+----------------------------+-------------------------------+
|merchant_abn_missing_count|order_datetime_missing_count|fraud_probability_missing_count|
+--------------------------+----------------------------+-------------------------------+
|                         0|                           0|                              0|
+--------------------------+----------------------------+-------------------------------+

+------------------+-------------------------+--------------------+-------------------+----------------



+--------------------------+--------------------------+----------------------+----------------------------+-------------------------+
|merchant_abn_missing_count|dollar_value_missing_count|order_id_missing_count|order_datetime_missing_count|consumer_id_missing_count|
+--------------------------+--------------------------+----------------------+----------------------------+-------------------------+
|                         0|                         0|                     0|                           0|                        0|
+--------------------------+--------------------------+----------------------+----------------------------+-------------------------+



                                                                                

Currently, there are no mising values after we do some cleaning. We will come back to this after we merged the data together.

# Load

In [31]:
landing_directory = "../data/curated"

In [32]:
consumer_fraud_rate.write.parquet(f"{landing_directory}/consumer_fraud_prob.parquet")
merchant_fraud_rate.write.parquet(f"{landing_directory}/merchant_fraud_prob.parquet")
transaction_records.write.parquet(f"{landing_directory}/transactions.parquet")
merchant_info.write.parquet(f"{landing_directory}/merchant_info.parquet")
consumer_info.write.parquet(f"{landing_directory}/consumer_info.parquet")

                                                                                