In [30]:
# Loading library
from functools import reduce
from pyspark.sql import SparkSession, DataFrame, functions as F
from pyspark.sql.types import DoubleType, StringType, IntegerType
import zipfile
import os

# Create a Spark Session
spark = (
    SparkSession.builder.appName("ETL Pipeline")
    # Render a DataFrame as a table when it is the last expression of a cell
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the session time zone so date/time values do not depend on the host
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    # Key was previously misspelled as "spark.execturo.memory", which Spark
    # does not recognise, so the 2g executor setting was never applied.
    .config("spark.executor.memory", "2g")
    # Reuses an already-running session if one exists in this kernel
    .getOrCreate()
)

# Extract

As we can't use `urlretrieve` to get the data from Canvas, please download it to your local machine and move it to `data/tables`. Then run the code below to unzip the files.

In [None]:
# Assign data path
raw_path = "../data"

# Unpack every zip archive found in data/tables into the data directory, then
# delete the archive. Anything that is not a zip archive (.gitkeep, files that
# were already extracted, sub-directories) is skipped, so re-running this cell
# after a successful extraction is a no-op instead of raising BadZipFile.
tables_dir = f"{raw_path}/tables"
for file in os.listdir(tables_dir):
    archive_path = f"{tables_dir}/{file}"
    if not zipfile.is_zipfile(archive_path):
        continue
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(f"{raw_path}/")
    # Only remove the archive once it has been extracted without error
    os.remove(archive_path)

# Transform

The system uses `user_id` as the key for identifying customers in the transaction records and fraud probability tables. However, there is also a key-value map from `user_id` to `consumer_id`. We will use `consumer_id` as the only customer ID. Thus, we will map the `user_id` in each table to its `consumer_id` and drop the former.

In [5]:
def replace_id(map_df, target_df):
    """Re-key ``target_df`` from ``user_id`` to the columns supplied by ``map_df``.

    Args:
        map_df: frame joined onto ``target_df`` via its ``user_id`` column.
        target_df: frame whose ``user_id`` column is to be replaced.

    Returns:
        The inner join of ``target_df`` with ``map_df`` on ``user_id``, with the
        ``user_id`` column dropped. Because the join is inner, rows of
        ``target_df`` whose ``user_id`` has no match in ``map_df`` are not kept.
    """
    return target_df.join(map_df, on="user_id", how="inner").drop('user_id')

In [6]:
# Load the user_id -> consumer_id lookup that replace_id joins onto other tables
consumer_info = spark.read.parquet(
    f"{raw_path}/tables/consumer_user_details.parquet"
)

                                                                                

In [9]:
# Read the consumer fraud probabilities (header row, inferred column types)
# and swap their user_id column for consumer_id in one step
consumer_fraud_rate = replace_id(
    consumer_info,
    spark.read.csv(
        f"{raw_path}/tables/consumer_fraud_probability.csv",
        header=True,
        inferSchema=True,
    ),
)

In [15]:
# Preview two rows to confirm the frame is now keyed by consumer_id
consumer_fraud_rate.limit(2)

order_datetime,fraud_probability,consumer_id
2022-02-20,9.80543113652096,1195503
2021-08-30,9.599513915425788,179208


In [None]:
# Load transaction data for user_id replacement
# The three snapshot folders are loaded and re-keyed identically, so they are
# driven from one list instead of three copy-pasted load/replace pairs.
transaction_snapshots = [
    "transactions_20210228_20210827_snapshot",
    "transactions_20210828_20220227_snapshot",
    "transactions_20220228_20220828_snapshot",
]
transaction_p1, transaction_p2, transaction_p3 = (
    replace_id(consumer_info, spark.read.parquet(f"{raw_path}/tables/{snapshot}"))
    for snapshot in transaction_snapshots
)

# Stack the snapshots matching columns by name. unionAll (a deprecated alias of
# union) matches by position, which would silently misalign values if one
# snapshot stored its columns in a different order.
transaction_records = reduce(
    DataFrame.unionByName, [transaction_p1, transaction_p2, transaction_p3]
)
transaction_records

In [14]:
# Preview two rows of the combined transactions to check the resulting columns
transaction_records.limit(2)

merchant_abn,dollar_value,order_id,order_datetime,consumer_id
62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,651338
15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,179208


Now that `user_id` has been replaced with `consumer_id`, load the remaining data.

In [12]:
# Read the merchant fraud probabilities: first line is the header and column
# types are inferred from the data; then preview two rows
merchant_fraud_rate = spark.read.options(header=True, inferSchema=True).csv(
    f"{raw_path}/tables/merchant_fraud_probability.csv"
)
merchant_fraud_rate.limit(2)

merchant_abn,order_datetime,fraud_probability
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367


In [137]:
# Load merchant's info
merchant_info = spark.read.parquet(f"{raw_path}/tables/tbl_merchants.parquet")

# `tags` appears to be a single string of three bracketed groups (category,
# revenue level, take rate) written with either round or square brackets.
# The patterns below accept both bracket styles.
outer_bracket = r"^[\(\[]|[\)\]]$"      # one bracket at the very start or very end
group_boundary = r"[\)\]],\s*[\(\[]"    # closing bracket, comma, opening bracket
# Case-insensitive, any spacing after the colon, decimal part optional. The
# previous pattern required "digits.digits", so an integer rate such as
# "take rate: 5" extracted "" and was cast to null.
take_rate_pattern = r"(?i)take rate:\s*(\d+(?:\.\d+)?)"

merchant_info = (
    merchant_info
    # Remove the outermost bracket pair
    .withColumn("tags", F.regexp_replace("tags", outer_bracket, ""))
    # Replace the comma that separates each tuple/list with "|" so that commas
    # inside a category description are left alone
    .withColumn("tags", F.regexp_replace("tags", group_boundary, r")\|("))
    # Split on the "|" separator (raw string: "\|" is an invalid escape
    # sequence in a normal Python string literal)
    .withColumn("tags", F.split("tags", r"\|"))
    .withColumns({
        # Each item still carries its own bracket pair; strip it
        "category": F.regexp_replace(F.col("tags").getItem(0), outer_bracket, ""),
        "revenue_levels": F.regexp_replace(F.col("tags").getItem(1), outer_bracket, ""),
        "take_rate": F.regexp_extract(F.col("tags").getItem(2), take_rate_pattern, 1).cast(DoubleType()),
    })
)
merchant_info.show(truncate=False)

+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------+--------------+---------+
|name                                |tags                                                                                                             |merchant_abn|category                                                                             |revenue_levels|take_rate|
+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------+--------------+---------+
|Felis Limited                       |[(furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18)]|10023283211 |furn

In [110]:
# Load consumer's info and reformat
# The file's header is "name|address|state|postcode|gender|consumer_id", i.e.
# it is pipe-delimited. Reading it with sep="|" lets Spark do the splitting;
# the previous approach read it with the default comma separator and split the
# single resulting column by hand, which would truncate any record containing a
# comma.
# Without inferSchema every column is read as a string, so consumer_id stays a
# string and only postcode is cast. Empty fields are read as null rather than "".
# NOTE(review): this rebinds `consumer_info`, which earlier in the notebook holds
# the user_id -> consumer_id map passed to replace_id; re-running those earlier
# cells after this one would use the wrong frame. Consider distinct names.
consumer_info = spark.read.csv(
    f"{raw_path}/tables/tbl_consumer.csv", sep="|", header=True
).select(
    F.col("consumer_id"),
    F.col("name"),
    F.col("postcode").cast(IntegerType()),
    F.col("gender"),
)
consumer_info.groupBy("gender").count() # roughly equal numbers of female and male customers; only a small share did not disclose their gender

gender,count
Undisclosed,50074
Female,224946
Male,224979
