In [51]:
# Loading library
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
import zipfile
import os

# Create a Spark Session
# Settings: eager evaluation so a bare DataFrame expression renders as a table
# in the notebook, parquet metadata caching, a fixed UTC session time zone,
# and explicit driver/executor memory sizes.
spark = (
    SparkSession.builder.appName("ETL Pipeline")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    # Key was misspelled "spark.execturo.memory", which Spark accepts as an
    # unknown key and ignores, so the executor memory was never set.
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Extract

As we can't use `urlretrieve` to get the data from Canvas, please download it to your local machine and move it to `data/tables`. Then run the code below to unzip the files.

In [22]:
# Assign data path
raw_path = "../data"

# Unpack every downloaded archive found in data/tables into the data
# directory, then delete the archive so it is not extracted twice.
# Only *.zip entries are processed: .gitkeep and anything else already in the
# folder (presumably the extracted tables, since later cells read them from
# here) is skipped, so re-running this cell does not fail with BadZipFile.
tables_dir = f"{raw_path}/tables"
for file in os.listdir(tables_dir):
    if not file.lower().endswith(".zip"):
        continue
    archive_path = f"{tables_dir}/{file}"
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(f"{raw_path}/")
    os.remove(archive_path)

project-2-bnpl-tables-part1.zip
project-2-bnpl-tables-part2.zip
project-2-bnpl-tables-part3.zip
project-2-bnpl-tables-part4.zip


# Transform

The system uses `user_id` as the key for identifying customers in the transaction records and fraud probability tables. However, there is also a key-value map from `user_id` to `consumer_id`. We will use `consumer_id` as the only customer ID, so we map `user_id` in each table to `consumer_id` and drop the former.

In [33]:
def replace_id(map_df, target_df):
    """Attach the columns of ``map_df`` to ``target_df`` and drop ``user_id``.

    ``target_df`` is inner-joined to ``map_df`` on the ``user_id`` column, and
    ``user_id`` is then removed from the joined result.

    Args:
        map_df: DataFrame with a ``user_id`` column plus the replacement ID
            column(s).
        target_df: DataFrame with a ``user_id`` column to be replaced.

    Returns:
        The joined DataFrame without ``user_id``. Because the join is an inner
        join, rows whose ``user_id`` has no match on the other side are absent.
    """
    return target_df.join(map_df, how="inner", on="user_id").drop("user_id")

In [27]:
# Lookup table pairing each user_id with its consumer_id
consumer_info_path = f"{raw_path}/tables/consumer_user_details.parquet"
consumer_info = spark.read.parquet(consumer_info_path)

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [50]:
# Read the consumer fraud probability table, then swap user_id for consumer_id
consumer_fraud_raw = spark.read.csv(
    f"{raw_path}/tables/consumer_fraud_probability.csv",
    header=True,
    inferSchema=True,
)
consumer_fraud = replace_id(consumer_info, consumer_fraud_raw)

In [55]:
# Read the three transaction snapshots in date order, map user_id to
# consumer_id in each, then stack them into a single DataFrame.
snapshot_paths = [
    f"{raw_path}/tables/transactions_20210228_20210827_snapshot",
    f"{raw_path}/tables/transactions_20210828_20220227_snapshot",
    f"{raw_path}/tables/transactions_20220228_20220828_snapshot",
]
transaction_p1, transaction_p2, transaction_p3 = [
    replace_id(consumer_info, spark.read.parquet(snapshot_path))
    for snapshot_path in snapshot_paths
]

# unionAll matches columns by position, so all three parts must share a schema
transaction_records = reduce(
    DataFrame.unionAll,
    [transaction_p1, transaction_p2, transaction_p3],
)
transaction_records

merchant_abn,dollar_value,order_id,order_datetime,consumer_id
62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,651338
15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,179208
64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,467663
60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,1194530
94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,467663
76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20,1194530
67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20,467663
34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20,1194530
70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20,918448
49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20,154128


In [54]:
# Preview the second transaction snapshot.
# NOTE(review): the recorded output still shows a user_id column and this cell
# has a lower execution count than the loading cell, so it was last run before
# replace_id was applied; a top-to-bottom re-run will show consumer_id instead.
transaction_p2

user_id,merchant_abn,dollar_value,order_id,order_datetime
14935,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26
1,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26
14936,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26
1,49167531725,51.58228625503599,7080c274-17f7-4cc...,2021-11-26
14936,31101120643,25.2281149424178,8e301c0f-06ab-45c...,2021-11-26
2,67978471888,691.5028234458998,0380e9ad-b0e8-420...,2021-11-26
14936,60956456424,102.13952056640888,5ac3da9c-5147-452...,2021-11-26
2,47644196714,644.5220654863093,4e368e44-86f8-4de...,2021-11-26
14938,39649557865,209.12780951421405,4d78cd01-4bab-494...,2021-11-26
3,88402174457,141.0387993699113,c50c957d-ecfc-430...,2021-11-26
