In [2]:
# Loading library
import os
os.sys.path.append("../")
from scripts.etl_pipeline import *

In [3]:
# Create a Spark Session
spark = (
    SparkSession.builder.appName("ETL Pipeline")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.execturo.memory", "2g")
    .getOrCreate()
)

24/08/29 23:45:40 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 100.92.15.134 instead (on interface en0)
24/08/29 23:45:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/29 23:45:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/29 23:45:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Extract

As we can't use `urlretrieve` to get the data from Canvas, please download it to your local machine and move it `data/tables`. Then run the code below to unzip the files

In [4]:
# Assign data path
raw_path = "../data"

In [None]:


for file in os.listdir(f"{raw_path}/tables"):
    if file == ".gitkeep":
        continue
    with zipfile.ZipFile(f"{raw_path}/tables/{file}", "r") as zip_ref:
        zip_ref.extractall(f"{raw_path}/")
    os.remove(f"{raw_path}/tables/{file}")

# Transform

The system use `user_id` as a key for identifying customer in transactions record and fraud probability tables. However, they also have a key-value map of `user_id` and `consumer_id`. We will use `consumer_id` as the only ID for customer. Thus, we will map `user_id` from each table to `consumer_id` and drop the former.


In [5]:
# Load consumer info - a key : value map for user_id to consumer_id
consumer_info = spark.read.parquet(f"{raw_path}/tables/consumer_user_details.parquet")

24/08/29 23:45:55 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Load all files that need to replace user_id
consumer_fraud_rate = spark.read.csv(f"{raw_path}/tables/consumer_fraud_probability.csv", header=True, inferSchema=True)
consumer_fraud_rate = replace_id(consumer_info, consumer_fraud_rate)

                                                                                

In [7]:
# Load transaction data for user_id replacement
transaction_p1 = spark.read.parquet(f"{raw_path}/tables/transactions_20210228_20210827_snapshot")
transaction_p1 = replace_id(consumer_info, transaction_p1)

transaction_p2 = spark.read.parquet(f"{raw_path}/tables/transactions_20210828_20220227_snapshot")
transaction_p2 = replace_id(consumer_info, transaction_p2)

transaction_p3 = spark.read.parquet(f"{raw_path}/tables/transactions_20220228_20220828_snapshot")
transaction_p3 = replace_id(consumer_info, transaction_p3)

transaction_records = reduce(DataFrame.unionAll, [transaction_p1, transaction_p2, transaction_p3])

                                                                                

Now that replacing `user_id` to `consumer_id` is done, load all other data and clean them

In [7]:
# Load merchant fraud probability
# merchant_fraud_rate = spark.read.csv(f"{raw_path}/tables/merchant_fraud_probability.csv", header=True, inferSchema=True)

# date_pattern = r"^\d{4}-\d{2}-\d{2}$"

# test = merchant_fraud_rate.withColumn("is_valid_date", F.regexp_extract(F.col("order_datetime"), date_pattern, 0))
# invalid_dates = test.filter(F.col("is_valid_date") == "")
# invalid_dates

In [8]:
merchant_fraud = spark.read.csv(f"{raw_path}/tables/merchant_fraud_probability.csv", header=True, inferSchema=True)
print(f'Total number of merchants: {transaction_records.select("merchant_abn").distinct().count()}')
print(f'Total number of merchants with fraudulent probability: {merchant_fraud.select("merchant_abn").distinct().count()}')

                                                                                

Total number of merchants: 4422
Total number of merchants with fraudulent probability: 61


Cleaning `tbl_merchants.parquet`. The feature `tags` is a string that represent either a tuple or a list, containing 3 elements:
* Items that are being sold
* Revenue levels
* Commission rate
Each elements either a list, a tuple, or a combination of both (e.g starts with `[` and ends with `)` and vice versa). These inconsistencies are mostly due to human errors. Thus, we need to take into account these consistent when splitting the values of the feature `tags` into separate columns

In [9]:
# Load merchant's info and clean it
merchant_info = spark.read.parquet(f"{raw_path}/tables/tbl_merchants.parquet")
merchant_info = load_merchant_details(merchant_info)
merchant_info.printSchema()
merchant_info.limit(2)
# merchant_info.groupBy(F.col("revenue_level")).agg(F.avg(F.col("take_rate"))) # average commission rate of each revenue level

root
 |-- name: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- category: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



name,merchant_abn,category,revenue_level,take_rate
Felis Limited,10023283211,"furniture, home f...",e,0.18
Arcu Ac Orci Corp...,10142254217,"cable, satellite,...",b,4.22


In [10]:
# Load consumer's info and reformat
consumer_info = spark.read.csv(f"{raw_path}/tables/tbl_consumer.csv", header=True, inferSchema=True)
consumer_info = load_consumer_details(consumer_info)

# consumer_info.groupBy("gender").count() # relatively same proportion of female and male customer, only a small percentage of did not provide their gender
consumer_info

                                                                                

name,consumer_id,gender,state,postcode
Yolanda Williams,1195503,Female,WA,6935
Mary Smith,179208,Female,NSW,2782
Jill Jones MD,1194530,Female,NT,862
Lindsay Jimenez,154128,Female,NSW,2780
Rebecca Blanchard,712975,Female,WA,6355
Karen Chapman,407340,Female,NSW,2033
Andrea Jones,511685,Female,QLD,4606
Stephen Williams,448088,Male,WA,6056
Stephanie Reyes,650435,Female,NSW,2482
Jillian Gonzales,1058499,Female,VIC,3220


In [21]:
# Prepare and load data for geosparial analysis
# Skip if the file exists
if not os.path.isfile("../data/curated/consumer_info.parquet"):
    consumer_info.write.parquet('../data/curated/consumer_info.parquet')


if not os.path.isfile("../data/curated/consumer_fraud_rate.parquet"):
    consumer_fraud_rate.write.parquet('../data/curated/consumer_fraud_rate.parquet')


AnalysisException: [PATH_ALREADY_EXISTS] Path file:/Users/qinsitao/Documents/GitHub/project-2-group-buy-now-pay-later-industry-project-9/data/curated/consumer_info.parquet already exists. Set mode as "overwrite" to overwrite the existing path.