In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = (
    SparkSession.builder.appName("ADS project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [28]:
import os

data_loc = "../data/tables/"
consumer = spark.read.parquet(data_loc+"consumer_user_details.parquet")
tbl_consumer = spark.read.csv(data_loc+"tbl_consumer.csv", sep="|", header=True)
tbl_merchant = spark.read.parquet(data_loc+"tbl_merchants.parquet")
transactions = [data_loc + i + "/" for i in os.listdir(data_loc) if "transactions" in i]
print(transactions)
transact = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot/')
transact = transact.union(spark.read.parquet('../data/tables/transactions_20220228_20220828_snapshot/'))
transact = transact.union(spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot/'))

['../data/tables/transactions_20210228_20210827_snapshot/', '../data/tables/transactions_20220228_20220828_snapshot/', '../data/tables/transactions_20210828_20220227_snapshot/']


                                                                                

<h2> 1. Standardize column format of tables</h2>

<h3>1.1 "tbl_merchants.parquet"</h3>

In [29]:
tbl_merchant.show(5)

+--------------------+--------------------+------------+
|                name|                tags|merchant_abn|
+--------------------+--------------------+------------+
|       Felis Limited|((furniture, home...| 10023283211|
|Arcu Ac Orci Corp...|([cable, satellit...| 10142254217|
|    Nunc Sed Company|([jewelry, watch,...| 10165489824|
|Ultricies Digniss...|([wAtch, clock, a...| 10187291046|
| Enim Condimentum PC|([music shops - m...| 10192359162|
+--------------------+--------------------+------------+
only showing top 5 rows



In [30]:
# separate "tags" into "products", "revenue_level", "take_rate"
preprocessed_tbl_merchant = tbl_merchant.withColumn("split_tags", split(col("tags"), ",")) \
            .withColumn("take_rate", element_at(col("split_tags"), -1)) \
            .withColumn("revenue_level", element_at(col("split_tags"), -2)) \
            .withColumn("products", slice(reverse(col("split_tags")), 3, 100)) \
            .withColumn("products", reverse(col("products")))

In [31]:
# remove the special characters of "revenue_level" and "take_rate"
preprocessed_tbl_merchant = preprocessed_tbl_merchant.withColumn("revenue_level", regexp_replace("revenue_level", "[^a-e]", "")) \
                                                     .withColumn("take_rate", regexp_replace("take_rate", "[^0-9.]", ""))

In [32]:
# standardize the values in "products" and vectorize "products" into the bags of word
preprocessed_tbl_merchant = preprocessed_tbl_merchant.withColumn("products", concat_ws(",", col("products"))) \
                                                     .withColumn("products", regexp_replace("products", "[^A-Za-z0-9]", " ")) \
                                                     .withColumn("products", regexp_replace("products", "\s+", " ")) \
                                                     .withColumn("products", regexp_replace("products", "(^\s+)|(\s+$)", "")) \
                                                     .withColumn("products", lower(col("products"))) \
                                                    #  .withColumn("products", split(col("products"), " "))

In [33]:
preprocessed_tbl_merchant = preprocessed_tbl_merchant.withColumnRenamed("name", "merchant_name")
preprocessed_tbl_merchant = preprocessed_tbl_merchant.select("merchant_name", "products", "revenue_level",
                                                             preprocessed_tbl_merchant.take_rate.cast("double"), "merchant_abn")

In [34]:
preprocessed_tbl_merchant.show(5)

+--------------------+--------------------+-------------+---------+------------+
|       merchant_name|            products|revenue_level|take_rate|merchant_abn|
+--------------------+--------------------+-------------+---------+------------+
|       Felis Limited|furniture home fu...|            e|     0.18| 10023283211|
|Arcu Ac Orci Corp...|cable satellite a...|            b|     4.22| 10142254217|
|    Nunc Sed Company|jewelry watch clo...|            b|      4.4| 10165489824|
|Ultricies Digniss...|watch clock and j...|            b|     3.29| 10187291046|
| Enim Condimentum PC|music shops music...|            a|     6.33| 10192359162|
+--------------------+--------------------+-------------+---------+------------+
only showing top 5 rows



In [35]:
tags = preprocessed_tbl_merchant.rdd.map(lambda x: x[1]).collect()
tags_unique = " ".join(tags).split()
tags_unique = list(set(tags_unique))
print(len(tags_unique)) # not too many, can check some by hand, e.g. 'except' is the only word that has negative meaning
tags_unique

95


['dealers',
 'cable',
 'systems',
 'supplies',
 'service',
 'books',
 'nursery',
 'souvenir',
 'spas',
 'sales',
 'al',
 'computers',
 'shoe',
 'supply',
 'leasing',
 'eyeglasses',
 'services',
 'satellite',
 'newspapers',
 'antique',
 'restoration',
 'novelty',
 'software',
 'goods',
 'other',
 'tent',
 'pay',
 'appliance',
 'periodicals',
 'new',
 'opticians',
 'rent',
 'digital',
 'watch',
 'toy',
 'manufacturers',
 'garden',
 'florists',
 'motor',
 'telecom',
 'optical',
 'programming',
 'music',
 'design',
 'jewelry',
 'awning',
 'galleries',
 'including',
 'gift',
 'stock',
 'beauty',
 'musical',
 'processing',
 'movies',
 'repair',
 'lawn',
 'computer',
 'printing',
 'parts',
 'craft',
 'furnishings',
 'sheet',
 'repairs',
 'tool',
 'equipment',
 'radio',
 'stationery',
 'vehicle',
 'appliances',
 'peripheral',
 'game',
 'home',
 'furniture',
 'silverware',
 'television',
 'paper',
 'nurseries',
 'flowers',
 'artist',
 'bicycle',
 'health',
 'hobby',
 'instruments',
 'and',
 'of

In [36]:
tags_tok = tags
for i in range(len(tags)):
    tags_tok[i] = tags_tok[i].split()
    cur = [tags_tok[i][0]]
    for j in range(1, len(tags_tok[i])):
        if tags_tok[i][j-1] != "except":
            if tags_tok[i][j] not in ["and", "except", "other", "shops", "services"]:
                cur += [tags_tok[i][j]]
    tags_tok[i] = cur
# tags_tok

In [37]:
tags_tok_unique = []
for i in tags_tok:
    if i not in tags_tok_unique:
        tags_tok_unique += [i]
print(len(tags_tok_unique)) # only 25 unique ones, can categorise manually
tags_tok_unique

25


[['furniture', 'home', 'furnishings', 'equipment', 'manufacturers'],
 ['cable', 'satellite', 'pay', 'television', 'radio'],
 ['jewelry', 'watch', 'clock', 'silverware'],
 ['watch', 'clock', 'jewelry', 'repair'],
 ['music', 'musical', 'instruments', 'pianos', 'sheet', 'music'],
 ['gift', 'card', 'novelty', 'souvenir'],
 ['computers', 'computer', 'peripheral', 'equipment', 'software'],
 ['computer',
  'programming',
  'data',
  'processing',
  'integrated',
  'systems',
  'design'],
 ['equipment', 'tool', 'furniture', 'appliance', 'rent', 'al', 'leasing'],
 ['artist', 'supply', 'craft'],
 ['florists', 'supplies', 'nursery', 'stock', 'flowers'],
 ['antique', 'sales', 'repairs', 'restoration'],
 ['motor', 'vehicle', 'supplies', 'new', 'parts'],
 ['books', 'periodicals', 'newspapers'],
 ['stationery', 'office', 'supplies', 'printing', 'writing', 'paper'],
 ['tent', 'awning'],
 ['art', 'dealers', 'galleries'],
 ['bicycle', 'sales', 'service'],
 ['digital', 'goods', 'books', 'movies', 'music'

In [38]:
import collections

categories = [
    'home and technology', 'home and technology', 'fashion and accessories', 'fashion and accessories', 'books and music',
    'art and gifts', 'home and technology', 'home and technology', 'home and technology', 'art and gifts',
    'outdoors', 'art and gifts', 'outdoors', 'books and music', 'books and music',
    'outdoors', 'art and gifts', 'outdoors', 'books and music', 'fashion and accessories',
    'fashion and accessories', 'fashion and accessories', 'books and music', 'outdoors', 'home and technology'
]

tag_to_cat = {" ".join(tags_tok_unique[i]): categories[i] for i in range(len(categories))}
cat_list = [tag_to_cat[" ".join(i)] for i in tags_tok]

collections.Counter([i for i in cat_list]).most_common()

[('home and technology', 988),
 ('outdoors', 832),
 ('books and music', 829),
 ('fashion and accessories', 761),
 ('art and gifts', 616)]

In [39]:
from pyspark.sql.types import *

@udf(ArrayType(StringType()))
def tokenise(tag):
    tag = tag.split()
    cur = [tag[0]]
    for i in range(1, len(tag)):
        if tag[i-1] != "except":
            if tag[i] not in ["and", "except", "other", "shops", "services"]:
                cur += [tag[i]]
    return cur

@udf(StringType())
def categorise(tag):
    tag = " ".join(tag)
    return tag_to_cat[tag]

In [40]:
preprocessed_tbl_merchant_token = preprocessed_tbl_merchant.withColumn("tag", tokenise(col("products")))
preprocessed_tbl_merchant_cat = preprocessed_tbl_merchant_token.withColumn("category", categorise(col("tag")))
preprocessed_tbl_merchant_cat = preprocessed_tbl_merchant_cat.select(*(preprocessed_tbl_merchant_cat.columns[:5]),
                                                                concat_ws(' ', 'tag').alias('tag'), "category")
preprocessed_tbl_merchant_cat

merchant_name,products,revenue_level,take_rate,merchant_abn,tag,category
Felis Limited,furniture home fu...,e,0.18,10023283211,furniture home fu...,home and technology
Arcu Ac Orci Corp...,cable satellite a...,b,4.22,10142254217,cable satellite p...,home and technology
Nunc Sed Company,jewelry watch clo...,b,4.4,10165489824,jewelry watch clo...,fashion and acces...
Ultricies Digniss...,watch clock and j...,b,3.29,10187291046,watch clock jewel...,fashion and acces...
Enim Condimentum PC,music shops music...,a,6.33,10192359162,music musical ins...,books and music
Fusce Company,gift card novelty...,a,6.34,10206519221,gift card novelty...,art and gifts
Aliquam Enim Inco...,computers compute...,b,4.32,10255988167,computers compute...,home and technology
Ipsum Primis Ltd,watch clock and j...,c,2.39,10264435225,watch clock jewel...,fashion and acces...
Pede Ultrices Ind...,computer programm...,a,5.71,10279061213,computer programm...,home and technology
Nunc Inc.,furniture home fu...,a,6.61,10323485998,furniture home fu...,home and technology


<h3>1.2 "transactions_20210228_20210827_snapshot"</h3>

In [41]:
transact.show(5)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|
|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|
|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|
|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|
|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



In [42]:
# separate pickup datetime into date, year, month and day
preprocessed_transact = transact.withColumn('order_year', year(col('order_datetime')))
preprocessed_transact = preprocessed_transact.withColumn('order_month', month(col('order_datetime')))
preprocessed_transact = preprocessed_transact.withColumn('order_day', dayofmonth(col('order_datetime')))
preprocessed_transact = preprocessed_transact.withColumnRenamed('merchant_abn', 'merchant_abn_repeat')

<h3>1.3 "tbl_consumer.csv"</h3>

In [43]:
tbl_consumer.show(5)

+-----------------+--------------------+-----+--------+------+-----------+
|             name|             address|state|postcode|gender|consumer_id|
+-----------------+--------------------+-----+--------+------+-----------+
| Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|    1195503|
|       Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|     179208|
|    Jill Jones MD|  40693 Henry Greens|   NT|     862|Female|    1194530|
|  Lindsay Jimenez|00653 Davenport C...|  NSW|    2780|Female|     154128|
|Rebecca Blanchard|9271 Michael Mano...|   WA|    6355|Female|     712975|
+-----------------+--------------------+-----+--------+------+-----------+
only showing top 5 rows



In [44]:
preprocessed_tbl_consumer = tbl_consumer.withColumnRenamed("name", "consumer")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("address", "consumer_address")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("state", "consumer_state")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("postcode", "consumer_postcode")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("gender", "consumer_gender")

In [45]:
preprocessed_tbl_consumer.show(5)

+-----------------+--------------------+--------------+-----------------+---------------+-----------+
|         consumer|    consumer_address|consumer_state|consumer_postcode|consumer_gender|consumer_id|
+-----------------+--------------------+--------------+-----------------+---------------+-----------+
| Yolanda Williams|413 Haney Gardens...|            WA|             6935|         Female|    1195503|
|       Mary Smith|     3764 Amber Oval|           NSW|             2782|         Female|     179208|
|    Jill Jones MD|  40693 Henry Greens|            NT|              862|         Female|    1194530|
|  Lindsay Jimenez|00653 Davenport C...|           NSW|             2780|         Female|     154128|
|Rebecca Blanchard|9271 Michael Mano...|            WA|             6355|         Female|     712975|
+-----------------+--------------------+--------------+-----------------+---------------+-----------+
only showing top 5 rows



<h3>1.4 "consumer.parquet"</h3>

In [46]:
consumer.show(5)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
|      3|    1194530|
|      4|     154128|
|      5|     712975|
+-------+-----------+
only showing top 5 rows



In [47]:
preprocessed_consumer = consumer.withColumnRenamed("user_id", "user_id_repeat")
preprocessed_consumer = preprocessed_consumer.withColumnRenamed("consumer_id", "consumer_id_repeat")

In [48]:
preprocessed_consumer.show(5)

+--------------+------------------+
|user_id_repeat|consumer_id_repeat|
+--------------+------------------+
|             1|           1195503|
|             2|            179208|
|             3|           1194530|
|             4|            154128|
|             5|            712975|
+--------------+------------------+
only showing top 5 rows



<h2>2. Join tables</h2>

In [49]:
preprocessed_tbl_merchant_cat.createOrReplaceTempView('merchant')
preprocessed_transact.createOrReplaceTempView('transact')
preprocessed_tbl_consumer.createOrReplaceTempView('tbl_consumer')
preprocessed_consumer.createOrReplaceTempView('consumer')

join = spark.sql("""
SELECT 
    *
FROM 
    merchant
INNER JOIN
    transact
ON 
    transact.merchant_abn_repeat = merchant.merchant_abn
INNER JOIN
    consumer
ON
    transact.user_id = consumer.user_id_repeat
INNER JOIN
    tbl_consumer
ON 
    consumer.consumer_id_repeat = tbl_consumer.consumer_id
ORDER BY
    revenue_level DESC
""")

join = join.drop("merchant_abn_repeat", "consumer_id_repeat", "user_id_repeat", "merchant_abn", "consumer_id", "order_id")
print(f"Mismatched (null) transactions dropped: {preprocessed_transact.count() - join.count()}")



Mismatched (null) transactions dropped: 580830


                                                                                

In [50]:
join

                                                                                

merchant_name,products,revenue_level,take_rate,tag,category,user_id,dollar_value,order_datetime,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender
Nullam Vitae Diam PC,opticians optical...,e,0.22,opticians optical...,fashion and acces...,23679,72.77954022414646,2021-09-13,2021,9,13,Roberto Robbins,3146 Eric Turnpik...,NSW,2396,Male
Et Nunc Consulting,books periodicals...,e,0.16,books periodicals...,books and music,13065,481.02863662953126,2021-12-18,2021,12,18,Mark Mercer,74588 Erica Roads,VIC,3873,Male
Augue Eu Tempor A...,cable satellite a...,e,0.38,cable satellite p...,home and technology,6305,36.24855550060353,2022-10-06,2022,10,6,Allison Stevens,60866 Miller Avenue,NSW,1193,Female
Et Nunc Consulting,books periodicals...,e,0.16,books periodicals...,books and music,3698,263.7003863435179,2022-03-27,2022,3,27,Christopher Rodri...,30554 Evans Strea...,NSW,2299,Male
Et Nunc Consulting,books periodicals...,e,0.16,books periodicals...,books and music,6305,147.74061765538696,2021-12-09,2021,12,9,Allison Stevens,60866 Miller Avenue,NSW,1193,Female
Magna Sed Institute,cable satellite a...,e,0.31,cable satellite p...,home and technology,832,9.737964095722326,2021-05-10,2021,5,10,Brian Barnett,989 Robinson Street,NSW,2323,Male
Dis Parturient Co...,gift card novelty...,e,0.24,gift card novelty...,art and gifts,6305,140.36239513919142,2021-11-08,2021,11,8,Allison Stevens,60866 Miller Avenue,NSW,1193,Female
Nunc Est LLP,watch clock and j...,e,0.22,watch clock jewel...,fashion and acces...,23679,186.91415986083496,2021-05-27,2021,5,27,Roberto Robbins,3146 Eric Turnpik...,NSW,2396,Male
Ante Industries,motor vehicle sup...,e,0.35,motor vehicle sup...,outdoors,2199,86.65631054124275,2022-09-21,2022,9,21,Kelly Krueger,039 Green Route,WA,6935,Undisclosed
Metus Sit Amet In...,cable satellite a...,e,0.38,cable satellite p...,home and technology,23679,20.49850981713674,2021-07-04,2021,7,4,Roberto Robbins,3146 Eric Turnpik...,NSW,2396,Male


In [53]:
join.write.parquet("../data/curated/raw_join_internal_table")

                                                                                