In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = (
    SparkSession.builder.appName("ADS project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/21 21:11:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/21 21:11:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/21 21:11:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
import os

data_loc = "../data/tables/"
consumer = spark.read.parquet(data_loc+"consumer_user_details.parquet")
tbl_consumer = spark.read.csv(data_loc+"tbl_consumer.csv", sep="|", header=True)
tbl_merchant = spark.read.parquet(data_loc+"tbl_merchants.parquet")
transactions = [data_loc + i + "/" for i in os.listdir(data_loc) if "transactions" in i]
print(transactions)
transact = spark.read.parquet('../data/tables/transactions_20210228_20210827_snapshot/')
transact = transact.union(spark.read.parquet('../data/tables/transactions_20220228_20220828_snapshot/'))
transact = transact.union(spark.read.parquet('../data/tables/transactions_20210828_20220227_snapshot/'))

                                                                                

['../data/tables/transactions_20210228_20210827_snapshot/', '../data/tables/transactions_20220228_20220828_snapshot/', '../data/tables/transactions_20210828_20220227_snapshot/']


                                                                                

<h2> 1. Standardize column format of tables</h2>

<h3>1.1 "tbl_merchants.parquet"</h3>

In [3]:
tbl_merchant.show(5)

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+--------------------+------------+
|                name|                tags|merchant_abn|
+--------------------+--------------------+------------+
|       Felis Limited|((furniture, home...| 10023283211|
|Arcu Ac Orci Corp...|([cable, satellit...| 10142254217|
|    Nunc Sed Company|([jewelry, watch,...| 10165489824|
|Ultricies Digniss...|([wAtch, clock, a...| 10187291046|
| Enim Condimentum PC|([music shops - m...| 10192359162|
+--------------------+--------------------+------------+
only showing top 5 rows



                                                                                

In [4]:
# separate "tags" into "products", "revenue_level", "take_rate"
preprocessed_tbl_merchant = tbl_merchant.withColumn("split_tags", split(col("tags"), ",")) \
            .withColumn("take_rate", element_at(col("split_tags"), -1)) \
            .withColumn("revenue_level", element_at(col("split_tags"), -2)) \
            .withColumn("products", slice(reverse(col("split_tags")), 3, 100)) \
            .withColumn("products", reverse(col("products")))


In [5]:
# remove the special characters of "revenue_level" and "take_rate"
preprocessed_tbl_merchant = preprocessed_tbl_merchant.withColumn("revenue_level", regexp_replace("revenue_level", "[^a-e]", "")) \
                                                     .withColumn("take_rate", regexp_replace("take_rate", "[^0-9.]", ""))

In [6]:
# standardize the values in "products" and vectorize "products" into the bags of word
preprocessed_tbl_merchant = preprocessed_tbl_merchant.withColumn("products", concat_ws(",", col("products"))) \
                                                     .withColumn("products", regexp_replace("products", "[^A-Za-z0-9]", " ")) \
                                                     .withColumn("products", regexp_replace("products", "\s+", " ")) \
                                                     .withColumn("products", regexp_replace("products", "(^\s+)|(\s+$)", "")) \
                                                     .withColumn("products", lower(col("products"))) \
                                                    #  .withColumn("products", split(col("products"), " "))

In [7]:
preprocessed_tbl_merchant = preprocessed_tbl_merchant.withColumnRenamed("name", "merchant_name")
preprocessed_tbl_merchant = preprocessed_tbl_merchant.select("merchant_name", "products", "revenue_level",
                                                             preprocessed_tbl_merchant.take_rate.cast("double"), "merchant_abn")

In [8]:
preprocessed_tbl_merchant.show(5)

+--------------------+--------------------+-------------+---------+------------+
|       merchant_name|            products|revenue_level|take_rate|merchant_abn|
+--------------------+--------------------+-------------+---------+------------+
|       Felis Limited|furniture home fu...|            e|     0.18| 10023283211|
|Arcu Ac Orci Corp...|cable satellite a...|            b|     4.22| 10142254217|
|    Nunc Sed Company|jewelry watch clo...|            b|      4.4| 10165489824|
|Ultricies Digniss...|watch clock and j...|            b|     3.29| 10187291046|
| Enim Condimentum PC|music shops music...|            a|     6.33| 10192359162|
+--------------------+--------------------+-------------+---------+------------+
only showing top 5 rows



In [9]:
tags = preprocessed_tbl_merchant.rdd.map(lambda x: x[1]).collect()
tags_unique = " ".join(tags).split()
tags_unique = list(set(tags_unique))
print(len(tags_unique)) # not too many, can check some by hand, e.g. 'except' is the only word that has negative meaning
tags_unique

[Stage 11:>                                                         (0 + 1) / 1]

95


                                                                                

['clock',
 'outlets',
 'motor',
 'musical',
 'lawn',
 'computers',
 'appliances',
 'and',
 'al',
 'systems',
 'repairs',
 'galleries',
 'office',
 'shops',
 'except',
 'services',
 'instruments',
 'novelty',
 'leasing',
 'hobby',
 'software',
 'nurseries',
 'newspapers',
 'craft',
 'shoe',
 'cable',
 'new',
 'toy',
 'tool',
 'garden',
 'music',
 'integrated',
 'printing',
 'periodicals',
 'computer',
 'health',
 'furnishings',
 'supplies',
 'optical',
 'spas',
 'awning',
 'restoration',
 'florists',
 'dealers',
 'equipment',
 'peripheral',
 'sheet',
 'jewelry',
 'radio',
 'vehicle',
 'including',
 'design',
 'watch',
 'flowers',
 'silverware',
 'gift',
 'rent',
 'pianos',
 'satellite',
 'opticians',
 'books',
 'parts',
 'home',
 'movies',
 'card',
 'artist',
 'art',
 'programming',
 'goods',
 'eyeglasses',
 'souvenir',
 'television',
 'telecom',
 'pay',
 'beauty',
 'game',
 'service',
 'sales',
 'paper',
 'bicycle',
 'data',
 'other',
 'writing',
 'antique',
 'digital',
 'tent',
 'appl

In [10]:
for i in range(len(tags)):
    tags[i] = tags[i].split()
    cur = [tags[i][0]]
    for j in range(1, len(tags[i])):
        if tags[i][j-1] != "except":
            if tags[i][j] != "and" and tags[i][j] != "except" and tags[i][j] != "other" and tags[i][j] != "shops":
                cur += [tags[i][j]]
    tags[i] = cur
# tags

In [11]:
from pyspark.sql.types import *

@udf(ArrayType(StringType()))
def tokenise(tag):
    tag = tag.split()
    cur = [tag[0]]
    for i in range(1, len(tag)):
        if tag[i-1] != "except":
            if tag[i] not in ["and", "except", "other", "shops"]:
                cur += [tag[i]]
    return cur

In [12]:
preprocessed_tbl_merchant_token = preprocessed_tbl_merchant.withColumn("tag", tokenise(col("products")))
preprocessed_tbl_merchant_token

                                                                                

merchant_name,products,revenue_level,take_rate,merchant_abn,tag
Felis Limited,furniture home fu...,e,0.18,10023283211,"[furniture, home,..."
Arcu Ac Orci Corp...,cable satellite a...,b,4.22,10142254217,"[cable, satellite..."
Nunc Sed Company,jewelry watch clo...,b,4.4,10165489824,"[jewelry, watch, ..."
Ultricies Digniss...,watch clock and j...,b,3.29,10187291046,"[watch, clock, je..."
Enim Condimentum PC,music shops music...,a,6.33,10192359162,"[music, musical, ..."
Fusce Company,gift card novelty...,a,6.34,10206519221,"[gift, card, nove..."
Aliquam Enim Inco...,computers compute...,b,4.32,10255988167,"[computers, compu..."
Ipsum Primis Ltd,watch clock and j...,c,2.39,10264435225,"[watch, clock, je..."
Pede Ultrices Ind...,computer programm...,a,5.71,10279061213,"[computer, progra..."
Nunc Inc.,furniture home fu...,a,6.61,10323485998,"[furniture, home,..."


In [13]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

hashingTF = HashingTF(inputCol="tag", outputCol="features", numFeatures=20)
preprocessed_tbl_merchant_bow = hashingTF.transform(preprocessed_tbl_merchant_token)
preprocessed_tbl_merchant_bow

merchant_name,products,revenue_level,take_rate,merchant_abn,tag,features
Felis Limited,furniture home fu...,e,0.18,10023283211,"[furniture, home,...","(20,[3,5,6,12,13]..."
Arcu Ac Orci Corp...,cable satellite a...,b,4.22,10142254217,"[cable, satellite...","(20,[6,7,13,16,17..."
Nunc Sed Company,jewelry watch clo...,b,4.4,10165489824,"[jewelry, watch, ...","(20,[8,9,17,18],[..."
Ultricies Digniss...,watch clock and j...,b,3.29,10187291046,"[watch, clock, je...","(20,[2,8,17,18],[..."
Enim Condimentum PC,music shops music...,a,6.33,10192359162,"[music, musical, ...","(20,[3,5,6,10,18]..."
Fusce Company,gift card novelty...,a,6.34,10206519221,"[gift, card, nove...","(20,[4,14,18],[1...."
Aliquam Enim Inco...,computers compute...,b,4.32,10255988167,"[computers, compu...","(20,[3,4,6,11,13]..."
Ipsum Primis Ltd,watch clock and j...,c,2.39,10264435225,"[watch, clock, je...","(20,[2,8,17,18],[..."
Pede Ultrices Ind...,computer programm...,a,5.71,10279061213,"[computer, progra...","(20,[1,2,6,9,11,1..."
Nunc Inc.,furniture home fu...,a,6.61,10323485998,"[furniture, home,...","(20,[3,5,6,12,13]..."


In [14]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=5)
model = kmeans.fit(preprocessed_tbl_merchant_bow.select("features"))
preprocessed_tbl_merchant_cat = model.transform(preprocessed_tbl_merchant_bow)
preprocessed_tbl_merchant_cat = preprocessed_tbl_merchant_cat.select("merchant_name", "products", "revenue_level", "take_rate", "merchant_abn", col("prediction").alias("category"))
preprocessed_tbl_merchant_cat

                                                                                

merchant_name,products,revenue_level,take_rate,merchant_abn,category
Felis Limited,furniture home fu...,e,0.18,10023283211,3
Arcu Ac Orci Corp...,cable satellite a...,b,4.22,10142254217,2
Nunc Sed Company,jewelry watch clo...,b,4.4,10165489824,4
Ultricies Digniss...,watch clock and j...,b,3.29,10187291046,1
Enim Condimentum PC,music shops music...,a,6.33,10192359162,1
Fusce Company,gift card novelty...,a,6.34,10206519221,1
Aliquam Enim Inco...,computers compute...,b,4.32,10255988167,3
Ipsum Primis Ltd,watch clock and j...,c,2.39,10264435225,1
Pede Ultrices Ind...,computer programm...,a,5.71,10279061213,0
Nunc Inc.,furniture home fu...,a,6.61,10323485998,3


<h3>1.2 "transactions_20210228_20210827_snapshot"</h3>

In [15]:
transact.show(5)

+-------+------------+------------------+--------------------+--------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|
+-------+------------+------------------+--------------------+--------------+
|  18478| 62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|
|      2| 15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|
|  18479| 64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|
|      3| 60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|
|  18479| 94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|
+-------+------------+------------------+--------------------+--------------+
only showing top 5 rows



In [16]:
# separate pickup datetime into date, year, month and day
preprocessed_transact = transact.withColumn('order_year', year(col('order_datetime')))
preprocessed_transact = preprocessed_transact.withColumn('order_month', month(col('order_datetime')))
preprocessed_transact = preprocessed_transact.withColumn('order_day', dayofmonth(col('order_datetime')))
preprocessed_transact = preprocessed_transact.withColumnRenamed('merchant_abn', 'merchant_abn_repeat')

In [17]:
preprocessed_transact.show(5)

+-------+-------------------+------------------+--------------------+--------------+----------+-----------+---------+
|user_id|merchant_abn_repeat|      dollar_value|            order_id|order_datetime|order_year|order_month|order_day|
+-------+-------------------+------------------+--------------------+--------------+----------+-----------+---------+
|  18478|        62191208634|63.255848959735246|949a63c8-29f7-4ab...|    2021-08-20|      2021|          8|       20|
|      2|        15549624934| 130.3505283105634|6a84c3cf-612a-457...|    2021-08-20|      2021|          8|       20|
|  18479|        64403598239|120.15860593212783|b10dcc33-e53f-425...|    2021-08-20|      2021|          8|       20|
|      3|        60956456424| 136.6785200286976|0f09c5a5-784e-447...|    2021-08-20|      2021|          8|       20|
|  18479|        94493496784| 72.96316578355305|f6c78c1a-4600-4c5...|    2021-08-20|      2021|          8|       20|
+-------+-------------------+------------------+--------

<h3>1.3 "tbl_consumer.csv"</h3>

In [18]:
tbl_consumer.show(5)

+-----------------+--------------------+-----+--------+------+-----------+
|             name|             address|state|postcode|gender|consumer_id|
+-----------------+--------------------+-----+--------+------+-----------+
| Yolanda Williams|413 Haney Gardens...|   WA|    6935|Female|    1195503|
|       Mary Smith|     3764 Amber Oval|  NSW|    2782|Female|     179208|
|    Jill Jones MD|  40693 Henry Greens|   NT|     862|Female|    1194530|
|  Lindsay Jimenez|00653 Davenport C...|  NSW|    2780|Female|     154128|
|Rebecca Blanchard|9271 Michael Mano...|   WA|    6355|Female|     712975|
+-----------------+--------------------+-----+--------+------+-----------+
only showing top 5 rows



In [19]:
preprocessed_tbl_consumer = tbl_consumer.withColumnRenamed("name", "consumer")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("address", "consumer_address")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("state", "consumer_state")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("postcode", "consumer_postcode")
preprocessed_tbl_consumer = preprocessed_tbl_consumer.withColumnRenamed("gender", "consumer_gender")

In [20]:
preprocessed_tbl_consumer.show(5)

+-----------------+--------------------+--------------+-----------------+---------------+-----------+
|         consumer|    consumer_address|consumer_state|consumer_postcode|consumer_gender|consumer_id|
+-----------------+--------------------+--------------+-----------------+---------------+-----------+
| Yolanda Williams|413 Haney Gardens...|            WA|             6935|         Female|    1195503|
|       Mary Smith|     3764 Amber Oval|           NSW|             2782|         Female|     179208|
|    Jill Jones MD|  40693 Henry Greens|            NT|              862|         Female|    1194530|
|  Lindsay Jimenez|00653 Davenport C...|           NSW|             2780|         Female|     154128|
|Rebecca Blanchard|9271 Michael Mano...|            WA|             6355|         Female|     712975|
+-----------------+--------------------+--------------+-----------------+---------------+-----------+
only showing top 5 rows



<h3>1.4 "consumer.parquet"</h3>

In [21]:
consumer.show(5)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
|      3|    1194530|
|      4|     154128|
|      5|     712975|
+-------+-----------+
only showing top 5 rows



In [22]:
preprocessed_consumer = consumer.withColumnRenamed("user_id", "user_id_repeat")
preprocessed_consumer = preprocessed_consumer.withColumnRenamed("consumer_id", "consumer_id_repeat")

In [23]:
preprocessed_consumer.show(5)

+--------------+------------------+
|user_id_repeat|consumer_id_repeat|
+--------------+------------------+
|             1|           1195503|
|             2|            179208|
|             3|           1194530|
|             4|            154128|
|             5|            712975|
+--------------+------------------+
only showing top 5 rows



<h2>2. Join tables</h2>

In [24]:
preprocessed_tbl_merchant_cat.createOrReplaceTempView('merchant')
preprocessed_transact.createOrReplaceTempView('transact')
preprocessed_tbl_consumer.createOrReplaceTempView('tbl_consumer')
preprocessed_consumer.createOrReplaceTempView('consumer')

join = spark.sql("""
SELECT 
    *
FROM 
    merchant
INNER JOIN
    transact
ON 
    transact.merchant_abn_repeat = merchant.merchant_abn
INNER JOIN
    consumer
ON
    transact.user_id = consumer.user_id_repeat
INNER JOIN
    tbl_consumer
ON 
    consumer.consumer_id_repeat = tbl_consumer.consumer_id
ORDER BY
    revenue_level DESC
""")

join = join.drop("merchant_abn_repeat", "consumer_id_repeat", "user_id_repeat", "merchant_abn", "consumer_id", "order_id")
print(f"Mismatched (null) transactions dropped: {preprocessed_transact.count() - join.count()}")



Mismatched (null) transactions dropped: 580830


                                                                                

In [25]:
join

                                                                                

merchant_name,products,revenue_level,take_rate,category,user_id,dollar_value,order_datetime,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender
Nunc Est LLP,watch clock and j...,e,0.22,1,22603,43.591273089804616,2021-05-22,2021,5,22,Robert Duncan,6287 Shah Spur Su...,NSW,2449,Male
Lacus Mauris Asso...,shoe shops,e,0.48,4,22040,306.2836044736155,2021-06-07,2021,6,7,Janice Payne,397 Dudley Cliff ...,WA,6514,Female
Elit Consulting,antique shops sal...,e,0.47,0,22603,156.34258070283065,2022-07-23,2022,7,23,Robert Duncan,6287 Shah Spur Su...,NSW,2449,Male
Et Nunc Consulting,books periodicals...,e,0.16,4,9460,47.69273816802664,2021-05-22,2021,5,22,Carrie Mcguire,418 Shields Alley...,WA,6572,Female
Eget Metus Eu Ins...,shoe shops,e,0.42,4,22603,641.2971109480748,2022-07-23,2022,7,23,Robert Duncan,6287 Shah Spur Su...,NSW,2449,Male
Augue Eu Tempor A...,cable satellite a...,e,0.38,2,9460,46.65691800037062,2021-10-20,2021,10,20,Carrie Mcguire,418 Shields Alley...,WA,6572,Female
Nunc Est LLP,watch clock and j...,e,0.22,1,22603,72.9929882938332,2022-07-02,2022,7,2,Robert Duncan,6287 Shah Spur Su...,NSW,2449,Male
Lacus Mauris Asso...,shoe shops,e,0.48,4,722,310.20043945437243,2021-04-07,2021,4,7,Carol Kelley,79229 Tiffany Str...,NSW,2768,Female
Ante Industries,motor vehicle sup...,e,0.35,4,22603,264.8037618296113,2021-09-22,2021,9,22,Robert Duncan,6287 Shah Spur Su...,NSW,2449,Male
Augue Eu Tempor A...,cable satellite a...,e,0.38,2,722,111.10109077411116,2021-12-12,2021,12,12,Carol Kelley,79229 Tiffany Str...,NSW,2768,Female


In [26]:
join.write.parquet("../data/curated/raw_join_internal_table")

                                                                                