# Project 2 - Group 13

## Tags ONLY cleaning and analysis

### Begin SPARK

In [1]:
from pyspark.sql import SparkSession, functions as F

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    #.config("spark.executor.memory", "8g")
    #.config("spark.driver.memory", "8g")
    .getOrCreate()
)

24/09/25 15:05:49 WARN Utils: Your hostname, MacBook.local resolves to a loopback address: 127.0.0.1; using 192.168.0.10 instead (on interface en0)
24/09/25 15:05:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 15:05:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/25 15:05:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Read in data

In [2]:
tags_df = spark.read.parquet('../data/tables/tables/tbl_merchants.parquet', inferSchema = True)
tags_df = tags_df.withColumnRenamed('name', 'merchant_name')
print(f'Number of entries = {tags_df.count()}')
tags_df.limit(5)

                                                                                

Number of entries = 4026


                                                                                

merchant_name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


### Clean up tags column

In [3]:
from pyspark.sql.functions import regexp_replace, lower

# Clean up the 'tags' column
tags_df = tags_df.withColumn('tags',
                             lower(regexp_replace(regexp_replace(tags_df['tags'], r'\(', '['), r'\)', ']'))
                            )

tags_df.show(truncate = False)

+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+
|merchant_name                       |tags                                                                                                             |merchant_abn|
+------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------+
|Felis Limited                       |[[furniture, home furnishings and equipment shops, and manufacturers, except appliances], [e], [take rate: 0.18]]|10023283211 |
|Arcu Ac Orci Corporation            |[[cable, satellite, and other pay television and radio services], [b], [take rate: 4.22]]                        |10142254217 |
|Nunc Sed Company                    |[[jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40]]                                          |10165489824 |
|Ult

In [4]:
from pyspark.sql.functions import split, regexp_replace, col

# Split 'tags' column into the 3 components: 'tags', 'revenue_level' and 'take_rate'
# 1. Rename column 'tags'
tags_df = tags_df.withColumnRenamed('tags', 'tags_array')

# 2. Remove [] on the 2 ends
tags_df = tags_df.withColumn('tags_array', regexp_replace(col('tags_array'), r'^\[|\]$', ''))

# 3. Split by ]
split_col = split(col('tags_array'), r'\]')

# 4. Assign split into new columns
tags_df = tags_df.withColumn('tags', split_col[0]) \
                 .withColumn('revenue_level', split_col[1]) \
                 .withColumn('take_rate', split_col[2])

tags_df = tags_df.drop('tags_array')
tags_df.show(truncate = False)

+------------------------------------+------------+--------------------------------------------------------------------------------------+-------------+------------------+
|merchant_name                       |merchant_abn|tags                                                                                  |revenue_level|take_rate         |
+------------------------------------+------------+--------------------------------------------------------------------------------------+-------------+------------------+
|Felis Limited                       |10023283211 |[furniture, home furnishings and equipment shops, and manufacturers, except appliances|, [e         |, [take rate: 0.18|
|Arcu Ac Orci Corporation            |10142254217 |[cable, satellite, and other pay television and radio services                        |, [b         |, [take rate: 4.22|
|Nunc Sed Company                    |10165489824 |[jewelry, watch, clock, and silverware shops                                          |, 

In [5]:
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.types import DoubleType

# Clean up all columns
# 1. Clean up 'tags': Remove unnecessary characters + remove ', and'
tags_df = tags_df.withColumn('tags', regexp_replace(col('tags'), r'\[', '')) \
                 .withColumn('tags', regexp_replace(col('tags'), r'\s+', ' ')) \
                 .withColumn('tags', regexp_replace(col('tags'), r'\s,', ',')) \
                 .withColumn('tags', regexp_replace(col('tags'), r', and', ','))

# 2. 'revenue_level': Only want the letter
tags_df = tags_df.withColumn('revenue_level', regexp_replace(col('revenue_level'), r'[^a-zA-Z]', ''))

# 3. 'take_rate': Only want the digits and cast to double
tags_df = tags_df.withColumn('take_rate', regexp_replace(col('take_rate'), r'[^0-9.]', ''))
tags_df = tags_df.withColumn('take_rate', col('take_rate').cast(DoubleType()))

tags_df.show(truncate = False)

24/09/25 15:06:07 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


+------------------------------------+------------+---------------------------------------------------------------------------------+-------------+---------+
|merchant_name                       |merchant_abn|tags                                                                             |revenue_level|take_rate|
+------------------------------------+------------+---------------------------------------------------------------------------------+-------------+---------+
|Felis Limited                       |10023283211 |furniture, home furnishings and equipment shops, manufacturers, except appliances|e            |0.18     |
|Arcu Ac Orci Corporation            |10142254217 |cable, satellite, other pay television and radio services                        |b            |4.22     |
|Nunc Sed Company                    |10165489824 |jewelry, watch, clock, silverware shops                                          |b            |4.4      |
|Ultricies Dignissim Lacus Foundation|10187291046 |w

In [6]:
tags_df.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



### Count number of each tag

In [8]:
tag_counts_df = tags_df.groupBy('tags').count()

tag_counts_df = tag_counts_df.orderBy('count', ascending = False)
tag_counts_df = tag_counts_df.withColumn('index', F.monotonically_increasing_id())

print("Number of unique tags =", tag_counts_df.count())
print("Total count (should be equal to number of merchants) =", tag_counts_df.agg(F.sum('count')).collect()[0][0])
tag_counts_df.show(n = tag_counts_df.count(), truncate = False)

Number of unique tags = 25
Total count (should be equal to number of merchants) = 4026
+---------------------------------------------------------------------------------+-----+-----+
|tags                                                                             |count|index|
+---------------------------------------------------------------------------------+-----+-----+
|digital goods: books, movies, music                                              |195  |0    |
|artist supply and craft shops                                                    |193  |1    |
|computer programming, data processing, integrated systems design services        |191  |2    |
|shoe shops                                                                       |185  |3    |
|gift, card, novelty, souvenir shops                                              |182  |4    |
|furniture, home furnishings and equipment shops, manufacturers, except appliances|182  |5    |
|computers, computer peripheral equipment, softwa

### Create industry segments

In [13]:
# Mapping of each tag to their respective segment
segment_mapping = {"retail": [0, 1, 3, 4, 5, 6, 7, 8, 11, 12, 13, 15, 16, 18, 19],
                   "technology + media services": [2, 9, 22],
                   "health + beauty": [14, 17],
                   "luxury products + services": [10, 21, 23, 24],
                   "rental services": [20]
                   }

# Assign segments
segment_conditions = F.when(col("index").isin(segment_mapping["retail"]), "retail") \
                      .when(col("index").isin(segment_mapping["technology + media services"]), "technology + media services") \
                      .when(col("index").isin(segment_mapping["health + beauty"]), "health + beauty") \
                      .when(col("index").isin(segment_mapping["luxury products + services"]), "luxury products + services") \
                      .when(col("index").isin(segment_mapping["rental services"]), "rental services") \
                      .otherwise("Unknown")

tag_counts_df = tag_counts_df.withColumn("segment", segment_conditions)
tag_counts_df.show(n = tag_counts_df.count(), truncate = False)

                                                                                

+---------------------------------------------------------------------------------+-----+-----+---------------------------+
|tags                                                                             |count|index|segment                    |
+---------------------------------------------------------------------------------+-----+-----+---------------------------+
|digital goods: books, movies, music                                              |195  |0    |retail                     |
|artist supply and craft shops                                                    |193  |1    |retail                     |
|computer programming, data processing, integrated systems design services        |191  |2    |technology + media services|
|shoe shops                                                                       |185  |3    |retail                     |
|gift, card, novelty, souvenir shops                                              |182  |4    |retail                     |
|furnitu

### Merge with tags_df

In [15]:
tag_counts_df = tag_counts_df.withColumnRenamed("segment", "industry_segment")

# Join ON 'tags'
tags_df = tags_df.join(tag_counts_df, on = "tags", how = "left")

tags_df = tags_df.drop('count')
tags_df = tags_df.drop('index')

tags_df.show(truncate = False)

+---------------------------------------------------------------------------------+------------------------------------+------------+-------------+---------+---------------------------+
|tags                                                                             |merchant_name                       |merchant_abn|revenue_level|take_rate|industry_segment           |
+---------------------------------------------------------------------------------+------------------------------------+------------+-------------+---------+---------------------------+
|furniture, home furnishings and equipment shops, manufacturers, except appliances|Felis Limited                       |10023283211 |e            |0.18     |retail                     |
|cable, satellite, other pay television and radio services                        |Arcu Ac Orci Corporation            |10142254217 |b            |4.22     |technology + media services|
|jewelry, watch, clock, silverware shops                              

## MERGE with TRANSACTIONS

In [17]:
trans_final_sdf = spark.read.parquet('../data/merged/merged_transactions_with_outliers.parquet')
trans_final_sdf = trans_final_sdf.drop('tags')

print("Number of entries =", trans_final_sdf.count())
trans_final_sdf.show(5)

                                                                                

Number of entries = 13614854


[Stage 106:>                                                        (0 + 1) / 1]

+-------+------------+------------------+--------------------+--------------+--------------------+-----------+----------------+--------------------+-----+--------+-----------+--------------------------+--------------------------+-------------+-------------+--------+----------------+----------------------+----------------+------------+--------------------+--------------------------+--------------------+--------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|       merchant_name|consumer_id|            name|             address|state|postcode|     gender|consumer_fraud_probability|merchant_fraud_probability|LOCALITY_NAME|list_SA2_CODE|list_ERP|list_num_earners|list_total_income_($M)|list_mean_income|postcode_ERP|postcode_num_earners|postcode_total_income_($M)|postcode_mean_income|is_fraud|
+-------+------------+------------------+--------------------+--------------+--------------------+-----------+----------------+--------------------+-----+--------+-------

                                                                                

In [20]:
trans_final_sdf = trans_final_sdf.join(F.broadcast(tags_df),
                                       trans_final_sdf.merchant_abn == tags_df.merchant_abn,
                                       how = 'left'
                                       ).drop(tags_df.merchant_abn, tags_df.merchant_name)

print("Number of entries =", trans_final_sdf.count())
trans_final_sdf.show(5)

                                                                                

Number of entries = 13614854


[Stage 145:>                                                        (0 + 1) / 1]

+-------+------------+------------------+--------------------+--------------+--------------------+-----------+----------------+--------------------+-----+--------+-----------+--------------------------+--------------------------+-------------+-------------+--------+----------------+----------------------+----------------+------------+--------------------+--------------------------+--------------------+--------+--------------------+-------------+---------+----------------+--------------------+--------------------+------------+-------------+---------+----------------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|       merchant_name|consumer_id|            name|             address|state|postcode|     gender|consumer_fraud_probability|merchant_fraud_probability|LOCALITY_NAME|list_SA2_CODE|list_ERP|list_num_earners|list_total_income_($M)|list_mean_income|postcode_ERP|postcode_num_earners|postcode_total_income_($M)|postcode_mean_income|is_fraud|             

                                                                                

In [19]:
trans_final_sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- consumer_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- consumer_fraud_probability: double (nullable = true)
 |-- merchant_fraud_probability: double (nullable = true)
 |-- LOCALITY_NAME: string (nullable = true)
 |-- list_SA2_CODE: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- list_ERP: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- list_num_earners: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- list_total_income_($M): array (nullable = true)
 |    |-- element: d