# Cleaning up tags column

In [2]:
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.types import DoubleType

# Spark settings applied to the session builder below, keyed by option name
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
}

# Build (or reuse) the Spark session that will run the Spark jobs
session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/09/25 20:15:49 WARN Utils: Your hostname, LAPTOP-MMSL4U2D resolves to a loopback address: 127.0.1.1; using 172.17.187.27 instead (on interface eth0)
24/09/25 20:15:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 20:15:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Read the merged transactions (after outliers analysis) from parquet
tags_df = spark.read.format('parquet').load('../data/merged/merged_transactions_with_outliers.parquet')

In [7]:
# Normalise the raw 'tags' text: turn round brackets into square brackets,
# then lower-case the whole string
tags_open_fixed = F.regexp_replace(tags_df['tags'], r'\(', '[')
tags_brackets_fixed = F.regexp_replace(tags_open_fixed, r'\)', ']')
tags_df = tags_df.withColumn('tags', F.lower(tags_brackets_fixed))

tags_df.show(truncate = False)

[Stage 4:>                                                          (0 + 1) / 1]

+-------+------------+------------------+------------------------------------+--------------+-------------------------------+-----------------------------------------------------------------------------------------------------------------+-----------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                  |tags                                                                                                             |consumer_id|state|postcode|gender     |consumer_fraud_probability|merchant_fraud

                                                                                

In [9]:
# Break 'tags' into its 3 components: 'description', 'revenue_band' and 'take_rate'

# 1. Strip the single '[' at the very start and the single ']' at the very end
tags_df = tags_df.withColumn('tags', F.regexp_replace(F.col('tags'), r'^\[|\]$', ''))

# 2. Split what is left on every ']'
split_col = F.split(F.col('tags'), r'\]')

# 3. The first three pieces become the new columns, in this order
for piece_position, piece_name in enumerate(('description', 'revenue_band', 'take_rate')):
    tags_df = tags_df.withColumn(piece_name, split_col[piece_position])

# The combined column is no longer needed once it has been split
tags_df = tags_df.drop('tags')
tags_df.show(truncate = False)

[Stage 5:>                                                          (0 + 1) / 1]

+-------+------------+------------------+------------------------------------+--------------+-------------------------------+-----------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+--------------------------------------------------------------------------------------+------------+------------------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                  |consumer_id|state|postcode|gender     |consumer_fraud_probability|merchant_fraud_probability|LOCALITY_NAME |list_SA2_CODE                                                     |list_ERP      

                                                                                

In [11]:
# Tidy each of the three columns produced by the split

# 1. 'description': apply the substitutions below, in order
description_substitutions = [
    (r'\[', ''),      # drop leftover opening brackets
    (r'\s+', ' '),    # collapse each whitespace run into a single space
    (r'\s,', ','),    # remove a whitespace character sitting before a comma
    (r', and', ','),  # replace ', and' with a plain comma
]
for pattern, replacement in description_substitutions:
    tags_df = tags_df.withColumn('description', F.regexp_replace(F.col('description'), pattern, replacement))

# 2. 'revenue_band': keep letters only
revenue_band_letters = F.regexp_replace(F.col('revenue_band'), r'[^a-zA-Z]', '')
tags_df = tags_df.withColumn('revenue_band', revenue_band_letters)

# 3. 'take_rate': keep digits and '.', then cast the result to double
take_rate_digits = F.regexp_replace(F.col('take_rate'), r'[^0-9.]', '')
tags_df = tags_df.withColumn('take_rate', take_rate_digits.cast(DoubleType()))

tags_df.show(truncate = False)

[Stage 6:>                                                          (0 + 1) / 1]

+-------+------------+------------------+------------------------------------+--------------+-------------------------------+-----------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+---------------------------------------------------------------------------------+------------+---------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                  |consumer_id|state|postcode|gender     |consumer_fraud_probability|merchant_fraud_probability|LOCALITY_NAME |list_SA2_CODE                                                     |list_ERP                    

                                                                                

In [14]:
# Count the number of transactions for each description
tag_counts_df = tags_df.groupBy('description').count()

# Rank descriptions from most to least frequent with a 0-based 'index'.
# row_number() over an ordered window yields consecutive integers, whereas
# monotonically_increasing_id() only guarantees unique, increasing ids (the
# partition number is encoded in the id, so values can jump between partitions).
# 'description' is used as a tie-breaker so the ranking is deterministic.
count_rank_window = Window.orderBy(F.col('count').desc(), F.col('description').asc())
tag_counts_df = (
    tag_counts_df
    .withColumn('index', F.row_number().over(count_rank_window) - 1)
    .orderBy('index')
    # Cache the small result so the actions below do not each recompute the aggregation
    .cache()
)

# Compute the row count once and reuse it for both the report and show()
num_unique_tags = tag_counts_df.count()
print("Number of unique tags =", num_unique_tags)
print("Total count (should be equal to number of transactions) =", tag_counts_df.agg(F.sum('count')).collect()[0][0])
tag_counts_df.show(n = num_unique_tags, truncate = False)

                                                                                

Number of unique tags = 25


                                                                                

Total count (should be equal to number of transactions) = 13614854




+---------------------------------------------------------------------------------+-------+-----+
|description                                                                      |count  |index|
+---------------------------------------------------------------------------------+-------+-----+
|tent and awning shops                                                            |1816901|0    |
|gift, card, novelty, souvenir shops                                              |1640025|1    |
|digital goods: books, movies, music                                              |1283763|2    |
|watch, clock, jewelry repair shops                                               |1031486|3    |
|cable, satellite, other pay television and radio services                        |957072 |4    |
|florists supplies, nursery stock, flowers                                        |881774 |5    |
|opticians, optical goods, eyeglasses                                             |843893 |6    |
|computers, computer

                                                                                

### Create industry segments

In [15]:
# UPDATE THIS MAPPING IF THE SEGMENTS CHANGE

# Segment name -> 'index' values of the tags that belong to that segment
segment_mapping = {
    "retail": [0, 1, 2, 5, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 20],
    "technology + media services": [4, 8, 22],
    "health + beauty": [6, 16],
    "luxury products + services": [3, 19, 23, 24],
    "rental services": [21],
}

# Build the conditional expression one segment at a time, in mapping order;
# any index that is not listed in the mapping is labelled "Unknown"
segment_conditions = None
for segment_name, segment_indices in segment_mapping.items():
    in_segment = F.col("index").isin(segment_indices)
    if segment_conditions is None:
        segment_conditions = F.when(in_segment, segment_name)
    else:
        segment_conditions = segment_conditions.when(in_segment, segment_name)
segment_conditions = segment_conditions.otherwise("Unknown")

# Attach the segment label to every tag and display the full table
tag_counts_df = tag_counts_df.withColumn("segment", segment_conditions)
tag_counts_df.show(n = tag_counts_df.count(), truncate = False)

                                                                                

+---------------------------------------------------------------------------------+-------+-----+---------------------------+
|description                                                                      |count  |index|segment                    |
+---------------------------------------------------------------------------------+-------+-----+---------------------------+
|tent and awning shops                                                            |1816901|0    |retail                     |
|gift, card, novelty, souvenir shops                                              |1640025|1    |retail                     |
|digital goods: books, movies, music                                              |1283763|2    |retail                     |
|watch, clock, jewelry repair shops                                               |1031486|3    |luxury products + services |
|cable, satellite, other pay television and radio services                        |957072 |4    |technology + media se

### Merge with tags_df

In [17]:
# Rename the segment column before attaching it to the transactions
tag_counts_df = tag_counts_df.withColumnRenamed("segment", "industry_segment")

# Left-join on 'description', then discard the 'count' and 'index' columns
# that came along from tag_counts_df
tags_df = (
    tags_df
    .join(tag_counts_df, on = "description", how = "left")
    .drop('count', 'index')
)

tags_df.show(truncate = False)

                                                                                

+---------------------------------------------------------------------------------+-------+------------+------------------+------------------------------------+--------------+-------------------------------+-----------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+------------+---------+---------------------------+
|description                                                                      |user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                  |consumer_id|state|postcode|gender     |consumer_fraud_probability|merchant_fraud_probability|

In [19]:
# Write the dataframe out as parquet, overwriting any existing output at this path
tags_df.write.parquet("../data/merged/merged_transactions_with_tags.parquet", mode = "overwrite")

                                                                                