## This notebook is to clean up the 'tags' column

### 0 - Import necessary modules and start Spark

In [1]:
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.types import DoubleType

# Start (or reuse) the Spark session that executes every job in this notebook.
# All tunables live in one mapping so they are easy to scan and adjust.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/10/01 19:18:13 WARN Utils: Your hostname, AshwinHPEnvy resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/10/01 19:18:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/01 19:18:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/01 19:18:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### 1. Clean up 'tags' column

#### 1.1 - Load in data

In [2]:
# Read the merged transactions produced by the outlier-analysis step.
merged_transactions_path = '../data/merged/merged_transactions_with_outliers.parquet'
tags_df = spark.read.parquet(merged_transactions_path)

                                                                                

#### 1.2 - Lower case and change brackets

In [3]:
# Normalise the raw 'tags' text so that every entry uses the same delimiters:
#   1. turn each '(' into '['
#   2. turn each ')' into ']'
#   3. lower-case the whole string
open_brackets_unified = F.regexp_replace(F.col('tags'), r'\(', '[')
all_brackets_unified = F.regexp_replace(open_brackets_unified, r'\)', ']')

tags_df = tags_df.withColumn('tags', F.lower(all_brackets_unified))

tags_df.show(truncate = False)

24/10/01 19:18:27 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------+------------------+------------------------------------+--------------+-------------------------------------+------------------------------------------------------------------------------------------+-----------+-----------------+-----------------------------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                        |tags                                                                                      |consumer_id|name             |address                      |state|

#### 1.3 - Split 'tags' column into 3 components: 'description', 'revenue_band' and 'take_rate'

In [4]:
# Strip the single outermost '[' and ']' from the tag string, then cut what is
# left at every ']' so that each bracketed group becomes one array element.
tags_without_outer_brackets = F.regexp_replace(F.col('tags'), r'^\[|\]$', '')
tag_parts = F.split(tags_without_outer_brackets, r'\]')

# The first three pieces become the new columns (still containing leftover
# punctuation at this point); the bulky, now obsolete 'tags' column is
# dropped afterwards.
tags_df = (
    tags_df
    .withColumn('description', tag_parts.getItem(0))
    .withColumn('revenue_band', tag_parts.getItem(1))
    .withColumn('take_rate', tag_parts.getItem(2))
    .drop('tags')
)
tags_df.show(truncate = False)

[Stage 2:>                                                          (0 + 1) / 1]

+-------+------------+------------------+------------------------------------+--------------+-------------------------------------+-----------+-----------------+-----------------------------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+---------------------------------------------------------------+------------+------------------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                        |consumer_id|name             |address                      |state|postcode|gender     |consumer_fraud_probability|merchant_fraud_probability|LOCALITY_NA

                                                                                

##### 1.3.1 - Clean up the 3 new columns

In [5]:
# 1. 'description': apply the substitutions below in order
#      - drop any remaining '['
#      - collapse runs of whitespace into a single space
#      - remove whitespace that sits directly before a comma
#      - replace ', and' with ','
description_substitutions = [
    (r'\[', ''),
    (r'\s+', ' '),
    (r'\s,', ','),
    (r', and', ','),
]
description_expr = F.col('description')
for pattern, replacement in description_substitutions:
    description_expr = F.regexp_replace(description_expr, pattern, replacement)

# 2. 'revenue_band': keep letters only
revenue_band_expr = F.regexp_replace(F.col('revenue_band'), r'[^a-zA-Z]', '')

# 3. 'take_rate': keep digits and '.', then convert to a double
take_rate_expr = F.regexp_replace(F.col('take_rate'), r'[^0-9.]', '').cast(DoubleType())

tags_df = (
    tags_df
    .withColumn('description', description_expr)
    .withColumn('revenue_band', revenue_band_expr)
    .withColumn('take_rate', take_rate_expr)
)

tags_df.show(truncate = False)

[Stage 3:>                                                          (0 + 1) / 1]

+-------+------------+------------------+------------------------------------+--------------+-------------------------------------+-----------+-----------------+-----------------------------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+---------------------------------------------------------+------------+---------+
|user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                        |consumer_id|name             |address                      |state|postcode|gender     |consumer_fraud_probability|merchant_fraud_probability|LOCALITY_NAME |list_SA2_CO

                                                                                

##### 1.3.2 - Count the number of unique descriptions

In [6]:
# Count the number of transactions for each description
tag_counts_df = tags_df.groupBy('description').count()

# Create a 0-based index column (most frequent description = 0) that is used to
# allocate descriptions to segments later.
#
# row_number() over an ordered window is used instead of
# monotonically_increasing_id(): the latter only guarantees unique, increasing
# ids (the partition number is encoded in the upper bits), so it yields
# 0, 1, 2, ... only when the sorted data happens to sit in a single partition.
# 'description' is a tiebreaker so equal counts always get the same index.
# The un-partitioned window pulls all rows into one partition, which is fine
# here because there is only one row per distinct description.
index_window = Window.orderBy(F.desc('count'), F.asc('description'))
tag_counts_df = (
    tag_counts_df
    .withColumn('index', (F.row_number().over(index_window) - 1).cast('long'))
    .orderBy('index')  # order by the count (descending), i.e. by index
    .cache()           # small result reused by several actions below and later
)

# Evaluate the row count once and reuse it instead of re-running the job
n_unique_tags = tag_counts_df.count()

print("Number of unique tags =", n_unique_tags)
print("Total count (should be equal to number of transactions) =", tag_counts_df.agg(F.sum('count')).collect()[0][0])
tag_counts_df.show(n = n_unique_tags, truncate = False)

                                                                                

Number of unique tags = 25


                                                                                

Total count (should be equal to number of transactions) = 13614854


                                                                                

+---------------------------------------------------------------------------------+-------+-----+
|description                                                                      |count  |index|
+---------------------------------------------------------------------------------+-------+-----+
|tent and awning shops                                                            |1816901|0    |
|gift, card, novelty, souvenir shops                                              |1640025|1    |
|digital goods: books, movies, music                                              |1283763|2    |
|watch, clock, jewelry repair shops                                               |1031486|3    |
|cable, satellite, other pay television and radio services                        |957072 |4    |
|florists supplies, nursery stock, flowers                                        |881774 |5    |
|opticians, optical goods, eyeglasses                                             |843893 |6    |
|computers, computer

### 2 - Create industry segments

In [7]:
# CHANGE THIS MAPPING IF SEGMENTS CHANGE
# (this is the only place that needs editing: the assignment logic below is
# generated from the mapping, so segments can be added, renamed or removed here)

# Map descriptions to their respective segments.
# Keys are segment names; values are the 'index' values of tag_counts_df
# (position of the description when ordered by transaction count, descending).
segment_mapping = {"retail": [0, 1, 2, 5, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 20],
                   "technology + media services": [4, 8, 22],
                   "health + beauty": [6, 16],
                   "luxury products + services": [3, 19, 23, 24],
                   "rental services": [21]
                   }

# Sanity check: an index listed under two segments would silently be assigned
# to whichever segment comes first, so fail loudly instead.
all_mapped_indices = [idx for indices in segment_mapping.values() for idx in indices]
assert len(all_mapped_indices) == len(set(all_mapped_indices)), \
    "segment_mapping assigns at least one index to more than one segment"

# Assign the descriptions to segments using the index mappings.
# Builds F.when(...).when(...)...otherwise("Unknown") with one branch per
# segment, in the order the segments appear in segment_mapping.
segment_conditions = None
for segment_name, segment_indices in segment_mapping.items():
    in_segment = F.col("index").isin(segment_indices)
    if segment_conditions is None:
        segment_conditions = F.when(in_segment, segment_name)
    else:
        segment_conditions = segment_conditions.when(in_segment, segment_name)

# Indices not covered by the mapping (or an empty mapping) are labelled "Unknown"
if segment_conditions is None:
    segment_conditions = F.lit("Unknown")
else:
    segment_conditions = segment_conditions.otherwise("Unknown")

tag_counts_df = tag_counts_df.withColumn("segment", segment_conditions)
tag_counts_df.show(n = tag_counts_df.count(), truncate = False)

                                                                                

+---------------------------------------------------------------------------------+-------+-----+---------------------------+
|description                                                                      |count  |index|segment                    |
+---------------------------------------------------------------------------------+-------+-----+---------------------------+
|tent and awning shops                                                            |1816901|0    |retail                     |
|gift, card, novelty, souvenir shops                                              |1640025|1    |retail                     |
|digital goods: books, movies, music                                              |1283763|2    |retail                     |
|watch, clock, jewelry repair shops                                               |1031486|3    |luxury products + services |
|cable, satellite, other pay television and radio services                        |957072 |4    |technology + media se

### 3 - Merge the two tables on 'description' column

In [8]:
# Rename the segment column to its final name (a no-op if it was already
# renamed, e.g. when this cell is re-run).
tag_counts_df = tag_counts_df.withColumnRenamed("segment", "industry_segment")

# Only the join key and the segment label are needed on the transaction table,
# so the helper columns 'count' and 'index' are left out before joining rather
# than being carried through the join and dropped afterwards.
segment_lookup_df = tag_counts_df.select("description", "industry_segment")

# Join ON 'description'
#  - any 'industry_segment' column left over from a previous run of this cell
#    is dropped first, so re-running does not produce a duplicate column
#    (drop() ignores columns that do not exist)
#  - the lookup has one row per distinct description, so it is broadcast to
#    avoid shuffling the large transaction table
tags_df = (
    tags_df
    .drop("industry_segment")
    .join(F.broadcast(segment_lookup_df), on = "description", how = "left")
)

tags_df.show(truncate = False)

                                                                                

+---------------------------------------------------------+-------+------------+------------------+------------------------------------+--------------+-------------------------------------+-----------+-----------------+-----------------------------+-----+--------+-----------+--------------------------+--------------------------+--------------+------------------------------------------------------------------+---------------------------------------+------------------------------------+------------------------------------------+------------------------------------------------------+------------+--------------------+--------------------------+--------------------+--------+------------+---------+---------------------------+
|description                                              |user_id|merchant_abn|dollar_value      |order_id                            |order_datetime|merchant_name                        |consumer_id|name             |address                      |state|postcode|gender

### 4 - Save data

In [9]:
# Persist the tagged transactions, replacing any output from an earlier run.
tagged_transactions_path = "../data/merged/merged_transactions_with_tags.parquet"
tags_df.write.parquet(tagged_transactions_path, mode="overwrite")

                                                                                