In [1]:
from pyspark.sql import functions as F, SparkSession

import os
import sys

# Make the parent directory importable so that `scripts.*` resolves.
# `sys` is used directly instead of `os.sys`, which is not a documented
# attribute of `os`, and the membership check stops re-runs of this cell
# from appending the same entry again.
if "../" not in sys.path:
    sys.path.append("../")
from scripts.consumer_model import *

from pyspark.sql import functions as F
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover




In [2]:
# Build (or reuse) the Spark session for this notebook.
# The settings are applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
}

session_builder = SparkSession.builder.appName("segments")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

24/09/29 20:51:10 WARN Utils: Your hostname, qinsitaodeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.4.51 instead (on interface en0)
24/09/29 20:51:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/29 20:51:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the curated merchant table from its parquet file
MERCHANT_INFO_PATH = '../data/curated/merchant_info.parquet'
merchants_info = spark.read.parquet(MERCHANT_INFO_PATH)

                                                                                

In [4]:
# Print the column names and types of the loaded merchant table
merchants_info.printSchema()

root
 |-- name: string (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- category: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



In [5]:
# Tally how many merchants carry each raw `category` string
distinct_category = (
    merchants_info
    .groupBy("category")
    .count()
)

In [6]:
# Display up to 100 category rows without truncating the long category text
distinct_category.show(100, truncate=False)

                                                                                

+---------------------------------------------------------------------------------------+-----+
|category                                                                               |count|
+---------------------------------------------------------------------------------------+-----+
|stationery, office  supplies and printing and writing paper                            |2    |
|health  and beauty spas                                                                |1    |
|opticians, optical goods, and eyeglasses                                               |145  |
|books, periodicals, and  newspapers                                                    |5    |
|watch, clock, and jewelry repair shops                                                 |159  |
|computer programming , data processing, and integrated systems design services         |182  |
|jewelry, watch, clock, and  silverware shops                                           |2    |
|tent and  awning shops                 

# Data preprocessing

In [7]:

# Turn the free-text `category` column into a list of meaningful tokens.

# Drop columns produced by a previous run of this cell so it can be re-run:
# the tokenizer and stop-word remover raise if their output column already
# exists. Dropping a column that is not present is a no-op, so the first run
# is unaffected.
merchants_info = merchants_info.drop("category_clean", "tokens", "filtered_tokens")

# Lowercase all text
merchants_info = merchants_info.withColumn("category_clean", F.lower(F.col("category")))

# Tokenize the descriptions (split text into words on non-word characters)
tokenizer = RegexTokenizer(inputCol="category_clean", outputCol="tokens", pattern="\\W")
merchants_info = tokenizer.transform(merchants_info)

# Remove stop words. The locale is pinned to en_US so the result does not
# depend on the machine's default locale (Spark otherwise warns and falls back
# to en_US when the default locale is unavailable in the JVM).
stopwords_remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered_tokens",
    locale="en_US",
)
merchants_info = stopwords_remover.transform(merchants_info)


24/09/29 20:51:21 WARN StopWordsRemover: Default locale set was [en_CN]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [8]:
# Tally merchants per cleaned token list to see which distinct categories remain
preprocessing_distinct_category = (
    merchants_info
    .groupBy("filtered_tokens")
    .count()
)
preprocessing_distinct_category.show(n=50, truncate=False)

+-----------------------------------------------------------------------------------+-----+
|filtered_tokens                                                                    |count|
+-----------------------------------------------------------------------------------+-----+
|[lawn, garden, supply, outlets, including, nurseries]                              |153  |
|[tent, awning, shops]                                                              |178  |
|[hobby, toy, game, shops]                                                          |142  |
|[equipment, tool, furniture, appliance, rent, al, leasing]                         |134  |
|[shoe, shops]                                                                      |185  |
|[furniture, home, furnishings, equipment, shops, manufacturers, except, appliances]|182  |
|[watch, clock, jewelry, repair, shops]                                             |170  |
|[artist, supply, craft, shops]                                                 

# Text classification

In [10]:
# Manually assign every cleaned category description (its token list) to one
# of 5 segments.
#
# Each token list must appear under exactly one segment. The `segments` column
# is built as a when/otherwise chain in which later entries override earlier
# ones, so a token list repeated under an earlier segment would never take
# effect. ['books', 'periodicals', 'newspapers'] was previously also listed
# under "Home and Garden Supplies"; only the effective "Retail and Gifts"
# entry is kept.
segments_dict = {
    "Electronics and Technology": [
        ['computers', 'computer', 'peripheral', 'equipment', 'software'],
        ['computer', 'programming', 'data', 'processing', 'integrated', 'systems', 'design', 'services'],
        ['telecom'],
        ['cable', 'satellite', 'pay', 'television', 'radio', 'services']
    ],
    "Health and Beauty": [
        ['opticians', 'optical', 'goods', 'eyeglasses'],
        ['health', 'beauty', 'spas'],
        ['florists', 'supplies', 'nursery', 'stock', 'flowers']
    ],
    "Home and Garden Supplies": [
        ['lawn', 'garden', 'supply', 'outlets', 'including', 'nurseries'],
        ['tent', 'awning', 'shops'],
        # 'rent', 'al' is how the source category text tokenizes; keep as-is
        ['equipment', 'tool', 'furniture', 'appliance', 'rent', 'al', 'leasing'],
        ['furniture', 'home', 'furnishings', 'equipment', 'shops', 'manufacturers', 'except', 'appliances']
    ],
    "Retail and Gifts": [
        ['hobby', 'toy', 'game', 'shops'],
        ['artist', 'supply', 'craft', 'shops'],
        ['stationery', 'office', 'supplies', 'printing', 'writing', 'paper'],
        ['digital', 'goods', 'books', 'movies', 'music'],
        ['books', 'periodicals', 'newspapers'],
        ['music', 'shops', 'musical', 'instruments', 'pianos', 'sheet', 'music'],
        ['art', 'dealers', 'galleries'],
        ['gift', 'card', 'novelty', 'souvenir', 'shops'],
        ["shoe", "shops"]
    ],
    "Automotive and Repair Services": [
        ['watch', 'clock', 'jewelry', 'repair', 'shops'],
        ['motor', 'vehicle', 'supplies', 'new', 'parts'],
        ['antique', 'shops', 'sales', 'repairs', 'restoration', 'services'],
        ['bicycle', 'shops', 'sales', 'service'],
        ['jewelry', 'watch', 'clock', 'silverware', 'shops']
    ]
}

# Pair every known token list with its segment label. A row matches when the
# string form of its `filtered_tokens` array equals the string form of the
# expected token array.
segment_conditions = []
for segment_name, token_lists in segments_dict.items():
    for token_list in token_lists:
        expected_tokens = F.array([F.lit(token) for token in token_list])
        is_match = F.col("filtered_tokens").cast("string") == expected_tokens.cast("string")
        segment_conditions.append((is_match, F.lit(segment_name)))

24/09/29 20:51:22 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [11]:
# Add the 'segments' column based on the dictionary.
# Later entries of `segment_conditions` take precedence over earlier ones, so
# the rules are walked from last to first and chained into a single CASE
# expression; rows matching no rule are left null.
segments_column = F.lit(None)
rules_by_priority = list(reversed(segment_conditions))
if rules_by_priority:
    top_condition, top_segment = rules_by_priority[0]
    segments_column = F.when(top_condition, top_segment)
    for rule_condition, rule_segment in rules_by_priority[1:]:
        segments_column = segments_column.when(rule_condition, rule_segment)
merchants_info = merchants_info.withColumn("segments", segments_column)


In [12]:
# Preview the first three rows without truncating cell contents
merchants_info.show(3, truncate=False)

+------------------------+------------+-------------------------------------------------------------------------------------+-------------+---------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+------------------------------+
|name                    |merchant_abn|category                                                                             |revenue_level|take_rate|category_clean                                                                       |tokens                                                                                       |filtered_tokens                                                                    |segments                      |
+------------------------+------------+-----------------------------------------------------------------------

In [13]:
# Number of merchants in each segment
num_merchants = (
    merchants_info
    .groupBy("segments")
    .count()
)
num_merchants.show(n=500, truncate=False)

+------------------------------+-----+
|segments                      |count|
+------------------------------+-----+
|Retail and Gifts              |1501 |
|Home and Garden Supplies      |647  |
|Health and Beauty             |495  |
|Automotive and Repair Services|711  |
|Electronics and Technology    |672  |
+------------------------------+-----+



# Save merchants with their segments

In [15]:
# Evaluate the DataFrame as the cell's last expression so the notebook renders it
# (eager evaluation is enabled on the Spark session).
# NOTE(review): no write step is visible here even though the section heading
# mentions saving — confirm whether a save call is missing.
merchants_info

4026