In [37]:
import os
import sys
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("YourAppName")  \
    .config("spark.executor.memory", "6g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

merchants = spark.read.parquet("../data/curated/curated_merchants")
merchants = merchants.drop("name", "rev_level", "take_rate")
merchants.show(3)

+------------+--------------------+
|merchant_abn|                type|
+------------+--------------------+
| 10023283211|furniture, home f...|
| 10142254217|cable, satellite,...|
| 10187291046|watch, clock, and...|
+------------+--------------------+
only showing top 3 rows



In [38]:
import nltk
from pyspark.sql.functions import udf, split
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import StringType
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK stopwords and WordNet data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4') 

# Preprocessing function
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(x):
    if x:
        # Lowercase the text
        x = x.lower()
        # Remove punctuation
        x = x.translate(str.maketrans('', '', string.punctuation))
        # Lemmatize words
        x = " ".join([lemmatizer.lemmatize(word) for word in x.split()])
        # Remove stop words
        x = ' '.join([word for word in x.split() if word not in STOPWORDS])
    return x

# Register as UDF (User Defined Function)
preprocess_udf = udf(preprocess_text, StringType())

# Read the merchant data
merchants = spark.read.parquet("../data/curated/curated_merchants")

# Apply preprocessing to 'type' column
merchants_cleaned = merchants.withColumn("type_clean", preprocess_udf(merchants['type']))

# Convert the `type_clean` column to an array of words (necessary for CountVectorizer)
merchants_cleaned = merchants_cleaned.withColumn("type_clean", split(merchants_cleaned["type_clean"], " "))

# Now apply CountVectorizer for one-hot encoding
vectorizer = CountVectorizer(inputCol="type_clean", outputCol="type_encoded", binary=True)
vectorized_model = vectorizer.fit(merchants_cleaned)
merchants_vectorized = vectorized_model.transform(merchants_cleaned)

# Show final output
merchants_vectorized.select('merchant_abn', 'type_clean', 'type_encoded').show(truncate=False)

[nltk_data] Downloading package stopwords to /Users/ngoc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ngoc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ngoc/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
                                                                                

+------------+-------------------------------------------------------------------------------+--------------------------------------------------------------+
|merchant_abn|type_clean                                                                     |type_encoded                                                  |
+------------+-------------------------------------------------------------------------------+--------------------------------------------------------------+
|10023283211 |[furniture, home, furnishing, equipment, shop, manufacturer, except, appliance]|(87,[0,5,9,10,27,30,31,33],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]) |
|10142254217 |[cable, satellite, pay, television, radio, service]                            |(87,[2,42,43,44,45,46],[1.0,1.0,1.0,1.0,1.0,1.0])             |
|10187291046 |[watch, clock, jewelry, repair, shop]                                          |(87,[0,12,13,14,15],[1.0,1.0,1.0,1.0,1.0])                    |
|10192359162 |[music, shop, musical, instrument, pia

In [39]:
# Obtain the vocabulary from the CountVectorizer model
vocabulary = vectorized_model.vocabulary

# Extract both 'merchant_abn' and 'type_encoded' (dense vector)
merchant_data = merchants_vectorized.select("merchant_abn", "type_encoded").rdd.map(lambda row: (row.merchant_abn, row.type_encoded.toArray())).collect()

# Create separate lists for 'merchant_abn' and the dense vectors
merchant_abn_list = [row[0] for row in merchant_data]
dense_vectors = [row[1] for row in merchant_data]

# Create a DataFrame for the dense vectors (with columns as vocabulary)
dense_df = pd.DataFrame(dense_vectors, columns=vocabulary)

# Add the 'merchant_abn' column to the DataFrame
dense_df['merchant_abn'] = merchant_abn_list

# Drop the words as they are too general or business related
words_to_drop = ['shop', 'service', 'supply', 'sale']
dense_df = dense_df.drop(columns=[word for word in words_to_drop if word in dense_df.columns])

# Show the resulting DataFrame
dense_df.head()

                                                                                

Unnamed: 0,computer,music,equipment,book,good,nursery,appliance,furniture,repair,jewelry,...,leasing,al,telecom,gallery,dealer,art,antique,restoration,silverware,merchant_abn
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10023283211
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10142254217
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10187291046
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10192359162
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10206519221


In [51]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd

# Assuming dense_df is already created and contains merchant_abn and the dense word vectors
df = dense_df.drop('merchant_abn', axis=1)  # Drop 'merchant_abn' for clustering, keep only word vectors

# Number of clusters 
n_clusters = 12

# Fit K-Means clustering
km = KMeans(n_clusters=n_clusters, random_state=0)
km.fit(df)

# Add the cluster labels to the DataFrame
dense_df['cluster_num'] = km.labels_

# Create final DataFrame with 'merchant_abn' and 'cluster_num'
final_df = dense_df[['merchant_abn', 'cluster_num']]

# Show the final DataFrame
print(final_df)

      merchant_abn  cluster_num
0      10023283211            6
1      10142254217            7
2      10187291046            4
3      10192359162           11
4      10206519221            8
...            ...          ...
3925   99938978285            1
3926   99974311662            7
3927   99976658299            7
3928   99987905597            7
3929   99990536339            7

[3930 rows x 2 columns]


In [52]:
# Group by 'cluster_num' and sum the one-hot encoded word vectors to get word frequencies
cluster_word_freq = dense_df.drop('merchant_abn', axis=1).groupby('cluster_num').sum()

# Transpose the DataFrame for better readability (clusters as rows, words as columns)
cluster_word_freq = cluster_word_freq.T

# Show the most common words in each cluster
for cluster in cluster_word_freq.columns:
    print(f"\nMost common words in cluster {cluster}:")
    top_words = cluster_word_freq[cluster].sort_values(ascending=False).head(10)
    print(top_words)


Most common words in cluster 0:
outlet        153.0
including     153.0
lawn          153.0
garden        153.0
nursery       153.0
newspaper       0.0
office          0.0
stationery      0.0
printing        0.0
writing         0.0
Name: 0, dtype: float64

Most common words in cluster 1:
good          346.0
movie         195.0
book          195.0
music         195.0
digital       195.0
optician      151.0
optical       151.0
eyeglass      151.0
beauty          0.0
stationery      0.0
Name: 1, dtype: float64

Most common words in cluster 2:
toy           142.0
game          142.0
hobby         142.0
computer        0.0
outlet          0.0
lawn            0.0
garden          0.0
office          0.0
stationery      0.0
printing        0.0
Name: 2, dtype: float64

Most common words in cluster 3:
health        164.0
beauty        164.0
spa           164.0
computer        0.0
writing         0.0
including       0.0
lawn            0.0
garden          0.0
office          0.0
stationery      