# K-Means segment ranking

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd

In [None]:
# Start (or reuse) the Spark session used by every cell below.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "10g",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.network.timeout": "600s",
}

session_builder = SparkSession.builder.appName("Categories")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# getOrCreate() hands back an already-running session if there is one.
spark = session_builder.getOrCreate()

In [3]:
from pyspark.ml.feature import CountVectorizer, PCA, StopWordsRemover
from pyspark.sql.functions import lower, regexp_replace
from pyspark.ml.feature import Tokenizer

In [4]:
# Read the curated merchant table and keep a one-column view of its 'goods' field.
merchant_path = "../data/curated/part_1/clean_merchant.parquet"
merchant = spark.read.parquet(merchant_path)
goods = merchant.select(merchant["goods"])

In [None]:
# Number of rows in the merchant table, shown for reference.
merchant.count()

In [None]:
# Turn the raw 'goods' description into a list of meaningful lowercase tokens.

# Step 1: Strip every character that is neither a word character nor whitespace,
# then lowercase. The pattern is a raw string so that \w and \s reach the regex
# engine untouched instead of being parsed as (invalid) Python string escapes.
clean_goods = goods.withColumn("str_goods", lower(regexp_replace("goods", r"[^\w\s]", "")))

# Step 2: Tokenize the cleaned 'str_goods' column
tokenizer = Tokenizer(inputCol="str_goods", outputCol="tokens")
clean_goods = tokenizer.transform(clean_goods)

# Step 3: Get default stop words from StopWordsRemover
default_stopwords = StopWordsRemover.loadDefaultStopWords("english")

# Step 4: Add custom stop words to the list.
# The empty string is listed so that empty tokens are dropped as well
# (presumably left behind by consecutive whitespace once punctuation is removed).
additional_stopwords = ['new', 'shops', 'supplies', 'parts', 'services', '',
                        'supply', 'integrated', 'equipment', 'sales',
                        'dealers', 'restoration']
all_stopwords = default_stopwords + additional_stopwords

# Step 5: Remove stop words, including the additional custom words
remover = StopWordsRemover(inputCol="tokens", outputCol="clean_goods", stopWords=all_stopwords)
clean_goods = remover.transform(clean_goods)

# Step 6: Select only the 'clean_goods' column
clean_goods = clean_goods.select('clean_goods')

# Show the results
clean_goods.show(truncate=False)


In [None]:
# Preview the first 5 token lists without truncating long values.
clean_goods.show(5, truncate=False)

In [None]:
# Attach the cleaned tokens to the full merchant table.
#
# This cell used to give `merchant` and `clean_goods` separate
# monotonically_increasing_id() columns and join on them. Those ids encode the
# partition id and the row position inside the partition, so they only line up
# when both DataFrames happen to be partitioned and ordered identically. Spark
# does not guarantee that, and a mismatch silently pairs a merchant with another
# merchant's tokens. Running the same cleaning stages directly on `merchant`
# keeps every row attached to its own tokens and needs no join at all.
#
# `tokenizer` and `remover` are the fitted-free transformers defined in the
# goods-cleaning cell above; `lower` / `regexp_replace` come from the import cell.
goods_pattern = r"[^\w\s]"  # same punctuation filter used to build `clean_goods`
merchant_tokens = tokenizer.transform(
    merchant.withColumn("str_goods", lower(regexp_replace("goods", goods_pattern, "")))
)

# Drop the intermediate columns so the result is: merchant columns + 'clean_goods'.
df_combined = remover.transform(merchant_tokens).drop("str_goods", "tokens")

# Show the combined DataFrame
df_combined.show(truncate=False)

In [None]:
from pyspark.ml.feature import Word2Vec

# Embed each merchant's token list as a 10-dimensional vector.
#
# The seed is fixed so that the embeddings, and therefore the clusters built on
# top of them later in the notebook, are the same on every run. Without it the
# cluster ids that the segment mapping hard-codes can change between runs.
# NOTE(review): after introducing the seed, re-check the cluster -> segment
# mapping once against the clusters this seed produces.

# Step 1: Initialize the Word2Vec model
word2vec = Word2Vec(
    vectorSize=10,
    inputCol="clean_goods",
    outputCol="clean_goods_word2vec",
    seed=42,
)

# Step 2: Train the Word2Vec model
model = word2vec.fit(df_combined)

# Step 3: Transform the 'clean_goods' column into word vectors
df_word2vec = model.transform(df_combined)

# Show the resulting DataFrame with word vectors
df_word2vec.select("clean_goods", "clean_goods_word2vec").show(truncate=False)

# K-Means Clustering

In [None]:
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans

# Elbow method: fit K-Means for a range of k and compare the training cost.

# Candidate cluster counts: 2 through 15 inclusive. Defined once so the loop
# and the plot's x-axis cannot drift apart.
k_values = range(2, 16)

# Step 1: Create a list to store the SSE values for each k
sse = []

# Step 2: Fit K-Means for every candidate k.
# A fixed seed makes the curve reproducible and keeps the comparison between
# different k values from being blurred by random initialisation.
for k in k_values:
    kmeans = KMeans(featuresCol="clean_goods_word2vec", predictionCol="cluster", k=k, seed=42)
    kmeans_model = kmeans.fit(df_word2vec)

    # Compute the Sum of Squared Errors (SSE) and append to the list
    sse.append(kmeans_model.summary.trainingCost)

# Step 3: Plot the SSE against k to find the "elbow" point
plt.figure(figsize=(10,6))
plt.plot(list(k_values), sse, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method For Optimal k')
plt.show()


In [None]:
from pyspark.ml.clustering import KMeans

# Fit the final clustering with k=9 clusters.
#
# The seed is fixed because a later cell maps specific cluster ids (0-8) to
# named segments; with random initialisation those ids would not mean the same
# thing from one run to the next.
# NOTE(review): re-check the cluster -> segment mapping once against the
# clusters this seed produces.
kmeans = KMeans(featuresCol="clean_goods_word2vec", predictionCol="cluster", k=9, seed=42)
kmeans_model = kmeans.fit(df_word2vec)

# Make predictions (assign clusters to the data points)
df_clusters = kmeans_model.transform(df_word2vec)

# Show the resulting DataFrame with clusters
df_clusters.select("clean_goods", "clean_goods_word2vec", "cluster").show(truncate=False)

In [None]:
from pyspark.sql.functions import explode, col, desc, count
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Summarise each cluster: its most frequent token and how many merchants it holds.

# Step 1: Explode the 'clean_goods' column to individual clean_good entries
df_exploded = df_clusters.withColumn("clean_good", explode(col("clean_goods")))

# Step 2: Group by 'cluster' and 'clean_good', then count occurrences of each clean_good
df_grouped = df_exploded.groupBy("cluster", "clean_good").count()

# Step 3: Rank clean_goods within each cluster based on count.
# Ordering by count alone lets row_number() pick an arbitrary winner when two
# tokens are tied, so the token itself is used as a tie-breaker to make the
# reported "most common" token stable across runs.
window = Window.partitionBy("cluster").orderBy(desc("count"), col("clean_good").asc())
df_ranked = df_grouped.withColumn("rank", row_number().over(window))

# Step 4: Filter to keep only the most common clean_good per cluster
df_most_common_good = df_ranked.filter(col("rank") == 1).drop("rank")

# Step 5: Show the most common clean_good for each cluster
df_most_common_good.select("cluster", "clean_good", "count").show(truncate=False)

# Step 6: Count the number of records (merchants) in each cluster
df_cluster_count = df_clusters.groupBy("cluster").agg(count("*").alias("merchant_count"))

# Step 7: Show the number of records/merchants in each cluster
df_cluster_count.show(truncate=False)


In [None]:
# Number of rows in the per-cluster "most common token" table.
df_most_common_good.count()

In [None]:
# Inspect up to 20 rows assigned to cluster 3, without truncating long values.
# `col` is the pyspark.sql.functions.col imported in an earlier cell.
df_clusters.filter(col('cluster')==3).show(20, truncate=False)

In [None]:
from pyspark.sql.functions import when, col

# Cluster id -> business segment. The trailing note on each entry is the
# keyword recorded for that cluster when the mapping was written.
segment_by_cluster = {
    0: "Entertainment & Media",   # music
    1: "Office & Home Supplies",  # office
    2: "Office & Home Supplies",  # furniture
    3: "Miscellaneous",           # novelty
    4: "Fashion",                 # jewelry
    5: "Entertainment & Media",   # television
    6: "Miscellaneous",           # shoe
    7: "Miscellaneous",           # craft
    8: "Technology",              # computer
}

# Fold the lookup table into a single when(...).when(...) expression.
# There is no otherwise() branch, so a cluster id missing from the table
# yields a null segment.
segment_expr = None
for cluster_id, segment_label in segment_by_cluster.items():
    is_cluster = col("cluster") == cluster_id
    if segment_expr is None:
        segment_expr = when(is_cluster, segment_label)
    else:
        segment_expr = segment_expr.when(is_cluster, segment_label)

df_segmented = df_clusters.withColumn("segment", segment_expr)

# Display the segment assigned to each row alongside its cluster and tokens
df_segmented.select("cluster", "clean_goods", "segment").show(truncate=False)


In [35]:
# Persist segment, ABN and goods for every row, replacing any previous output.
segmented_output_path = "../data/curated/clean_merchant_segmented.parquet"
segmented_columns = ["segment", "merchant_abn", "goods"]
(
    df_segmented
    .select(*segmented_columns)
    .write
    .mode('overwrite')
    .parquet(segmented_output_path)
)

## Ranking with respect to each segment

In [36]:
# Load the existing merchant ranking and the segmented merchants written by the previous cell.
# NOTE(review): the ranking is presumably produced by another notebook and is expected
# to carry 'merchant_abn' and 'final_score' columns (both are used further down) - confirm.
initial_ranking = spark.read.parquet("../data/curated/merchant_ranking")
merchant_segment = spark.read.parquet("../data/curated/clean_merchant_segmented.parquet")

In [37]:
# Join each merchant in initial ranking by segment
# Left join on 'merchant_abn': every ranked merchant is kept, and merchants with
# no match in merchant_segment get null 'segment' / 'goods'.
merchant_segment_ranking = initial_ranking.join(merchant_segment, how='left', on='merchant_abn')

In [None]:
# NOTE(review): this statement is identical to the parquet write a few cells
# earlier in this notebook and rewrites the same output; it looks like a
# leftover duplicate cell - confirm and remove one of the two.
df_segmented.select("segment", "merchant_abn", "goods").write.mode('overwrite').parquet("../data/curated/clean_merchant_segmented.parquet")

## Ranking with respect to each segment

In [None]:
# NOTE(review): these two reads repeat, unchanged, the load performed a few
# cells earlier in this notebook - likely a leftover duplicate cell.
initial_ranking = spark.read.parquet("../data/curated/merchant_ranking")
merchant_segment = spark.read.parquet("../data/curated/clean_merchant_segmented.parquet")

In [None]:
# Join each merchant in initial ranking by segment
# NOTE(review): same left join on 'merchant_abn' as the one a few cells earlier
# in this notebook - likely a leftover duplicate cell.
merchant_segment_ranking = initial_ranking.join(merchant_segment, how='left', on='merchant_abn')

In [None]:
# Preview the ranking joined with each merchant's segment.
merchant_segment_ranking.show()

In [None]:
# Segment labels to report on, one top-10 table per label.
segments = [
    "Entertainment & Media",
    "Office & Home Supplies",
    "Miscellaneous",
    "Fashion",
    "Technology",
]

for segment in segments:
    print(segment)
    # Merchants belonging to the current segment only
    segment_ranking = merchant_segment_ranking.where(F.col('segment') == segment)
    # Ten highest final scores within the segment
    top_by_score = segment_ranking.sort(F.desc('final_score'))
    top_by_score.show(10)