### Customer Segmentation Model using K-Means Clustering
This notebook uses machine learning (K-Means clustering) to segment customers based on their purchase patterns, such as order frequency, total spend, and item count. The resulting segments allow us to identify high-value customers, analyze their behavior, and build targeted strategies for retention and upselling.

In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import *

In [0]:
# Load the aggregated order summary that feeds the segmentation model.
# NOTE(review): the table name suggests a monthly grain — confirm whether rows
# are one-per-customer or one-per-customer-per-month before interpreting the
# resulting clusters as customer segments.
SOURCE_TABLE = "democatalog.gold.order_summary_monthly"
gold_df = spark.table(SOURCE_TABLE)
display(gold_df)

In [0]:
# Numeric purchase metrics used as clustering inputs.
feature_cols = ["total_orders", "total_items_bought", "total_amount_spent"]

# Pack the feature columns into the single vector column that Spark ML
# estimators expect. VectorAssembler is already imported in the notebook's
# import cell, so it is not re-imported here.
# NOTE(review): VectorAssembler's default handleInvalid="error" raises on null
# feature values — confirm the source columns are non-null.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_vector")
df_vector = assembler.transform(gold_df)


In [0]:
# Put every feature on a comparable scale before clustering so that a
# large-magnitude column (e.g. amount spent) does not dominate the distance
# computation.
scaler_params = {
    "inputCol": "features_vector",
    "outputCol": "scaled_features",
    "withStd": True,    # divide each feature by its standard deviation
    "withMean": False,  # do not centre the features
}
scaler = StandardScaler(**scaler_params)

# Fit the scaling statistics on the assembled data, then apply them to it.
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)


In [0]:
# Cluster the scaled feature vectors into 3 segments.
# An explicit seed is set because K-Means initialisation is random: without it
# the centroids, and the integer label each segment receives, can differ from
# one run to the next, which would make the persisted `customer_segment`
# values unstable across notebook runs on the same data.
kmeans = KMeans(
    featuresCol="scaled_features",
    predictionCol="customer_segment",
    k=3,      # number of clusters (customer segments)
    seed=42,  # fixed for repeatable cluster assignments
)
model = kmeans.fit(df_scaled)

# Append the predicted segment id to every input row.
clustered_df = model.transform(df_scaled)
display(clustered_df)


In [0]:
# Show the centroid of each cluster. The model was fitted on `scaled_features`,
# so these coordinates are in scaled units, not the original column units.
centers = model.clusterCenters()
for idx in range(len(centers)):
    print(f"Cluster {idx}: {centers[idx]}")


In [0]:
# Persist the segmented rows as a Delta table, replacing any previous contents.
# NOTE(review): the target name is not catalog/schema-qualified (unlike the
# source table), so it resolves against the session's current catalog and
# schema — confirm that is the intended location.
segments_writer = clustered_df.write.format("delta").mode("overwrite")
segments_writer.saveAsTable("gold_customer_segments")