In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans

# Read the gold Delta dataset from S3.
gold_path = "s3://customer-seg-project/gold_delta/"
df_gold = spark.read.load(gold_path, format="delta")

# Numeric columns whose missing values are replaced with 0 below.
numeric_cols = [
    "total_spent", "avg_transaction_value", "fraud_rate",
    "total_transactions", "unique_channels",
]

# Only the listed columns are filled; every other column of df_gold is passed
# through unchanged.
df_ml = df_gold.na.fill(dict.fromkeys(numeric_cols, 0))

# The clustering features are exactly the columns that were zero-filled when
# df_ml was built, so derive the list from numeric_cols instead of repeating the
# literal. Keeping one source of truth guarantees that every assembled column
# has had its nulls replaced: VectorAssembler's default handleInvalid="error"
# makes transform() fail on any null it encounters.
feature_cols = list(numeric_cols)

# Pack the feature columns, in list order, into a single vector column named
# "features_raw".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
df_vector = assembler.transform(df_ml)

# Scale every component of "features_raw" to unit standard deviation and write
# the result to "features". Centering is switched off (withMean=False), so
# values are divided by the standard deviation but not shifted.
scaler = (
    StandardScaler()
    .setInputCol("features_raw")
    .setOutputCol("features")
    .setWithStd(True)
    .setWithMean(False)
)

# The scaling statistics are fitted on, and then applied to, the same DataFrame.
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

# K-means over the scaled "features" vector: 4 clusters, fixed seed (42) so
# repeated runs start from the same initialisation. The assigned cluster index
# is written to the "customer_segment" column.
kmeans = (
    KMeans()
    .setK(4)
    .setSeed(42)
    .setFeaturesCol("features")
    .setPredictionCol("customer_segment")
)

# Fit on the scaled data, then label that same data with its cluster index.
model = kmeans.fit(df_scaled)
df_clusters = model.transform(df_scaled)

# Keep the original gold columns plus the scaled feature vector and the cluster
# label; the intermediate "features_raw" vector is not selected.
columns_to_keep = [*df_gold.columns, "features", "customer_segment"]
df_clusters = df_clusters.select(columns_to_keep)

# Persist the labelled customers as a Delta table, replacing any existing
# contents of `final-dataset`.
(
    df_clusters.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("`final-dataset`")
)

# Preview the first 10 labelled rows.
display(df_clusters.limit(10))

customer_type,customer_age_group,city,merchant_cat,total_spent,total_transactions,unique_channels,fraud_transactions,avg_transaction_value,fraud_rate,features,customer_segment
Occasional,36-50,Pune,Electronics,7088.332805285103,344,4,11,20.61,0.032,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.622915023075894"",""6.530251160213636"",""4.572554997077528"",""0.6918043459777511"",""0.0""]}",0
Premium,36-50,Chennai,Pharmacy,14309.144757095182,471,4,11,30.38,0.0234,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.2574721703129377"",""9.625862700014084"",""3.3436808416129424"",""0.9472088574288394"",""0.0""]}",3
Regular,51+,Mumbai,Entertainment,11668.443589831237,606,4,13,19.25,0.0215,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.0254102068401885"",""6.099336964294639"",""3.072185388661464"",""1.2187018420422011"",""0.0""]}",0
Premium,18-25,Bengaluru,Pharmacy,9860.33951918882,486,4,14,20.29,0.0288,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.8665159760207686"",""6.428859584703283"",""4.115299497369775"",""0.9773747446081018"",""0.0""]}",0
Premium,26-35,Bengaluru,Grocery,19461.512148692917,985,4,30,19.76,0.0305,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.7102566459853512"",""6.260929787764264"",""4.358216481589519"",""1.9808932581048977"",""0.0""]}",0
Regular,51+,Pune,Fuel,4431.864992132613,212,4,4,20.91,0.0189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.3894675038092395"",""6.625305762254592"",""2.700665295148915"",""0.4263445388002419"",""0.0""]}",2
Regular,36-50,Pune,BillPay,12948.750004905549,573,4,7,22.6,0.0122,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.137922150339172"",""7.160780020418641"",""1.7432865926358077"",""1.1523368902478237"",""0.0""]}",2
Occasional,36-50,Delhi,Dining,16022.32689561621,687,4,13,23.32,0.0189,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.4080247643664152"",""7.388911065316933"",""2.700665295148915"",""1.381597632810218"",""0.0""]}",3
Occasional,26-35,Delhi,BillPay,19552.67214154624,939,4,20,20.82,0.0213,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.7182676875957974"",""6.5967893816423055"",""3.0436069199297298"",""1.8883845374218262"",""0.0""]}",0
Regular,51+,Kolkata,Fashion,5133.316806778887,182,4,3,28.21,0.0165,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.45111033087588376"",""8.938301078584507"",""2.3577236703681006"",""0.36601276444171715"",""0.0""]}",3
