In [0]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler

# Read the cleaned customer table from the catalog and print its schema so the
# column names/types used by the steps below can be checked by eye.
source_table = "default.customer_data_final_cleaned"
df = spark.read.table(source_table)

print("✅ Loaded table: customer_data_final_cleaned")
df.printSchema()

# --------------------------
# 1. Standardize Date Formats
# --------------------------
# Every listed column that exists in the table is replaced, in place, by the
# result of to_date(..., "yyyy-MM-dd"). Listed columns that are absent from
# the table are skipped rather than treated as an error.
date_columns = ["Order_Date", "Signup_Date"]  # change if needed
for date_name in date_columns:
    if date_name not in df.columns:
        continue
    parsed_date = to_date(col(date_name), "yyyy-MM-dd")
    df = df.withColumn(date_name, parsed_date)

# --------------------------
# 2. Convert Categorical Variables
# --------------------------
# For each listed column present in the table, two columns are appended:
#   <name>_Index - numeric label index produced by StringIndexer
#   <name>_Vec   - one-hot vector produced by OneHotEncoder from that index
# The original string column is kept as is.
categorical_columns = ["Customer_City", "Product_Category"]  # change if needed

for category in categorical_columns:
    if category not in df.columns:
        continue

    index_name = f"{category}_Index"
    vector_name = f"{category}_Vec"

    # handleInvalid="keep" routes values the indexer cannot map to an extra
    # index instead of raising during transform.
    indexer = StringIndexer(inputCol=category, outputCol=index_name, handleInvalid="keep")
    index_model = indexer.fit(df)
    df = index_model.transform(df)

    encoder = OneHotEncoder(inputCols=[index_name], outputCols=[vector_name])
    encoder_model = encoder.fit(df)
    df = encoder_model.transform(df)

# --------------------------
# 3. Normalize Numeric Columns
# --------------------------
# Only the listed columns that exist in the table are used. They are cast to
# double, packed into a single "numeric_features" vector, and standardized
# (mean-centered and scaled to unit standard deviation) into
# "numeric_features_scaled".
numeric_columns = ["Order_Amount", "Quantity"]  # change if needed
numeric_columns = [name for name in numeric_columns if name in df.columns]

if not numeric_columns:
    print("⚠️ No numeric columns found for normalization.")
else:
    for name in numeric_columns:
        df = df.withColumn(name, col(name).cast(DoubleType()))

    # NOTE(review): VectorAssembler is left at its default invalid-value
    # handling; presumably the cleaned table has no nulls in these columns -
    # confirm, since a failed cast to double yields null.
    assembler = VectorAssembler(inputCols=numeric_columns, outputCol="numeric_features")
    df = assembler.transform(df)

    scaler = StandardScaler(
        inputCol="numeric_features",
        outputCol="numeric_features_scaled",
        withMean=True,
        withStd=True,
    )
    scaler_model = scaler.fit(df)
    df = scaler_model.transform(df)

# --------------------------
# 4. Show sample output
# --------------------------
# Print the first five rows without truncating cell contents.
df.show(n=5, truncate=False)

# --------------------------
# 5. Save transformed table
# --------------------------
# Replace the contents of the target Delta table with the transformed data.
# NOTE(review): this is a plain overwrite; if the output columns differ from
# an existing table's schema the write may be rejected - confirm whether
# schema replacement is wanted before changing the column lists above.
target_table = "default.customer_data_transformed"
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(target_table)

print("✅ Transformation completed and saved as table: customer_data_transformed")
