# In [None]:
# ----------------------------------------
# Traffic Data Clustering - GMM with Hyperparameter Tuning 
# ----------------------------------------

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, PCA
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

# Import the model training function from your model file
from model.model import train_gmm_model  # Assuming model.py is in the model folder

# -------------------------------
# Step 0: Initialize Spark Session
# -------------------------------
# getOrCreate() reuses an already-active session, otherwise starts a new one
# under this application name.
session_builder = SparkSession.builder.appName("TrafficDataModelOptimization")
spark = session_builder.getOrCreate()

# Arrow-based conversion is switched off to avoid problems when VectorUDT
# columns are converted.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

# -------------------------------
# Step 1: Load Data
# -------------------------------
# Read the CSV at input_path: the first line is treated as the header and
# column types are inferred from the data.
input_path = "dbfs:/user/mehak/processed/berlin_clean.csv"
csv_reader = spark.read.option("header", True).option("inferSchema", True)
spark_df = csv_reader.csv(input_path)

print("✅ Data Loaded Successfully!")
# Sanity check: show the first rows and the inferred schema.
spark_df.show(5)
spark_df.printSchema()

# -------------------------------
# Step 2: Drop any old feature columns (Safety Check)
# -------------------------------
# Feature columns left over from an earlier run would conflict with the ones
# created later, so remove whichever of them are present.
stale_feature_cols = [
    name
    for name in ('features', 'scaledFeatures', 'input_features', 'scaled_features')
    if name in spark_df.columns
]
if stale_feature_cols:
    spark_df = spark_df.drop(*stale_feature_cols)

# -------------------------------
# Step 3: Use External Model Code to Train the GMM Model
# -------------------------------
# Training is delegated to train_gmm_model (imported from model.model), with k=5.
gmm_model = train_gmm_model(spark_df, k=5)

# -------------------------------
# Step 4: Apply the Model and Generate Predictions
# -------------------------------
# Score the same DataFrame that was passed to training.
gmm_result = gmm_model.transform(spark_df)

print("✅ GMM Model Fitted Successfully!")
# Preview the predicted cluster for the first rows.
preview_columns = ["name", "berlin_bez", "prediction"]
gmm_result.select(*preview_columns).show(10)

# -------------------------------
# Step 5: Manual Hyperparameter Tuning using Silhouette Score
# -------------------------------
# The feature stages (indexers, assembler, scaler) are not defined anywhere in
# this script (presumably they live in model/model.py), so a new pipeline
# cannot be rebuilt here. Each candidate k is instead fitted directly on the
# "scaled_features" column that gmm_result already carries.
# Only that column is selected so the "prediction" column produced by the
# k=5 model cannot collide with the one each candidate model adds.
tuning_df = gmm_result.select("scaled_features")

evaluator = ClusteringEvaluator(featuresCol="scaled_features", predictionCol="prediction")
best_k = None
best_score = float("-inf")  # below any possible score, so the first candidate is always accepted
silhouette_scores = []

k_range = [2, 3, 4, 5, 6, 7]
for k_val in k_range:
    gmm_temp = GaussianMixture(k=k_val, featuresCol="scaled_features", predictionCol="prediction")
    model_temp = gmm_temp.fit(tuning_df)
    result_temp = model_temp.transform(tuning_df)
    score = evaluator.evaluate(result_temp)
    silhouette_scores.append(score)
    print(f"k={k_val}, Silhouette Score = {score}")

    # Strict comparison: on a tie the smaller k (seen first) is kept.
    if score > best_score:
        best_score = score
        best_k = k_val

print(f"✅ Best k based on Silhouette Score: {best_k}")

# Plot Silhouette Scores for different k values
# One marker per candidate k, with the x ticks pinned to the tested values.
fig_sil, ax_sil = plt.subplots(figsize=(8, 6))
ax_sil.plot(k_range, silhouette_scores, marker='o')
ax_sil.set_title("Silhouette Score for Different k (GMM)")
ax_sil.set_xlabel("Number of Clusters (k)")
ax_sil.set_ylabel("Silhouette Score")
ax_sil.set_xticks(k_range)
ax_sil.grid(True)
plt.show()

# -------------------------------
# Step 6: PCA for Visualization (3D)
# -------------------------------
# Reduce the scaled feature vectors to three principal components.
pca = PCA(k=3, inputCol="scaled_features", outputCol="pca_features")
pca_model = pca.fit(gmm_result)
pca_df = pca_model.transform(gmm_result)

# Collect PCA data into Pandas
# Only the projected vector and the cluster label are brought to the driver.
# The column is renamed on the DataFrame itself, so no RDD round-trip is needed.
# NOTE(review): the RDD API may be unavailable on some cluster access modes — confirm.
plot_df = (
    pca_df.select("pca_features", "prediction")
    .withColumnRenamed("pca_features", "pcaArray")
    .toPandas()
)

# Extract x, y, z for 3D plotting
for axis_index, axis_name in enumerate(("x", "y", "z")):
    # axis_index is bound as a default so each lambda keeps its own component index.
    plot_df[axis_name] = plot_df["pcaArray"].apply(
        lambda vec, i=axis_index: float(vec[i])
    )

# 3D Plot
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
# Iterate cluster ids in sorted order so the legend order and colour assignment
# do not depend on the order in which rows were collected.
for cluster_id in sorted(plot_df["prediction"].unique()):
    subset = plot_df[plot_df["prediction"] == cluster_id]
    ax.scatter(subset["x"], subset["y"], subset["z"], label=f"Cluster {cluster_id}", alpha=0.6)
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.set_zlabel('PCA Feature 3')
ax.set_title("GMM Clusters (3D PCA Projection)")
ax.legend()
plt.show()

# -------------------------------
# Step 7: Cluster Summary
# -------------------------------
# Mean of each listed column within every predicted cluster.
summary_columns = ["zahl_tvz", "vz_typ_no", "lor_prg"]
rows_by_cluster = gmm_result.groupBy("prediction")
cluster_summary = rows_by_cluster.mean(*summary_columns)
cluster_summary.show()

# -------------------------------
# Step 8: Visualize Cluster Characteristics
# -------------------------------
# Collect only the columns that are actually plotted. Converting the whole
# result would also pull every other column (including the feature vectors)
# onto the driver for no benefit.
pdf = gmm_result.select("prediction", "zahl_tvz", "vz_typ_no").toPandas()

# One box plot per metric, grouped by predicted cluster.
for metric_col, plot_title in (
    ("zahl_tvz", "Traffic Volume Distribution by Cluster"),
    ("vz_typ_no", "Vehicle Type Distribution by Cluster"),
):
    sns.boxplot(x="prediction", y=metric_col, data=pdf)
    plt.title(plot_title)
    plt.show()

# Persist the clustered rows as a Delta table, overwriting any existing contents.
output_table = "traffic_databricks_ws.default.traffic_data_gmm_clusters"
cluster_writer = gmm_result.write.format("delta").mode("overwrite")
cluster_writer.saveAsTable(output_table)
