# In [None]:
# 🚀 Notebook 06: Model Deployment & Azure Integration

# 1. Import Required Libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml import Pipeline
import os

# 2. Start Spark Session
spark = SparkSession.builder.appName("ModelDeploymentAzureIntegration").getOrCreate()

# 3. Configure Azure Blob Storage access
# The storage account key is a credential and must never live in source
# control. It is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable
# (set it in the cluster configuration, or populate it from a secret store
# such as a Databricks secret scope where one is available).
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be rotated in the Azure portal.
azure_storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not azure_storage_account_key:
    # Fail fast with a clear message instead of a late, opaque auth error
    # when the first wasbs:// write is attempted.
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; "
        "cannot authenticate to Azure Blob Storage."
    )
spark.conf.set(
    "fs.azure.account.key.mehaktrafficstore.blob.core.windows.net",
    azure_storage_account_key,
)

# 4. Load Cleaned Dataset
# The CSV has a header row; column types are inferred by Spark on read.
input_path = "dbfs:/user/mehak/processed/berlin_clean.csv"
spark_df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(input_path)
)

# 5. Define Feature Columns
# Each group is encoded differently by the preprocessing pipeline.
categorical_low = [
    "spatial_type",  # one-hot encoded
]
categorical_high = [
    "name",          # label (index) encoded
    "berlin_bez",    # label (index) encoded
]
numeric_cols = [
    "zahl_tvz",
    "vz_typ_no",
    "lor_prg",
]


# 6. Preprocessing Pipeline for GMM
# Every categorical column is first mapped to a numeric index.
# handleInvalid="keep" routes invalid/unseen labels to an extra index instead
# of raising.
indexers_high = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
    for name in categorical_high
]
indexers_low = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
    for name in categorical_low
]
# Only the low-cardinality columns are expanded into one-hot vectors.
encoders_low = [
    OneHotEncoder(inputCol=name + "_index", outputCol=name + "_vec")
    for name in categorical_low
]

# NOTE(review): the "*_index" values of the high-cardinality columns enter the
# feature vector as plain numbers, so the model sees them as ordered
# magnitudes — confirm this is intended for `name` / `berlin_bez`.
assembler_inputs = (
    numeric_cols
    + [name + "_index" for name in categorical_high]
    + [name + "_vec" for name in categorical_low]
)
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
# Scale each feature to unit standard deviation without centering
# (withMean=False).
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

# 7. GMM Clustering
# A fixed seed makes the fit reproducible. Without it the cluster ids written
# to the exported "prediction" column can be permuted from one run to the
# next, which silently relabels clusters for downstream consumers.
gmm = GaussianMixture(k=5, featuresCol="scaledFeatures", predictionCol="prediction", seed=42)
pipeline = Pipeline(stages=indexers_high + indexers_low + encoders_low + [assembler, scaler, gmm])
gmm_model = pipeline.fit(spark_df)
gmm_result = gmm_model.transform(spark_df)

# 8. Select Final Columns for Export
# Keep the identifying/raw columns plus the cluster assignment only.
export_columns = ["name", "spatial_type", "zahl_tvz", "vz_typ_no", "lor_prg", "prediction"]
export_df = gmm_result.select(*export_columns)
export_df.show(5)

# 9. Define Azure Blob Storage Output Paths
azure_output_path_parquet = "wasbs://traffic-data@mehaktrafficstore.blob.core.windows.net/final_cluster_results_parquet"
azure_output_path_csv = "wasbs://traffic-data@mehaktrafficstore.blob.core.windows.net/final_cluster_results_csv"

# 10. Save to Azure Blob Storage
# Both writes replace whatever a previous run left at the target path.

# Parquet copy (recommended for Power BI)
export_df.write.parquet(azure_output_path_parquet, mode="overwrite")

# CSV copy with a header row (optional, Power BI also supports CSV)
export_df.write.csv(azure_output_path_csv, mode="overwrite", header=True)

# 11. Save Local Copy to DBFS for Download (as CSV)
local_export_dir = "/dbfs/FileStore/tables/"
local_export_file = "/dbfs/FileStore/tables/final_cluster_results.csv"

# Create the target directory if it is not there yet.
os.makedirs(local_export_dir, exist_ok=True)

# Collect the exported rows into a pandas DataFrame and write them out as a
# single CSV file without the index column.
pandas_df = export_df.toPandas()
pandas_df.to_csv(local_export_file, index=False)

print("✅ Export completed: Azure + DBFS local CSV ready.")
