In [0]:
import mlflow
import mlflow.spark

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, round, abs


In [0]:
%sql
-- Create the mlflow_tmp volume if it is missing. The model-loading cell passes
-- /Volumes/sales_databricks_workspace/ml/mlflow_tmp as dfs_tmpdir to mlflow.spark.load_model.
CREATE VOLUME IF NOT EXISTS sales_databricks_workspace.ml.mlflow_tmp;


In [0]:
# Registry URI of the model to score with; the trailing "/1" pins version 1.
model_uri = "models:/sales_databricks_workspace.ml.daily_sales_forecast_rf/1"

# dfs_tmpdir is the path of the mlflow_tmp volume created by the %sql cell.
model = mlflow.spark.load_model(
    model_uri=model_uri,
    dfs_tmpdir="/Volumes/sales_databricks_workspace/ml/mlflow_tmp",
)


In [0]:
# Source table for scoring; it supplies the feature, target and identifier
# columns used by the cells below.
features_df = spark.table(tableName="sales_databricks_workspace.gold.sales_details")


In [0]:
# Column with the observed daily units; rows missing it are dropped before scoring.
target_col = "DAILY_UNITS_SOLD"

# Input columns assembled into the "features" vector, in this order.
feature_cols = ["ROLLING_7D_UNITS", "ROLLING_30D_UNITS", "LAG_1D_UNITS"]


In [0]:
# Keep only rows where every feature column and the target column are populated
# (a row is dropped if any of the listed columns is missing).
ml_df = features_df.na.drop(subset=[*feature_cols, target_col])


In [0]:
# Pack the individual feature columns into a single vector column named
# "features" and append it to the cleaned DataFrame.
assembler = (
    VectorAssembler()
    .setInputCols(feature_cols)
    .setOutputCol("features")
)

ml_ready_df = assembler.transform(ml_df)


In [0]:
# Score the assembled rows with the loaded model; the report cell below reads
# the resulting "prediction" column.
predictions_df = model.transform(dataset=ml_ready_df)


In [0]:
# Report view: identifiers, actual vs. predicted units and the absolute error,
# ordered by date. `round` and `abs` are the pyspark.sql.functions versions
# imported at the top of the notebook, not the Python builtins.
final_df = predictions_df.select(
    col("ORDER_DATE").alias("Date"),
    col("BRANCH_ID").alias("Branch"),
    col("ITEM_ID").alias("Item"),
    col("DAILY_UNITS_SOLD").alias("Actual_Units"),
    # Prediction rounded to a whole number of units.
    round(col("prediction"), 0).astype("int").alias("Predicted_Units"),
    # Error is taken against the unrounded prediction and kept to 2 decimals.
    round(abs(col("prediction") - col("DAILY_UNITS_SOLD")), 2).alias("Error"),
).sort("Date")


In [0]:
# Render the per-row report in the notebook.
display(final_df)

Date,Branch,Item,Actual_Units,Predicted_Units,Error
2021-01-02,56-AN4,68,2,4,2.0
2021-01-02,661-TR2,12806,2,4,2.0
2021-01-02,56-AN4,501,6,5,1.25
2021-01-02,17-AN3,13179,8,7,0.96
2021-01-02,56-AN4,591,2,2,0.07
2021-01-02,17-AN3,13246,7,6,1.17
2021-01-02,754-SA2,608,4,5,1.22
2021-01-02,133-ME1,14253,7,6,0.72
2021-01-02,421-DI2,854,5,4,0.82
2021-01-02,310-BA2,14342,6,6,0.16


In [0]:
from pyspark.sql.functions import when


In [0]:
# Label each row: an absolute error of at most 2 units is a "Success",
# anything else a "Failure".
performance_df = final_df.select(
    "*",
    when(col("Error") <= 2, "Success").otherwise("Failure").alias("Performance"),
)


In [0]:
# Number of rows per Performance label.
summary_df = performance_df.groupby("Performance").count()
display(summary_df)


Performance,count
Success,29056557
Failure,16909246


In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import sum as spark_sum, col, round

# Share of each Performance label in the total number of scored rows.
#
# The grand total is computed with a window spanning all rows of summary_df,
# so it is evaluated inside the same Spark job that renders the result.
# Collecting the total to the driver first would launch an extra job that
# re-executes the whole uncached lineage behind summary_df (table read, model
# scoring, aggregation) just to obtain one number.
#
# The window has no partitioning, so Spark moves its input to one partition;
# that is harmless here because summary_df holds one row per Performance label.
all_rows = Window.partitionBy()

final_summary_df = (
    summary_df
    .withColumn(
        "Percentage",
        round((col("count") / spark_sum("count").over(all_rows)) * 100, 2)
    )
)

final_summary_df.display()


Performance,count,Percentage
Success,29056557,63.21
Failure,16909246,36.79
