In [2]:
import os
import tempfile

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

In [5]:
# Relative path of the Parquet dataset that is loaded below.
file_path = "./data/sf-airbnb/sf-airbnb-clean.parquet"

In [6]:
# Read the Parquet file at `file_path` into a DataFrame.
# NOTE(review): `spark` is not created in any visible cell — presumably the
# kernel/session provides it; confirm so Restart & Run All keeps working.
airbnbDF = spark.read.parquet(file_path)

                                                                                

In [7]:
# Split the data with weights 0.8 / 0.2 into training and test sets, using a fixed seed.
trainDF, testDF = airbnbDF.randomSplit([0.8, 0.2], seed=42)

In [27]:
# Names of all columns whose dtype is reported as "string"; these are the
# columns later passed to the StringIndexer.
categoricalCols = [
    colName
    for colName, colType in trainDF.dtypes
    if colType == "string"
]

In [28]:
# Output column name for each categorical column: the original name + "Index".
indexOutputCols = [f"{colName}Index" for colName in categoricalCols]

In [29]:
# Index every categorical column into its matching "<name>Index" column.
# NOTE(review): handleInvalid="skip" presumably drops rows whose category was
# not seen while fitting, which would also exclude them from the test-set
# metrics — confirm this is intended.
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

In [30]:
# Numeric feature columns: every column of dtype "double" except "price",
# which is used as the label and therefore must not be a feature.
# Uses the logical `and` (short-circuiting) rather than bitwise `&`, which only
# worked here because both operands were parenthesised plain booleans.
numericCols = [
    field
    for (field, dataType) in trainDF.dtypes
    if dataType == "double" and field != "price"
]

In [31]:
# Inputs for the vector assembler: indexed categorical columns first,
# followed by the numeric columns.
assemblerInputs = [*indexOutputCols, *numericCols]

In [32]:
# Combine all input columns into a single column named "features".
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

In [33]:
# Random forest regressor with "price" as the label column; all
# hyperparameters, including the seed, are fixed here.
rf = RandomForestRegressor(
    labelCol="price",
    maxBins=40,
    maxDepth=5,
    numTrees=100,
    seed=42,
)

In [34]:
# Chain the stages in order: string indexing, vector assembly, regressor.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

In [35]:
import mlflow

In [36]:
import mlflow.spark

In [37]:
import pandas as pd

In [38]:
# Fit the pipeline inside one MLflow run named "random-forest" and record:
#   * params   - num_trees and max_depth read back from the regressor
#   * model    - the fitted pipeline, under the artifact path "model"
#   * metrics  - RMSE and R2 computed on testDF
#   * artifact - feature_importance.csv (feature name / importance pairs)
# `run` stays bound after the block so later cells can read run.info.
with mlflow.start_run(run_name="random-forest") as run:
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model
    pipeline_model = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipeline_model, "model")

    # Log metrics: RMSE and R2
    predDF = pipeline_model.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="price")
    # The same evaluator is reused; only its metric name is switched between calls.
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: feature importance scores
    # The regressor is the last pipeline stage. Convert the importance vector
    # to a dense array explicitly instead of relying on implicit iteration.
    rfModel = pipeline_model.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                      rfModel.featureImportances.toArray())),
                             columns=["feature", "importance"])
                .sort_values(by="importance", ascending=False))

    # MLflow logs artifacts from local files, so stage the CSV in a temporary
    # directory (removed on exit) rather than leaving it in the working
    # directory. The artifact keeps the name "feature_importance.csv".
    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "feature_importance.csv")
        pandasDF.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path)

In [39]:
from mlflow.tracking import MlflowClient

In [40]:
# Tracking client used below to search the logged runs.
client = MlflowClient()

In [41]:
# Fetch the single most recently started run of the experiment that `run`
# belongs to. The experiment ID is passed as a one-element list, matching the
# documented `experiment_ids` parameter instead of relying on string coercion.
runs = client.search_runs(
    [run.info.experiment_id],
    order_by=["attributes.start_time desc"],
    max_results=1,
)

In [42]:
# ID of the first (newest, given the descending start-time order) run found.
run_id = runs[0].info.run_id

In [44]:
# Display the run ID as the cell output.
run_id

'54f5ec008779403e84ffdc1894d56d14'

In [43]:
# Display the metrics stored for the retrieved run.
runs[0].data.metrics

{'r2': 0.22794251914574226, 'rmse': 211.5096898777315}

In [49]:
# Load the Spark model back from the "model" artifact of run `run_id`.
# NOTE(review): this rebinds `pipeline_model`, so the fitted in-memory model
# from the training cell is replaced by the reloaded one.
pipeline_model = mlflow.spark.load_model(f"runs:/{run_id}/model")

2023/12/02 07:46:49 INFO mlflow.spark: 'runs:/54f5ec008779403e84ffdc1894d56d14/model' resolved as 'file:///Users/jiashu/Documents/StudyNotes/spark/examples/mlruns/0/54f5ec008779403e84ffdc1894d56d14/artifacts/model'
2023/12/02 07:46:49 INFO mlflow.spark: URI 'runs:/54f5ec008779403e84ffdc1894d56d14/model/sparkml' does not point to the current DFS.
2023/12/02 07:46:49 INFO mlflow.spark: File 'runs:/54f5ec008779403e84ffdc1894d56d14/model/sparkml' not found on DFS. Will attempt to upload the file.
