In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
# Relative path to the Parquet dataset that the next cell loads.
file_path = "./data/sf-airbnb/sf-airbnb-clean.parquet"

In [6]:
# Load the Parquet dataset at file_path into a Spark DataFrame.
airbnbDF = spark.read.format("parquet").load(file_path)

                                                                                

In [7]:
# 80/20 train/test split, seeded with 42.
trainDF, testDF = airbnbDF.randomSplit(weights=[0.8, 0.2], seed=42)

In [8]:
# Names of all string-typed columns; these are the categorical features.
# DataFrame.dtypes reports type names in lowercase ("string", "double", ...),
# so the comparison must use "string" -- "String" never matches and would
# leave this list empty.
categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]

In [9]:
# One output column name per categorical column: "<name>Index".
indexOutputCols = [f"{colName}Index" for colName in categoricalCols]

In [10]:
# Indexer stage: maps each categorical column to its "<name>Index" column,
# with handleInvalid set to "skip".
stringIndexer = StringIndexer(
    handleInvalid="skip",
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
)

In [11]:
# Every double-typed column except "price", which is kept out of the features.
numericCols = [
    name
    for name, kind in trainDF.dtypes
    if kind == "double" and name != "price"
]

In [12]:
# Feature columns fed to the assembler: indexed categoricals first, then numerics.
assemblerInputs = [*indexOutputCols, *numericCols]

In [13]:
# Assembler stage: packs all input columns into one "features" vector column.
vecAssembler = VectorAssembler(
    outputCol="features",
    inputCols=assemblerInputs,
)

In [14]:
# Random forest regressor predicting "price"; hyperparameters are fixed here
# and the seed is pinned to 42.
rf = RandomForestRegressor(
    labelCol="price",
    numTrees=100,
    maxDepth=5,
    maxBins=40,
    seed=42,
)

In [15]:
# Full pipeline: index categoricals -> assemble feature vector -> random forest.
pipeline = Pipeline(
    stages=[
        stringIndexer,
        vecAssembler,
        rf,
    ]
)

In [18]:
import mlflow

In [19]:
import mlflow.spark

In [20]:
import pandas as pd

In [None]:
# Train the pipeline inside a single MLflow run named "random-forest" and
# record its hyperparameters, the fitted model, and test-set metrics.
with mlflow.start_run(run_name = "random-forest") as run:
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model: fit all pipeline stages on the training split, then log the
    # fitted PipelineModel under the artifact path 'model'.
    pipeline_model = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipeline_model, 'model')

    # Log metrics: RMSE and R2
    # A fitted PipelineModel is a transformer and has no fit(); predictions on
    # the test split come from transform(). The evaluator below reads the
    # "prediction" and "price" columns of the result.
    predDF = pipeline_model.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol = "prediction",
                                             labelCol = "price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: feature importance scores
    # The last pipeline stage is the fitted random forest model.
    rfModel = pipeline_model.stages[-1]
    # Pair each assembler input column with its importance score, highest first.
    # Assumes every assembler input contributes exactly one slot of the feature
    # vector, so names and scores line up -- TODO confirm.
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                     rfModel.featureImportances)),
                            columns = ["feature", "importance"])
               .sort_values(by = "importance", ascending = False))
    # NOTE(review): pandasDF is built but not written or logged in the visible
    # part of this cell -- confirm it is actually logged as an artifact.
    
    