In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's Spark session (getOrCreate) under the application
# name "MapsaBigData".
spark = (
    SparkSession.builder
    .appName("MapsaBigData")
    .getOrCreate()
)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
# Build the training/test data and an (unfitted) ML pipeline that predicts
# the "price" column with a random forest.

# Parquet dataset location, relative to the current working directory.
filePath = "./databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet"

airbnbDF = spark.read.parquet(filePath)

# Seeded 80/20 split into training and test sets.
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

# Every string-typed column is treated as categorical and mapped to a
# numeric index column named "<column>Index".
categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

# Every double-typed column except the label ("price") is a numeric feature.
# Plain `and` is used here: both operands are Python booleans, so the
# short-circuit logical operator is the right tool (not bitwise `&`).
numericCols = [
    field
    for (field, dataType) in trainDF.dtypes
    if dataType == "double" and field != "price"
]

# Feature vector = indexed categorical columns followed by numeric columns.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# maxBins=40 — NOTE(review): presumably chosen to be at least the largest
# categorical cardinality in the data; confirm against the dataset.
rf = RandomForestRegressor(labelCol="price", maxBins=40, maxDepth=5, numTrees=100, seed=42)

# Stage order matters: index strings -> assemble features -> fit regressor.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

In [4]:
pip install mlflow

Collecting mlflow
  Using cached mlflow-1.28.0-py3-none-any.whl (17.0 MB)
Collecting gunicorn<21
  Using cached gunicorn-20.1.0-py3-none-any.whl (79 kB)
Collecting sqlparse<1,>=0.4.0
  Using cached sqlparse-0.4.2-py3-none-any.whl (42 kB)
Collecting prometheus-flask-exporter<1
  Using cached prometheus_flask_exporter-0.20.3-py3-none-any.whl (18 kB)
Collecting databricks-cli<1,>=0.8.7
  Using cached databricks_cli-0.17.3-py3-none-any.whl
Collecting Flask<3
  Using cached Flask-2.2.2-py3-none-any.whl (101 kB)
Collecting docker<6,>=4.0.0
  Using cached docker-5.0.3-py2.py3-none-any.whl (146 kB)
Collecting querystring-parser<2
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting gitpython<4,>=2.1.0
  Using cached GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting tabulate>=0.7.7
  Using cached tabulate-0.8.10-py3-none-any.whl (29 kB)
Collecting Werkzeug>=2.2.2
  Using cached Werkzeug-2.2.2-py3-none-any.whl (232 kB)
Collecting itsdangerous>=2.0
  Using cached i

In [5]:
import mlflow
import mlflow.spark
import pandas as pd

# Fit the pipeline inside a single MLflow run and record, for that run:
#   - params:   number of trees and max depth of the random forest
#   - model:    the fitted Spark pipeline, under the artifact path "model"
#   - metrics:  RMSE and R2 on the held-out test set
#   - artifact: per-feature importance scores as a CSV file
# The run name matches the estimator that is actually trained (a random
# forest), so the run is labelled correctly in the tracking UI.
with mlflow.start_run(run_name="random-forest") as run:
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: feature importance scores
    # The last pipeline stage is the fitted random-forest model.
    rfModel = pipelineModel.stages[-1]
    # Convert the Spark vector to a dense array explicitly so it pairs
    # position-by-position with the assembler's input column names.
    pandasDF = (
        pd.DataFrame(
            list(zip(vecAssembler.getInputCols(),
                     rfModel.featureImportances.toArray())),
            columns=["feature", "importance"],
        )
        .sort_values(by="importance", ascending=False)
    )

    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("feature-importance.csv")

