# Imports

In [1]:
import tempfile
from pathlib import Path

import mlflow
import mlflow.spark
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session named 'MLflow', or reuse one that is already running.
spark = SparkSession.builder.appName('MLflow').getOrCreate()
# Bare expression so the notebook renders the session summary.
spark

# Data

In [4]:
# Directory holding the input datasets; the path is relative, so it is resolved
# against the notebook's current working directory.
DATA_PATH = Path('../data')
# IPython `%ls` magic: `{DATA_PATH}` is interpolated from the Python variable above,
# and the directory listing is shown as the cell output.
%ls {DATA_PATH}

2015-summary.csv         [1m[36msf-airbnb-clean.parquet[m[m/ sf-airbnb.csv


In [5]:
# Read the cleaned Airbnb listings (parquet) into a Spark DataFrame.
df = spark.read.parquet(DATA_PATH.joinpath('sf-airbnb-clean.parquet').as_posix())
# Display the number of rows loaded.
df.count()

7146

In [6]:
# Show the column names of the loaded DataFrame.
df.columns

['host_is_superhost',
 'cancellation_policy',
 'instant_bookable',
 'host_total_listings_count',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'minimum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'price',
 'bedrooms_na',
 'bathrooms_na',
 'beds_na',
 'review_scores_rating_na',
 'review_scores_accuracy_na',
 'review_scores_cleanliness_na',
 'review_scores_checkin_na',
 'review_scores_communication_na',
 'review_scores_location_na',
 'review_scores_value_na']

In [7]:
# Random 80/20 train/test split; the fixed seed makes the split repeatable.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)
# Display the sizes of both partitions as a (train, test) tuple.
(train_df.count(), test_df.count())

(5780, 1366)

In [8]:
# Partition the columns by Spark dtype: string columns are treated as categorical,
# everything else is numeric. 'price' is the label, so it is excluded from the features.
cat_cols = [name for name, kind in train_df.dtypes if kind == 'string']
num_cols = [
    name
    for name, kind in train_df.dtypes
    if kind != 'string' and name != 'price'
]

In [9]:
# Show the columns selected as categorical (string-typed) features.
cat_cols

['host_is_superhost',
 'cancellation_policy',
 'instant_bookable',
 'neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type']

In [10]:
# Show the columns selected as numeric features (non-string, excluding 'price').
num_cols

['host_total_listings_count',
 'latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'minimum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'bedrooms_na',
 'bathrooms_na',
 'beds_na',
 'review_scores_rating_na',
 'review_scores_accuracy_na',
 'review_scores_cleanliness_na',
 'review_scores_checkin_na',
 'review_scores_communication_na',
 'review_scores_location_na',
 'review_scores_value_na']

# MLflow

In [11]:
# Each categorical column gets a companion '<name>_indexed' output column.
indexed_cols = [f'{name}_indexed' for name in cat_cols]

# Encode the categorical string columns as numeric indices.
# handleInvalid='skip' is a Spark option that filters out rows with invalid
# (e.g. unseen-at-fit-time) labels rather than raising an error.
indexer = StringIndexer(
    inputCols=cat_cols,
    outputCols=indexed_cols,
    handleInvalid='skip',
)

# Combine the indexed categorical columns and the numeric columns, in that
# order, into a single 'features' vector column.
vector_assembler = VectorAssembler(
    inputCols=indexed_cols + num_cols,
    outputCol='features',
)

In [12]:
# Random forest regressor predicting 'price' from the assembled 'features' vector.
rf = (RandomForestRegressor()
      .setFeaturesCol('features')
      .setLabelCol('price')
      .setMaxBins(40)
      .setSeed(42))

# Full pipeline: index categoricals -> assemble feature vector -> fit the forest.
rf_pipeline = Pipeline().setStages([indexer, vector_assembler, rf])

In [13]:
# Train the pipeline inside a single MLflow run and record its hyperparameters,
# fitted model, test-set metrics and feature-importance table.
with mlflow.start_run(run_name='random-forest') as run:
    # Log hyperparameters in one call; values are read from the estimator that
    # the pipeline is about to fit.
    mlflow.log_params({
        'num_trees': rf.getNumTrees(),
        'max_depth': rf.getMaxDepth(),
    })

    # Fit the full pipeline and log the fitted model
    pip_model = rf_pipeline.fit(train_df)
    mlflow.spark.log_model(pip_model, 'model')

    # Log metrics computed on the test split
    pred_df = pip_model.transform(test_df)

    evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction')
    rmse = evaluator.setMetricName('rmse').evaluate(pred_df)
    r2 = evaluator.setMetricName('r2').evaluate(pred_df)
    mlflow.log_metrics({'r2': r2, 'rmse': rmse})

    # Log artifacts: feature importance.
    # The last pipeline stage is the fitted forest. Building the frame from a
    # dict (rather than zip) raises if the number of assembler inputs and the
    # number of importances ever disagree, instead of silently truncating and
    # mislabelling features.
    feat_imp = (
        pd.DataFrame({
            'feature': vector_assembler.getInputCols(),
            'importance': pip_model.stages[-1].featureImportances.toArray(),
        })
        .sort_values(by='importance', ascending=False)
    )

    # Write the CSV to a temporary directory so no stray file is left in the
    # working directory; the artifact keeps the name 'feature-importance.csv'.
    with tempfile.TemporaryDirectory() as tmp_dir:
        feat_imp_path = Path(tmp_dir) / 'feature-importance.csv'
        feat_imp.to_csv(feat_imp_path, index=False)
        mlflow.log_artifact(str(feat_imp_path))

In [14]:
# NOTE: this shell command starts the MLflow UI server in the foreground, so the
# cell keeps running (and blocks any later cells under "Run All") until the
# kernel is interrupted; the recorded output shows it was stopped with ^C.
!mlflow ui

  and should_run_async(code)


[2020-10-06 09:22:47 -0500] [43067] [INFO] Starting gunicorn 20.0.4
[2020-10-06 09:22:47 -0500] [43067] [INFO] Listening at: http://127.0.0.1:5000 (43067)
[2020-10-06 09:22:47 -0500] [43067] [INFO] Using worker: sync
[2020-10-06 09:22:47 -0500] [43069] [INFO] Booting worker with pid: 43069
^C
[2020-10-06 09:23:12 -0500] [43067] [INFO] Handling signal: int
[2020-10-06 09:23:13 -0500] [43069] [INFO] Worker exiting (pid: 43069)
