# Imports

In [1]:
from pathlib import Path

import mlflow
import mlflow.spark

import pandas as pd
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('ML in Spark').getOrCreate()
spark

  and should_run_async(code)


In [3]:
DATA_PATH = Path('data')
!ls {DATA_PATH}

airport-codes-na.txt      feature-importance.csv    sf-fire-calls.csv
departuredelays.csv       loan-risks.snappy.parquet [1m[36mtest-df.parquet[m[m


# Data

In [6]:
df = (spark
      .read
      .parquet((DATA_PATH / 'sf-airbnb-clean.parquet').as_posix()))
df.count()

7146

In [7]:
df.columns

['host_is_superhost',
 'cancellation_policy',
 'instant_bookable',
 'host_total_listings_count',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'minimum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'price',
 'bedrooms_na',
 'bathrooms_na',
 'beds_na',
 'review_scores_rating_na',
 'review_scores_accuracy_na',
 'review_scores_cleanliness_na',
 'review_scores_checkin_na',
 'review_scores_communication_na',
 'review_scores_location_na',
 'review_scores_value_na']

In [8]:
df.select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms",
"number_of_reviews", "price").show(5)

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



In [9]:
train_df, test_df = df.randomSplit([.8, .2], seed=42)
train_df.count(), test_df.count()

(5780, 1366)

In [10]:
train_df.dtypes

[('host_is_superhost', 'string'),
 ('cancellation_policy', 'string'),
 ('instant_bookable', 'string'),
 ('host_total_listings_count', 'double'),
 ('neighbourhood_cleansed', 'string'),
 ('latitude', 'double'),
 ('longitude', 'double'),
 ('property_type', 'string'),
 ('room_type', 'string'),
 ('accommodates', 'double'),
 ('bathrooms', 'double'),
 ('bedrooms', 'double'),
 ('beds', 'double'),
 ('bed_type', 'string'),
 ('minimum_nights', 'double'),
 ('number_of_reviews', 'double'),
 ('review_scores_rating', 'double'),
 ('review_scores_accuracy', 'double'),
 ('review_scores_cleanliness', 'double'),
 ('review_scores_checkin', 'double'),
 ('review_scores_communication', 'double'),
 ('review_scores_location', 'double'),
 ('review_scores_value', 'double'),
 ('price', 'double'),
 ('bedrooms_na', 'double'),
 ('bathrooms_na', 'double'),
 ('beds_na', 'double'),
 ('review_scores_rating_na', 'double'),
 ('review_scores_accuracy_na', 'double'),
 ('review_scores_cleanliness_na', 'double'),
 ('review_sco

# Linear Regression

In [11]:
cat_cols = [field for (field, dtype) in train_df.dtypes if dtype == 'string']
num_cols = [field for (field, dtype) in train_df.dtypes if dtype != 'string' and field != 'price']
print(f'Categorical fields:\n{cat_cols}\n')
print(f'Numerical fields:\n{num_cols}')

Categorical fields:
['host_is_superhost', 'cancellation_policy', 'instant_bookable', 'neighbourhood_cleansed', 'property_type', 'room_type', 'bed_type']

Numerical fields:
['host_total_listings_count', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'bedrooms_na', 'bathrooms_na', 'beds_na', 'review_scores_rating_na', 'review_scores_accuracy_na', 'review_scores_cleanliness_na', 'review_scores_checkin_na', 'review_scores_communication_na', 'review_scores_location_na', 'review_scores_value_na']


In [12]:
indexed_cols = [col + '_indexed' for col in cat_cols]
encoded_cols = [col + '_encoded' for col in cat_cols]

indexer = (StringIndexer()
           .setInputCols(cat_cols)
           .setOutputCols(indexed_cols)
           .setHandleInvalid('skip'))

ohe = (OneHotEncoder()
       .setInputCols(indexed_cols)
       .setOutputCols(encoded_cols))

vector_assembler = (VectorAssembler()
                    .setInputCols(encoded_cols)
                    .setOutputCol('features'))

In [13]:
lr = LinearRegression(featuresCol='features', labelCol='price')
lr_pipeline = Pipeline(stages=[indexer, ohe, vector_assembler, lr])

In [14]:
lr_pipeline_model = lr_pipeline.fit(train_df)

In [15]:
pred_df = lr_pipeline_model.transform(test_df)
pred_df.select('features', 'price', 'prediction').show(5)

+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(72,[0,3,6,22,43,...| 85.0|237.64086207083028|
|(72,[0,3,6,22,43,...| 45.0| 73.42133044353568|
|(72,[0,3,6,22,43,...| 70.0| 73.42133044353568|
|(72,[0,3,6,12,42,...|128.0|11.787699210826759|
|(72,[0,3,6,12,43,...|159.0|132.70651723063725|
+--------------------+-----+------------------+
only showing top 5 rows



In [16]:
reg_eval = RegressionEvaluator(predictionCol='prediction', labelCol='price', metricName='rmse')
reg_eval.evaluate(pred_df)

228.26400958005436

In [17]:
reg_eval.setMetricName('r2').evaluate(pred_df)

0.1007840913115724

In [18]:
lr_pipeline_model.write().overwrite().save('linear_reg_pip')

In [19]:
lr_pipeline_model = PipelineModel.load('linear_reg_pip')

In [20]:
reg_eval.setMetricName('r2').evaluate(lr_pipeline_model.transform(test_df))

0.1007840913115724

# RandomForest

In [21]:
indexed_cols = [col + '_indexed' for col in cat_cols]

indexer = (StringIndexer()
           .setInputCols(cat_cols)
           .setOutputCols(indexed_cols)
           .setHandleInvalid('skip'))

vector_assembler = (VectorAssembler()
                    .setInputCols(indexed_cols)
                    .setOutputCol('features'))

In [22]:
rf = RandomForestRegressor(featuresCol='features', labelCol='price', maxBins=40, seed=42)
rf_pipeline = Pipeline(stages=[indexer, vector_assembler, rf])

In [23]:
rf_pipeline_model = rf_pipeline.fit(train_df)

In [24]:
pred_df = rf_pipeline_model.transform(test_df)
pred_df.select('features', 'price', 'prediction').show(5)

+--------------------+-----+------------------+
|            features|price|        prediction|
+--------------------+-----+------------------+
|(7,[1,3,4],[2.0,1...| 85.0| 274.3913584878296|
|[0.0,2.0,0.0,15.0...| 45.0| 99.25241040164991|
|[0.0,2.0,0.0,15.0...| 70.0| 99.25241040164991|
|(7,[1,3,5],[2.0,5...|128.0|111.09933784188517|
|[0.0,2.0,0.0,5.0,...|159.0|112.73108938398084|
+--------------------+-----+------------------+
only showing top 5 rows



In [25]:
reg_eval = RegressionEvaluator(predictionCol='prediction', labelCol='price', metricName='rmse')
reg_eval.evaluate(pred_df)

232.41858310308692

In [26]:
reg_eval.setMetricName('r2').evaluate(pred_df)

0.06775342383805139

In [30]:
(pd.DataFrame(
    list(
        zip(rf_pipeline_model.stages[-2].getInputCols(), rf_pipeline_model.stages[-1].featureImportances))
    , columns=['feature', 'importance'])
 .sort_values(by='importance', ascending=False))

  and should_run_async(code)


Unnamed: 0,feature,importance
1,cancellation_policy_indexed,0.258472
5,room_type_indexed,0.243863
3,neighbourhood_cleansed_indexed,0.234067
4,property_type_indexed,0.191982
2,instant_bookable_indexed,0.043699
0,host_is_superhost_indexed,0.026677
6,bed_type_indexed,0.001241


# Hyperparameter Tuning

In [31]:
indexed_cols = [col + '_indexed' for col in cat_cols]

indexer = (StringIndexer()
           .setInputCols(cat_cols)
           .setOutputCols(indexed_cols)
           .setHandleInvalid('skip'))

vector_assembler = (VectorAssembler()
                    .setInputCols(indexed_cols)
                    .setOutputCol('features'))

  and should_run_async(code)


In [32]:
rf = RandomForestRegressor(featuresCol='features', labelCol='price', maxBins=40, seed=42)

In [33]:
rf_pipeline = Pipeline(stages=[indexer, vector_assembler, rf])

In [34]:
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction', metricName='rmse')

In [35]:
param_grid = (ParamGridBuilder()
              .addGrid(rf.maxDepth, [2, 4, 6])
              .addGrid(rf.numTrees, [10, 100])
              .build())

In [36]:
cv = (CrossValidator()
      .setEstimator(rf_pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(param_grid)
      .setNumFolds(3)
      .setSeed(42))

In [37]:
%time cv.fit(train_df)

CPU times: user 997 ms, sys: 232 ms, total: 1.23 s
Wall time: 24.9 s


CrossValidatorModel_01b95d4b1c9b

In [38]:
cv = (CrossValidator()
      .setEstimator(rf_pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(param_grid)
      .setNumFolds(3)
      .setParallelism(4)
      .setSeed(42))

In [39]:
%time cv.fit(train_df)

CPU times: user 1.28 s, sys: 392 ms, total: 1.68 s
Wall time: 13.3 s


CrossValidatorModel_4922b911ac59

In [40]:
cv = (CrossValidator()
      .setEstimator(rf)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(param_grid)
      .setParallelism(4)
      .setNumFolds(3)
      .setSeed(42))

In [41]:
rf_pipeline = Pipeline(stages=[indexer, vector_assembler, cv])

In [42]:
%time rf_pipeline.fit(train_df)

CPU times: user 695 ms, sys: 189 ms, total: 885 ms
Wall time: 12.7 s


PipelineModel_13720f487a10