
## Exercise

Build a random forest (`RandomForestRegressor`) to predict the `price` of the apartments based on

- `bedrooms`
- `cancellation_policy`
- `number_of_reviews`

You can train the model on the _training_ set and make predictions on the _test_ set.

Goal

1. compute RMSE and R2  on the test set
2. what is the importance of the features?

PS Does a random forest actually need one-hot encoding of categorical vaiables?

In [0]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCols=["cancellation_policy"],
    outputCols=["cancellation_policy_index"],
    handleInvalid="skip"
)

In [0]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["bedrooms", "cancellation_policy_index", "number_of_reviews"],
    outputCol="features"
)

In [0]:
from pyspark.ml.regression import RandomForestRegressor

rf_regressor = RandomForestRegressor(
    featuresCol="features",
    labelCol="price"
)

In [0]:
from pyspark.ml import Pipeline
pipeline = Pipeline(
    stages=[
        indexer, #estimator fit
        assembler, #transformer transform
        rf_regressor #estimator fit
    ]
)

In [0]:
filepath = "/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet/"

df = spark.read.parquet(filepath)
trainDF, testDF = df.randomSplit([0.8, 0.2], seed=42)

In [0]:
model = pipeline.fit(trainDF)

In [0]:
predictions = model.transform(testDF)
display(predictions)

host_is_superhost,cancellation_policy,instant_bookable,host_total_listings_count,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,minimum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,price,bedrooms_na,bathrooms_na,beds_na,review_scores_rating_na,review_scores_accuracy_na,review_scores_cleanliness_na,review_scores_checkin_na,review_scores_communication_na,review_scores_location_na,review_scores_value_na,cancellation_policy_index,features,prediction
f,flexible,f,1.0,Bayview,37.72001,-122.39249,House,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,2.0,128.0,97.0,10.0,10.0,10.0,10.0,9.0,10.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 128.0))",115.78667121577897
f,flexible,f,1.0,Bayview,37.7325,-122.39221,House,Private room,1.0,1.0,1.0,1.0,Real Bed,31.0,0.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,45.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 0.0))",168.40283338502817
f,flexible,f,1.0,Bayview,37.73555,-122.39779,House,Private room,1.0,1.0,1.0,1.0,Real Bed,30.0,0.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,70.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 0.0))",168.40283338502817
f,flexible,f,1.0,Bernal Heights,37.73905,-122.41269,Apartment,Private room,1.0,1.0,1.0,1.0,Real Bed,30.0,1.0,80.0,10.0,8.0,10.0,10.0,8.0,10.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 1.0))",160.92562020454594
f,flexible,f,1.0,Bernal Heights,37.74473,-122.41516,House,Private room,1.0,1.0,1.0,1.0,Real Bed,1.0,3.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,159.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 3.0))",168.17223456366952
f,flexible,f,1.0,Bernal Heights,37.74697,-122.41193,House,Entire home/apt,3.0,1.0,2.0,2.0,Real Bed,2.0,15.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(2.0, 2.0, 15.0))",255.11489197727897
f,flexible,f,1.0,Castro/Upper Market,37.76269,-122.43188,Apartment,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,30.0,1.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 1.0))",160.92562020454594
f,flexible,f,1.0,Chinatown,37.79199,-122.40704,Apartment,Entire home/apt,3.0,1.0,1.0,3.0,Real Bed,45.0,0.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,95.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 0.0))",168.40283338502817
f,flexible,f,1.0,Financial District,37.78424,-122.39925,Apartment,Private room,2.0,1.0,1.0,1.0,Real Bed,180.0,0.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,100.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 0.0))",168.40283338502817
f,flexible,f,1.0,Financial District,37.79376,-122.39462,House,Entire home/apt,2.0,1.0,1.0,1.0,Real Bed,31.0,0.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,2010.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 2.0, 0.0))",168.40283338502817


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="price",
    metricName="r2" # R squared
)
evaluator.evaluate(predictions)

Out[9]: 0.13552726722130215

In [0]:
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="price",
    metricName="rmse" # R squared
)
evaluator.evaluate(predictions)

Out[10]: 223.73292882639853

In [0]:
model.stages[-1].featureImportances

Out[11]: SparseVector(3, {0: 0.6904, 1: 0.2142, 2: 0.0953})