In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.regression import *
from pyspark.ml.feature import *
from pyspark.ml import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *

In [0]:
# Fully qualified name of the table that feeds the model.
# `spark` is not defined in this notebook; presumably it is injected by the
# notebook runtime — confirm when running outside that environment.
source_table = "airbnb1.gold.ml_df"
ml_df = spark.table(source_table)


In [0]:
# Pipeline stage that maps the `room_type` column to the numeric column
# `room_type_index`, which the feature assembler consumes further down.
# handleInvalid="keep" is passed explicitly; see the StringIndexer docs for
# how invalid/unseen values are bucketed under that setting.
room_indexer_settings = {
    "inputCol": "room_type",
    "outputCol": "room_type_index",
    "handleInvalid": "keep",
}
room_indexer = StringIndexer(**room_indexer_settings)

In [0]:
# Input columns packed, in this order, into the single "features" vector
# column. "room_type_index" is the output column of the room-type
# StringIndexer stage; the other names are read straight from the input table.
feature_columns = [
    "accommodates",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "beds",
    "room_type_index",
]

# NOTE(review): handleInvalid is not set on the assembler, so its library
# default applies — confirm the input has no nulls/NaNs in these columns, or
# that the default handling is what you want.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [0]:
# Random-forest regressor: reads the assembled "features" vector and is
# trained against the "price" column as its label.
# Hyperparameters are gathered in one mapping so they are easy to scan and
# tweak; the seed is pinned so the estimator's own randomness is repeatable.
rf_settings = {
    "featuresCol": "features",
    "labelCol": "price",
    "numTrees": 300,
    "maxBins": 64,
    "maxDepth": 20,
    "seed": 42,
}
rf = RandomForestRegressor(**rf_settings)

In [0]:
# Stages run in list order: index room_type -> assemble the feature vector ->
# random-forest regressor. The assembler reads the indexer's output column and
# the regressor reads the assembler's, so the order must not change.
pipeline_stages = [room_indexer, assembler, rf]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Split weights: first entry -> training set, second entry -> test set.
# The split seed is pinned to 42, the same value given to the regressor.
split_weights = [0.8, 0.2]
data_splits = ml_df.randomSplit(split_weights, seed=42)
train_df, test_df = data_splits

In [0]:
# Fit every pipeline stage on the training split only; the test split is kept
# aside for the prediction and evaluation cells below.
model = pipeline.fit(dataset=train_df)

In [0]:
# Run the fitted pipeline over the held-out split; the evaluators further down
# read the model output from the "prediction" column of this DataFrame.
predictions = model.transform(test_df)

# Quick eyeball check: label, encoded room type and model output for 10 rows.
preview_columns = ["price", "room_type_index", "prediction"]
predictions.select(*preview_columns).show(10)

In [0]:
# RMSE of the "prediction" column against the "price" label on the test-split
# predictions.
rmse_settings = {
    "labelCol": "price",
    "predictionCol": "prediction",
    "metricName": "rmse",
}
evaluation = RegressionEvaluator(**rmse_settings)

rmse = evaluation.evaluate(predictions)
print(rmse)

In [0]:
# R^2 of the "prediction" column against the "price" label, computed on the
# same test-split predictions as the RMSE.
e1 = RegressionEvaluator(metricName="r2", labelCol="price", predictionCol="prediction")

r2 = e1.evaluate(predictions)
print(r2)