In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session explicitly instead of relying on a `spark` global
# injected by the hosting environment. `getOrCreate()` returns the already
# active session when one exists, so this is a no-op on managed notebooks and
# makes the cell runnable on a plain kernel after "Restart & Run All".
spark = SparkSession.builder.getOrCreate()

# Small in-memory sample of house listings; each tuple follows the order of
# `columns` below.
data = [
    ("Urban", "Apartment", 2, 1, 300000),
    ("Rural", "Villa", 4, 3, 500000),
    ("Urban", "Apartment", 3, 2, 320000),
    ("Suburb", "Townhouse", 3, 2, 450000),
    ("Rural", "Villa", 5, 3, 520000),
]

# Column names applied positionally to the tuples in `data`.
columns = ["Location", "HouseType", "Bedrooms", "Bathrooms", "Price"]

# Column types are inferred by Spark from the Python values.
df = spark.createDataFrame(data, columns)

In [0]:
# Input columns grouped by how they are fed into the feature pipeline.
categorical_cols = ["Location", "HouseType"]
numeric_cols = ["Bedrooms", "Bathrooms"]  # Price is intentionally left out

# One "<name>_index" output column per categorical input column.
index_output_cols = [f"{name}_index" for name in categorical_cols]


In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Stage 1: encode every categorical string column as a numeric label index,
# writing the result to the matching "<name>_index" column.
# handleInvalid="skip": per the StringIndexer documentation, rows whose value
# cannot be indexed are filtered out rather than raising an error.
string_indexer = StringIndexer(
    handleInvalid="skip",
    inputCols=categorical_cols,
    outputCols=index_output_cols,
)

# Stage 2: pack the indexed categoricals, followed by the numeric columns,
# into a single vector column named "features".
assembler_input_cols = index_output_cols + numeric_cols
vec_assembler = VectorAssembler(inputCols=assembler_input_cols, outputCol="features")


In [0]:
# Chain indexing and vector assembly, fit the chain on `df`, then apply the
# fitted chain to the same `df` to add the index and "features" columns.
# NOTE(review): the pipeline is fitted on the full DataFrame and the
# train/test split is taken afterwards from the transformed rows, so the
# indexer's label vocabulary is learned from rows that may end up in the test
# set. Confirm this is acceptable, or split first and fit on the train part.
feature_stages = [string_indexer, vec_assembler]
pipeline = Pipeline(stages=feature_stages)
pipeline_model = pipeline.fit(df)
prepared_df = pipeline_model.transform(df)

In [0]:
# Print the raw categorical columns next to their indexed versions, the
# numeric columns and the assembled vector, without truncating cell contents.
preview_cols = ["Location", "HouseType"] + index_output_cols + numeric_cols + ["features"]
prepared_df.select(*preview_cols).show(truncate=False)


In [0]:
# Randomly split the prepared rows with weights 0.8 (train) / 0.2 (test);
# the fixed seed keeps the row assignment repeatable.
split_weights = [0.8, 0.2]
train_df, test_df = prepared_df.randomSplit(split_weights, seed=42)

In [0]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor that learns "Price" from the assembled "features"
# vector and writes its output to the "prediction" column.
# The seed is set explicitly (same value as the train/test split) so the
# randomised parts of training are repeatable; without it the result depends
# on the estimator's default seed rather than on a value recorded here.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Price",
    predictionCol="prediction",
    seed=42,
)

# Train on the training split only.
rf_model = rf.fit(train_df)

In [0]:
# Apply the fitted random forest to the test split; the result is `test_df`
# with the model's output column ("prediction", as configured on `rf`) added.
predictions = rf_model.transform(test_df)

In [0]:
# Show the feature vector, the actual price and the predicted price side by side.
# NOTE(review): `.display()` is not part of the open-source PySpark DataFrame
# API (it is provided by Databricks notebooks) -- confirm the target runtime;
# the earlier preview cell uses `.show(truncate=False)` instead.
prediction_view = predictions.select("features", "Price", "prediction")
prediction_view.display()