In [0]:
from pyspark.sql import SparkSession

# Toy housing dataset: one tuple per listing, values in the order given by `columns`.
data = [
    ("Urban", "Apartment", 2, 1, 300000),
    ("Rural", "Villa", 4, 3, 500000),
    ("Urban", "Apartment", 3, 2, 320000),
    ("Suburb", "Townhouse", 3, 2, 450000),
    ("Rural", "Villa", 5, 3, 520000),
]

columns = ["Location", "HouseType", "Bedrooms", "Bathrooms", "Price"]

# Obtain the session explicitly instead of relying on a pre-injected `spark`
# global: getOrCreate() returns the already-running session when one exists and
# creates one otherwise, so this cell also runs outside a managed notebook.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(data, columns)

In [0]:
https://spark.apache.org/docs/latest/ml-guide.html

In [0]:
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.StringIndexer.html

In [0]:
# Print the column names and the types Spark inferred from the Python tuples.
df.printSchema()

In [0]:
# Render the raw DataFrame with the notebook's rich table view.
df.display()

Databricks data profile. Run in Databricks to view.

In [0]:
# Summary statistics for the DataFrame, printed as a text table.
summary_df = df.describe()
summary_df.show()

In [0]:
Why NOT One-Hot Encode for Decision Trees?

One-Hot Encoding (OHE) expands a categorical column into multiple binary columns.

However, for decision tree-based models, including Random Forests, Gradient-Boosted Trees, etc.:
	•	StringIndexer is preferred because:
	•	Spark ML tree algorithms natively handle indexed categorical features.
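	•	OHE can weaken tree models: it spreads one categorical feature across many sparse binary columns, so each split can only separate a single category from the rest and the feature's importance is diluted.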
	•	OHE expands feature space unnecessarily and slows computation.

📘 Conclusion: Use StringIndexer + VectorAssembler for tree-based models. Avoid OneHotEncoder.

In [0]:
Find all categorical (string-typed) columns.

In [0]:
# Every column whose Spark dtype is "string" is treated as categorical.
categorical_cols = [
    col_name
    for col_name, spark_type in df.dtypes
    if spark_type == "string"
]

In [0]:
%md
Create indexed output column names.

In [0]:
# One output column name per categorical column, formed by appending "_index".
index_output_cols = [f"{source_col}_index" for source_col in categorical_cols]

In [0]:
from pyspark.ml.feature import StringIndexer

# Multi-column indexer: each column in categorical_cols is written to the
# same-position name in index_output_cols.
string_indexer = (
    StringIndexer()
    .setInputCols(categorical_cols)
    .setOutputCols(index_output_cols)
)

In [0]:
# Bare expression: the cell output is the DataFrame's repr.
df

In [0]:
# Numeric feature columns (bigint / double).
# "Price" is the regression label (it is the labelCol of the RandomForestRegressor),
# so it is excluded here: otherwise it would be assembled into the "features"
# vector and the model would be trained on its own target.
numeric_cols = [
    col_name
    for col_name, spark_type in df.dtypes
    if spark_type in ("bigint", "double") and col_name != "Price"
]

In [0]:
# Bare expression: show which columns were picked up as numeric.
numeric_cols

In [0]:
# Inputs for the assembler: indexed categorical columns first, then the numeric columns.
assembler_input = [*index_output_cols, *numeric_cols]

VectorAssembler combines all categorical (indexed) and numerical features into a single vector column.

In [0]:
from pyspark.ml.feature import VectorAssembler

# Packs every column listed in assembler_input into one vector column named "features".
vec_assembler = (
    VectorAssembler()
    .setInputCols(assembler_input)
    .setOutputCol("features")
)

In [0]:
A Pipeline in Spark ML is similar to a scikit-learn Pipeline: it chains the preprocessing stages so they can be fit and applied as a single unit.

In [0]:
from pyspark.ml import Pipeline

# Stages run in order: index the string columns, then assemble the feature vector.
pipeline = Pipeline().setStages([string_indexer, vec_assembler])

# NOTE(review): the pipeline is fit on the full DataFrame, i.e. before the
# train/test split performed later in this notebook — confirm this is intended.
pipeline_model = pipeline.fit(df)
prepared_df = pipeline_model.transform(df)

In [0]:
# Render the transformed DataFrame (original columns plus the pipeline's outputs).
display(prepared_df)

In [0]:
# prepared_df has no "prediction" column (the pipeline only adds the *_index
# columns and "features"), and a vector column's fields cannot be selected by
# name. Convert the assembled "features" vector to an array to inspect its values.
# Imported locally so this cell does not depend on an import made in a later cell.
from pyspark.ml.functions import vector_to_array

prepared_df.select(vector_to_array("features").alias("feature_values")).display()

In [0]:
from pyspark.sql.functions import col

In [0]:
# prepared_df has no "prediction" column at this point; the values worth
# inspecting are those of the assembled "features" vector. Vector columns do not
# expose a selectable "values" field, so convert the vector to an array instead.
from pyspark.ml.functions import vector_to_array

prepared_df.select(vector_to_array("features").alias("feature_values"))

In [0]:
#prepared_df.select("features","Price")

In [0]:
# Project just the two columns the regressor consumes: the feature vector and the label.
prepared_df.select(["features", "Price"])

In [0]:
# Random 80/20 split; the fixed seed is passed so the split can be reproduced.
train_df, test_df = prepared_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor: reads the "features" vector, learns "Price",
# and will write its output to a "prediction" column.
rf = (
    RandomForestRegressor()
    .setFeaturesCol("features")
    .setLabelCol("Price")
    .setPredictionCol("prediction")
)

# Fit on the training split only.
rf_model = rf.fit(train_df)