In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler, OneHotEncoder
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession

In [2]:
def create_spark():
    """Return a local SparkSession, reusing an existing one if available.

    The session runs on all local cores (``local[*]``) under the application
    name "TestSuite"; the remaining options are listed in ``settings`` below.
    """
    settings = {
        'spark.sql.shuffle.partitions': '4',
        'spark.default.parallelism': '4',
        'spark.sql.session.timeZone': 'UTC',
        'spark.ui.enabled': 'false',
        'spark.app.id': 'Test',
        'spark.driver.host': 'localhost',
    }

    builder = SparkSession.builder.master("local[*]").appName("TestSuite")
    # Apply the options in the order they are declared above.
    for option, setting in settings.items():
        builder = builder.config(key=option, value=setting)

    return builder.getOrCreate()

In [3]:
# Session shared by every cell below; create_spark() ends in getOrCreate(),
# so re-running this cell hands back the already-running session.
spark = create_spark()

In [4]:
# Relative path to the car-price CSV file.
path_to_data = '../../dataset/CarPrice_Assignment.csv'

# The first row supplies the column names; column types are inferred from the data.
car_spark_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(path_to_data)
)

In [5]:
# Columns handed to the VectorAssembler as-is (no string indexing applied).
features = [
    'symboling',
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
]

# Column the regressor learns to predict.
target = 'price'

In [6]:
# String-typed columns, plus the names of the columns derived from each one:
# "<name>_index" (StringIndexer output) and "<name>_ohe" (OneHotEncoder output).
raw_string_columns = [
    'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
    'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
]
indexed_string_columns = [col + "_index" for col in raw_string_columns]
encoded_string_columns = [col + "_ohe" for col in raw_string_columns]

indexer = StringIndexer(
    inputCols=raw_string_columns,
    outputCols=indexed_string_columns,
    handleInvalid="keep"  # invalid/unseen labels get their own extra index instead of raising
)

encoder = OneHotEncoder(
    inputCols=indexed_string_columns,
    outputCols=encoded_string_columns
)

# This cell reassigns car_spark_df from itself, so on a second execution the
# frame already carries the *_index / *_ohe columns and the transformers would
# fail on the existing output columns. Dropping them first makes the cell safe
# to re-run; on the first run the names are absent and drop() is a no-op.
car_spark_df = car_spark_df.drop(*indexed_string_columns, *encoded_string_columns)

car_spark_df = indexer.fit(car_spark_df).transform(car_spark_df)
car_spark_df = encoder.fit(car_spark_df).transform(car_spark_df)

In [7]:
# Combine the plain feature columns and the indexed string columns into a
# single vector column named 'features'.
# NOTE(review): the one-hot columns (encoded_string_columns) are computed
# earlier but not used here — confirm the indexed columns are the intended input.
vectorizer = VectorAssembler(inputCols=features+indexed_string_columns, outputCol='features')

# car_spark_df is reassigned from itself, so a second execution would find the
# 'features' column already present and fail. Dropping it first makes the cell
# re-runnable; on the first run the column is absent and drop() is a no-op.
car_spark_df = vectorizer.transform(car_spark_df.drop('features'))

# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split — confirm that is acceptable for this experiment.
scaler = MinMaxScaler(inputCol='features', outputCol='scaled_features').fit(car_spark_df)
scaled_data = scaler.transform(car_spark_df)

In [8]:
# 70/30 train/test split with a fixed seed for the random row assignment.
train, test = scaled_data.randomSplit(weights=[0.7, 0.3], seed=42)

In [9]:
# Random forest trained on the scaled feature vectors.
# - labelCol comes from `target`, so it stays in sync with the column the
#   evaluators compare against instead of repeating the literal 'price'.
# - An explicit seed makes the fitted forest repeatable across kernel
#   restarts rather than relying on the library's default seed.
rfr = RandomForestRegressor(featuresCol='scaled_features', labelCol=target, seed=42)
model = rfr.fit(train)

In [10]:
# Apply the fitted model to the held-out test split.
predicted = model.transform(test)

In [11]:
# One evaluator per reported metric, each comparing the `target` column with
# the evaluator's default prediction column.
metrics = {
    metric: RegressionEvaluator(labelCol=target, metricName=metric)
    for metric in ("r2", "rmse", "mae")
}

for name, evaluator in metrics.items():
    print(f'Metric: {name} =  {evaluator.evaluate(predicted)}')

# Output recorded from an earlier run:
# Metric: r2 =  0.8938095382232254
# Metric: rmse =  2629.665399631598
# Metric: mae =  1935.7268639918877

Metric: r2 =  0.8938095382232254
Metric: rmse =  2629.665399631598
Metric: mae =  1935.7268639918877
