Fit a linear regression model and decision tree regression model with code adapted from https://medium.com/international-school-of-ai-data-science/linear-regression-with-pyspark-c5e2d0012072

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # ensure time zone isn't changed to Australian
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [40]:
# read yellow and green hourly data merged with weather
yellow = spark.read.parquet('../../mast30034-project-1-janggani/data/curated/yellow/yellow_w.parquet', inferSchema=True, header =True)
green = spark.read.parquet('../../mast30034-project-1-janggani/data/curated/green/green_w.parquet', inferSchema=True, header =True)

yellow_data = yellow.select("temp_f", "wind speed", "distance")
green_data = green.select("temp_f", "wind speed", "distance")

In [41]:
# use vector assembler
features = ["temp_f", "wind speed"]
output_c = "Temperature and Wind Speed"
featureassembler = VectorAssembler(inputCols = features, outputCol = output_c)

# get output
output_y = featureassembler.transform(yellow_data)
output_g = featureassembler.transform(green_data)
# select input and output
final_y = output_y.select(output_c, "distance")
final_g = output_g.select(output_c, "distance")

In [42]:
# split
train_data_y, test_data_y = final_y.randomSplit([0.9,0.1])
train_data_g, test_data_g = final_g.randomSplit([0.9,0.1])

In [43]:
regressor = LinearRegression(featuresCol = output_c, labelCol = "distance")

regressor_y = regressor.fit(train_data_y)
regressor_g = regressor.fit(train_data_g)

22/08/25 00:05:26 WARN Instrumentation: [654bd46f] regParam is zero, which might cause numerical instability and overfitting.
22/08/25 00:05:26 WARN Instrumentation: [01742b94] regParam is zero, which might cause numerical instability and overfitting.


In [44]:
pred_y = regressor_y.evaluate(test_data_y)
pred_g = regressor_g.evaluate(test_data_g)

In [45]:
pred_y.predictions.show(5)

+--------------------------+--------+------------------+
|Temperature and Wind Speed|distance|        prediction|
+--------------------------+--------+------------------+
|              [44.96,6.21]|    4.15| 3.302072378932335|
|               [45.14,5.9]|    2.53| 3.307125021851399|
|               [45.5,7.95]|    2.75|3.2740328966002967|
|               [46.04,6.4]|    2.82|3.2992216848613545|
|              [46.58,5.28]|    3.79|3.3174535866923116|
+--------------------------+--------+------------------+
only showing top 5 rows



In [46]:
pred_g.predictions.show(5)

+--------------------------+--------+------------------+
|Temperature and Wind Speed|distance|        prediction|
+--------------------------+--------+------------------+
|             [18.86,13.67]|    4.63| 3.848749391513972|
|              [21.02,7.39]|     6.5| 3.901474758132559|
|               [21.2,7.15]|    4.53| 3.901934222884672|
|              [21.2,13.42]|    2.98| 3.814870128546572|
|             [24.98,13.42]|    2.59|3.7545343531792232|
+--------------------------+--------+------------------+
only showing top 5 rows



In [47]:
print(mae_y, mse_y, r2_y)

0.5411843308283301 0.5197068929319331 -0.018105472641386733


In [48]:
print(mae_g, mse_g, r2_g)

0.6243387845002774 0.8589720691618249 0.09618012574879786
