Fit a linear regression model and a decision tree regression model, using code adapted from https://medium.com/international-school-of-ai-data-science/linear-regression-with-pyspark-c5e2d0012072

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Build (or reuse) the Spark session that runs every job in this notebook.
# Settings are collected in one mapping and applied to the builder in order.
session_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    # ensure time zone isn't changed to Australian
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 1")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [3]:
# Read the curated hourly yellow and green taxi data (already merged with weather).
# Parquet files carry their own schema, so the CSV-only reader options
# (`inferSchema`, `header`) that were previously passed are dropped.
CURATED_DIR = '../../mast30034-project-1-janggani/data/curated'
yellow = spark.read.parquet(f'{CURATED_DIR}/yellow/yellow_w.parquet')
green = spark.read.parquet(f'{CURATED_DIR}/green/green_w.parquet')

# Keep only the two weather predictors and the column later used as the
# regression label ("distance").
weather_and_target = ("temp_f", "wind speed", "distance")
yellow_data = yellow.select(*weather_and_target)
green_data = green.select(*weather_and_target)

In [4]:
# Combine the two weather predictors into the single vector column that
# Spark ML estimators take as their features input.
features = ["temp_f", "wind speed"]
output_c = "Temperature and Wind Speed"
featureassembler = VectorAssembler(outputCol=output_c, inputCols=features)

# Append the assembled vector column to each taxi dataset.
output_y, output_g = (
    featureassembler.transform(frame) for frame in (yellow_data, green_data)
)

# Retain just the feature vector (input) and "distance" (output).
model_columns = [output_c, "distance"]
final_y = output_y.select(*model_columns)
final_g = output_g.select(*model_columns)

In [5]:
# 90/10 train/test split for each dataset.
# A fixed seed is passed so the split (and therefore every metric computed
# from it) can be reproduced on a fresh kernel; without it each run draws a
# different random partition.
SPLIT_WEIGHTS = [0.9, 0.1]
SPLIT_SEED = 42
train_data_y, test_data_y = final_y.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)
train_data_g, test_data_g = final_g.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

In [7]:
# A single unfitted estimator is reused for both datasets; each fit() call
# yields its own fitted model (yellow first, then green).
regressor = LinearRegression(labelCol="distance", featuresCol=output_c)

regressor_y, regressor_g = (
    regressor.fit(train_data_y),
    regressor.fit(train_data_g),
)

22/08/26 04:05:12 WARN Instrumentation: [5a6f9bf3] regParam is zero, which might cause numerical instability and overfitting.
22/08/26 04:05:13 WARN Instrumentation: [f94240aa] regParam is zero, which might cause numerical instability and overfitting.


In [8]:
# Evaluate each fitted model on its held-out test split. The results expose
# `.predictions` and the error metrics read in the cells below.
pred_y, pred_g = (
    regressor_y.evaluate(test_data_y),
    regressor_g.evaluate(test_data_g),
)

In [9]:
# Preview the first five yellow-taxi test rows alongside their predictions.
yellow_predictions = pred_y.predictions
yellow_predictions.show(n=5)

+--------------------------+--------+------------------+
|Temperature and Wind Speed|distance|        prediction|
+--------------------------+--------+------------------+
|             [42.44,11.12]|    4.21|3.2169683689589643|
|              [44.96,6.59]|    3.59|3.2954512973977823|
|             [45.32,12.37]|    3.57|3.1958723619872194|
|              [46.58,5.28]|    3.79| 3.318286946304629|
|             [46.94,13.05]|    2.75| 3.184404485713751|
+--------------------------+--------+------------------+
only showing top 5 rows



In [10]:
# Preview the first five green-taxi test rows alongside their predictions.
green_predictions = pred_g.predictions
green_predictions.show(n=5)

+--------------------------+--------+------------------+
|Temperature and Wind Speed|distance|        prediction|
+--------------------------+--------+------------------+
|              [18.86,7.95]|    6.79|3.9327664949393446|
|              [20.3,19.32]|    3.67|3.7364516838604622|
|               [24.26,0.0]|    4.56|3.9661557476349842|
|             [24.26,17.03]|    3.58|3.7070843644685523|
|             [24.44,11.74]|    5.01|3.7846408921368755|
+--------------------------+--------+------------------+
only showing top 5 rows



In [15]:
# Test-set error metrics for the yellow-taxi model:
# mean absolute error, mean squared error and R^2.
mae_y, mse_y, r2_y = (
    pred_y.meanAbsoluteError,
    pred_y.meanSquaredError,
    pred_y.r2,
)

In [16]:
# Yellow-taxi metrics, printed in the order: MAE, MSE, R^2.
yellow_metrics = (mae_y, mse_y, r2_y)
print(*yellow_metrics)

0.5415221852589678 0.48329265452104 -0.008410419504502498


In [19]:
# Test-set error metrics for the green-taxi model:
# mean absolute error, mean squared error and R^2.
mae_g, mse_g, r2_g = (
    pred_g.meanAbsoluteError,
    pred_g.meanSquaredError,
    pred_g.r2,
)

In [20]:
# Green-taxi metrics, printed in the order: MAE, MSE, R^2.
green_metrics = (mae_g, mse_g, r2_g)
print(*green_metrics)

0.6084796210480276 0.8209022233190288 0.08844721087960761
