In [4]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *

# Create (or reuse) the Spark session for this notebook.
# Each (name, value) pair below is applied to the builder in the order listed.
SPARK_SETTINGS = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
]

session_builder = SparkSession.builder.appName("MAST30034 ASSIGNMENT 1 DUSTIN")
for setting_name, setting_value in SPARK_SETTINGS:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [5]:
import pandas as pd
import statsmodels.api as sm

In [11]:
# Both curated inputs sit in the same directory: the indexed trips are read
# with Spark, the sample is read with pandas.
CURATED_YELLOW_DIR = '../../mast30034-project-1-dustintano10/data/curated/yellow'

yellow = spark.read.parquet(f'{CURATED_YELLOW_DIR}/yellow_indexed')
yellow_sample = pd.read_parquet(f'{CURATED_YELLOW_DIR}/yellow_sample')

In [None]:
# Take a random sample of the indexed Spark dataframe and collect it to pandas.
# The indexed data is bound to `yellow` when it is loaded; the name
# `yellow_indexed` is not defined in the visible notebook, so referencing it
# raised NameError on a fresh kernel.

SAMPLE_SIZE = 0.10  # fraction of rows to keep

# Pass the fraction by keyword so it cannot be confused with `withReplacement`;
# the fixed seed keeps the sample reproducible.
yellow_indexed_sample = yellow.sample(fraction=SAMPLE_SIZE, seed=0)

sample_pandas = yellow_indexed_sample.toPandas()

In [14]:
# Standardise the candidate predictor columns (approach taken from tutorial 2).

from scipy.stats import zscore

# Candidate predictors for the penalised regressions fitted on the sample.
# NOTE(review): 'trip_distance' is not listed here but is part of
# input_cols_full used for the Spark models -- confirm the difference is intended.
# NOTE(review): total_amount may already include the tip, which would leak the
# target into the features -- confirm against the data dictionary.
x_cols = [
    'fare_amount', 'tolls_amount', 'total_amount', 'is_weekend_binary', 'trip_length',
    'Attendance', 'Win_binary', 'margin_victory/loss', 'pickup_hour', 'dropoff_hour',
    'PULocationID', 'DOLocationID', 'Start(ET)_NUMERIC',
]

# Response column.
y_cols = ['tip_amount']

# Cast every predictor to float, then z-score each column.
predictors_as_float = yellow_sample[x_cols].astype(float)
df_standard = predictors_as_float.apply(zscore)

# Display floats with 4 decimal places.
pd.set_option('display.float_format', '{:,.4f}'.format)

# Show only the mean and std rows of the standardised frame.
df_standard.describe().loc[['mean', 'std']]



Unnamed: 0,fare_amount,tolls_amount,total_amount,is_weekend_boolean,trip_length,Attendance,Win_boolean,margin_victory/loss,pickup_hour,dropoff_hour,PULocationID,DOLocationID,Start(ET)_NUMERIC
mean,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Fit the LASSO model on the standardised sample (code adapted from tutorial 2).

import numpy as np
from glmnet import ElasticNet

# alpha=1 is the LASSO setting used by the tutorial.
# random_state pins the cross-validation used to pick lambda_best_, so the
# regParam later handed to the Spark model is the same on every run.
elastic_net_model_lasso = ElasticNet(alpha=1, random_state=0)
elastic_net_model_lasso.fit(
    df_standard.values,
    # Flatten the single-column response from a 2d matrix to a 1d vector to avoid
    # the warning: "A column-vector y was passed when a 1d array was expected."
    yellow_sample[y_cols].values.ravel(),
)

In [17]:
# Fit the ridge model on the standardised sample (code adapted from tutorial 2).

# alpha=0 is the ridge setting used by the tutorial.
# random_state pins the cross-validation used to pick lambda_best_, so the
# regParam later handed to the Spark model is the same on every run.
elastic_net_model_ridge = ElasticNet(alpha=0, random_state=0)
elastic_net_model_ridge.fit(
    df_standard.values,
    # Flatten the single-column response from a 2d matrix to a 1d vector to avoid
    # the warning: "A column-vector y was passed when a 1d array was expected."
    yellow_sample[y_cols].values.ravel(),
)

In [18]:
# Pull the first entry of lambda_best_ from each fitted glmnet model; these
# values are reused as regParam for the Spark models.
best_lambda_lasso, best_lambda_ridge = (
    fitted_model.lambda_best_[0]
    for fitted_model in (elastic_net_model_lasso, elastic_net_model_ridge)
)

print('Best lambda value for LASSO:', best_lambda_lasso)
print('Best lambda value for Ridge:', best_lambda_ridge)


Best lambda value for LASSO: 0.0032187193511177053
Best lambda value for Ridge: 0.11301540780402779


In [20]:
# Remove the pickup/dropoff timestamps and the categorical columns that have
# not been indexed.
UNINDEXED_COLS = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'Start(ET)',
    'Win',
    'is_weekend',
]

yellow = yellow.drop(*UNINDEXED_COLS)

In [22]:
# Train on every row dated before the first 2019 game; test on that game.
# NOTE(review): the train filter also admits any 2019 rows before this date --
# confirm that none exist if the train set is meant to be 2018 only.
FIRST_2019_GAME_DATE = '2019-01-11'
game_date = F.col('Date')

yellow_train = yellow.filter(game_date < FIRST_2019_GAME_DATE)

yellow_test = yellow.filter(game_date == FIRST_2019_GAME_DATE)


In [23]:
# The Date column was only needed for the split; remove it from both sets.
DATE_COL = 'Date'

yellow_train, yellow_test = yellow_train.drop(DATE_COL), yellow_test.drop(DATE_COL)

In [24]:
# Assemble the full feature vector for the training frame.

from pyspark.ml.feature import VectorAssembler

# name of the assembled vector column
features = 'features'

# use all features
input_cols_full = [
    'fare_amount', 'tolls_amount', 'trip_distance', 'total_amount',
    'is_weekend_binary', 'trip_length', 'Win_binary', 'PULocationID',
    'DOLocationID', 'pickup_hour', 'dropoff_hour', 'Attendance',
    'margin_victory/loss', 'Start(ET)_NUMERIC',
]

vectorAssembler_full = VectorAssembler(inputCols=input_cols_full, outputCol=features)

# Drop rows containing any null, assemble the vector, then keep only the
# feature vector and the response.
v_yellow_train_full = (
    vectorAssembler_full
    .transform(yellow_train.dropna(how='any'))
    .select('features', 'tip_amount')
)

v_yellow_train_full.show(3)

22/08/25 16:04:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------------------+----------+
|            features|tip_amount|
+--------------------+----------+
|[16.5,0.0,2.56,22...|      4.58|
|[5.0,0.0,0.9,7.25...|      1.45|
|[4.0,0.0,0.63,5.7...|      0.96|
+--------------------+----------+
only showing top 3 rows



In [25]:
# Same preparation for the test frame: drop rows with any null, assemble the
# full feature vector, and keep only the vector and the response.
v_yellow_test_full = (
    vectorAssembler_full
    .transform(yellow_test.dropna(how='any'))
    .select('features', 'tip_amount')
)

v_yellow_test_full.show(3)

+--------------------+----------+
|            features|tip_amount|
+--------------------+----------+
|[13.0,0.0,2.88,18...|       3.7|
|(14,[0,2,3,5,7,8,...|      2.26|
|(14,[0,2,3,5,7,8,...|      1.86|
+--------------------+----------+
only showing top 3 rows



In [26]:
# LassoM is the LASSO model: linear regression with elasticNetParam = 1 and the
# glmnet-selected lambda as regParam, fitted on the full training vector.

from pyspark.ml.regression import LinearRegression

lasso_estimator = LinearRegression(
    featuresCol='features',
    labelCol='tip_amount',
    regParam=best_lambda_lasso,
    elasticNetParam=1,
)
LassoM = lasso_estimator.fit(v_yellow_train_full)

22/08/25 16:04:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/08/25 16:04:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

In [27]:
# Coefficients of the LASSO regression, labelled by feature name.
# Analyst note: based on LASSO, Attendance, margin_victory/loss, dropoff_hour,
# and trip_length are all irrelevant.
# NOTE(review): input_cols_reduced (used for the follow-up model) keeps
# dropoff_hour and drops Win_binary instead -- confirm which list is intended.
pd.DataFrame(
    {'coefficient': [LassoM.intercept, *LassoM.coefficients]},
    index=['intercept', *input_cols_full],
)

Unnamed: 0,coefficient
intercept,-0.7828
fare_amount,-0.8181
tolls_amount,-0.8164
trip_distance,-0.036
total_amount,0.8522
is_weekend_boolean,0.1261
trip_length,0.0
Win_boolean,0.0
PULocationID,0.0001
DOLocationID,0.0001


In [28]:
# Training-set fit metrics for the LASSO model.
trainingSummary_LassoM = LassoM.summary

print(f"RMSE: {trainingSummary_LassoM.rootMeanSquaredError:f}")
print(f"r2: {trainingSummary_LassoM.r2:f}")
print(f"MSE: {trainingSummary_LassoM.meanSquaredError:f}")
print(f"MAE: {trainingSummary_LassoM.meanAbsoluteError:f}")


RMSE: 0.333291
r2: 0.933292
MSE: 0.111083
MAE: 0.262705


In [29]:
# Summary statistics of the assembled training frame, for context when
# reading the error metrics.
train_full_stats = v_yellow_train_full.describe()
train_full_stats.show()



+-------+-----------------+
|summary|       tip_amount|
+-------+-----------------+
|  count|          2398168|
|   mean|2.042468496785995|
| stddev|1.290430556720419|
|    min|              0.0|
|    max|            411.0|
+-------+-----------------+



                                                                                

In [32]:
# name of the assembled vector column
features = 'features'

# Features retained after the LASSO selection step.
input_cols_reduced = [
    'fare_amount', 'trip_distance', 'total_amount', 'tolls_amount',
    'is_weekend_binary', 'PULocationID', 'DOLocationID', 'pickup_hour',
    'dropoff_hour', 'Start(ET)_NUMERIC',
]

vectorAssembler_reduced = VectorAssembler(inputCols=input_cols_reduced, outputCol=features)

# Drop rows containing any null, assemble the reduced vector, then keep only
# the feature vector and the response.
v_yellow_train_reduced = (
    vectorAssembler_reduced
    .transform(yellow_train.dropna(how='any'))
    .select('features', 'tip_amount')
)

v_yellow_train_reduced.show(3)

+--------------------+----------+
|            features|tip_amount|
+--------------------+----------+
|[16.5,2.56,22.88,...|      4.58|
|[5.0,0.9,7.25,0.0...|      1.45|
|[4.0,0.63,5.76,0....|      0.96|
+--------------------+----------+
only showing top 3 rows



In [33]:
# Same preparation for the test frame using the reduced feature set.
v_yellow_test_reduced = (
    vectorAssembler_reduced
    .transform(yellow_test.dropna(how='any'))
    .select('features', 'tip_amount')
)

v_yellow_test_reduced.show(3)

+--------------------+----------+
|            features|tip_amount|
+--------------------+----------+
|[13.0,2.88,18.5,0...|       3.7|
|(10,[0,1,2,5,6],[...|      2.26|
|(10,[0,1,2,5,6],[...|      1.86|
+--------------------+----------+
only showing top 3 rows



In [34]:
# Linear regression on the reduced feature set; regParam and elasticNetParam
# are not passed, so the estimator's defaults apply.
linear_estimator = LinearRegression(featuresCol='features', labelCol='tip_amount')
lm = linear_estimator.fit(v_yellow_train_reduced)

22/08/25 16:07:04 WARN Instrumentation: [fce5c780] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

22/08/25 16:07:06 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [35]:
# Intercept and coefficients of the reduced linear model, labelled by feature name.
pd.DataFrame(
    {'coefficient': [lm.intercept, *lm.coefficients]},
    index=['intercept', *input_cols_reduced],
)

Unnamed: 0,coefficient
intercept,-0.8346
fare_amount,-0.8597
trip_distance,-0.0392
total_amount,0.8881
tolls_amount,-0.8666
is_weekend_boolean,0.1391
PULocationID,0.0001
DOLocationID,0.0002
pickup_hour,-0.0239
dropoff_hour,0.0066


In [37]:
# Training-set fit metrics for the reduced linear model.
trainingSummary_lm = lm.summary

print(f"RMSE: {trainingSummary_lm.rootMeanSquaredError:f}")
print(f"r2: {trainingSummary_lm.r2:f}")
print(f"MSE: {trainingSummary_lm.meanSquaredError:f}")
print(f"MAE: {trainingSummary_lm.meanAbsoluteError:f}")


RMSE: 0.330728
r2: 0.934314
MSE: 0.109381
MAE: 0.266606


In [38]:
from pyspark.ml.evaluation import RegressionEvaluator


# Score the test frame with the reduced linear model and preview the
# predictions next to the observed tips.
lm_predictions = lm.transform(v_yellow_test_reduced)

lm_preview_cols = ["prediction", "tip_amount", "features"]
lm_predictions.select(*lm_preview_cols).show(5)

+------------------+----------+--------------------+
|        prediction|tip_amount|            features|
+------------------+----------+--------------------+
|  4.23433509619992|       3.7|[13.0,2.88,18.5,0...|
|2.5591914948221306|      2.26|(10,[0,1,2,5,6],[...|
|2.1824115482452453|      1.86|(10,[0,1,2,5,6],[...|
| 2.713104899806294|      2.49|(10,[0,1,2,5,6],[...|
| 4.304380112562949|      4.06|(10,[0,1,2,5,6],[...|
+------------------+----------+--------------------+
only showing top 5 rows



In [39]:
# Test-set metrics for the reduced linear model.
test_result_lm = lm.evaluate(v_yellow_test_reduced)

print(f"R Squared (R2) on test data = {test_result_lm.r2:g}")
print(f"Root Mean Squared Error (RMSE) on test data = {test_result_lm.rootMeanSquaredError:g}")
print(f"Mean Squared Error (MSE) on test data = {test_result_lm.meanSquaredError:g}")
print(f"Mean Absolute Error (MAE) on test data = {test_result_lm.meanAbsoluteError:g}")



R Squared (R2) on test data = 0.931308
Root Mean Squared Error (RMSE) on test data = 0.352211
Mean Squared Error (MSE) on test data = 0.124052
Mean Absolute Error (MAE) on test data = 0.286736


                                                                                

In [40]:
# rm is the ridge model: linear regression with elasticNetParam = 0 and the
# glmnet-selected lambda as regParam, fitted on the full training vector.
ridge_estimator = LinearRegression(
    featuresCol='features',
    labelCol='tip_amount',
    regParam=best_lambda_ridge,
    elasticNetParam=0,
)
rm = ridge_estimator.fit(v_yellow_train_full)

                                                                                

In [41]:
# Intercept and coefficients of the ridge model, labelled by feature name.
pd.DataFrame(
    {'coefficient': [rm.intercept, *rm.coefficients]},
    index=['intercept', *input_cols_full],
)

Unnamed: 0,coefficient
intercept,-0.0753
fare_amount,-0.0926
tolls_amount,-0.1027
trip_distance,-0.0599
total_amount,0.2495
is_weekend_boolean,0.0006
trip_length,-0.0058
Win_boolean,0.0003
PULocationID,0.0001
DOLocationID,0.0001


In [42]:
# Training-set fit metrics for the ridge model.
trainingSummary_rm = rm.summary

print(f"RMSE: {trainingSummary_rm.rootMeanSquaredError:f}")
print(f"r2: {trainingSummary_rm.r2:f}")
print(f"MSE: {trainingSummary_rm.meanSquaredError:f}")
print(f"MAE: {trainingSummary_rm.meanAbsoluteError:f}")


RMSE: 0.792833
r2: 0.622520
MSE: 0.628584
MAE: 0.402514


In [43]:
# Score the test frame with the ridge model and preview the predictions next
# to the observed tips.
rm_predictions = rm.transform(v_yellow_test_full)

rm_preview_cols = ["prediction", "tip_amount", "features"]
rm_predictions.select(*rm_preview_cols).show(5)

+------------------+----------+--------------------+
|        prediction|tip_amount|            features|
+------------------+----------+--------------------+
|3.0218192860164197|       3.7|[13.0,0.0,2.88,18...|
|2.1321807034003672|      2.26|(14,[0,2,3,5,7,8,...|
|1.7744462009012867|      1.86|(14,[0,2,3,5,7,8,...|
|1.7925143205911493|      2.49|(14,[0,2,3,5,7,8,...|
| 3.782685029998886|      4.06|(14,[0,2,3,5,7,8,...|
+------------------+----------+--------------------+
only showing top 5 rows



In [44]:
# Test-set metrics for the ridge model.
test_result_rm = rm.evaluate(v_yellow_test_full)

print(f"R Squared (R2) on test data = {test_result_rm.r2:g}")
print(f"Root Mean Squared Error (RMSE) on test data = {test_result_rm.rootMeanSquaredError:g}")
print(f"Mean Squared Error (MSE) on test data = {test_result_rm.meanSquaredError:g}")
print(f"Mean Absolute Error (MAE) on test data = {test_result_rm.meanAbsoluteError:g}")



R Squared (R2) on test data = 0.595039
Root Mean Squared Error (RMSE) on test data = 0.855176
Mean Squared Error (MSE) on test data = 0.731325
Mean Absolute Error (MAE) on test data = 0.37337


                                                                                