In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import log, monotonically_increasing_id
from pyspark.ml.stat import Correlation
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
spark = (
    SparkSession.builder.appName("model")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.extraJavaOptions", "-XX:+UseCompressedOops") \ 
    .config("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/08/23 12:08:24 WARN Utils: Your hostname, MinhVu resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/23 12:08:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 12:08:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading train data

In [3]:
train = spark.read.parquet('../data/curated/train_data')


                                                                                

#### Log-transformation

In [4]:
min_tmp = train.agg(F.min('tmp')).first()[0]
min_dew = train.agg(F.min('dew')).first()[0]

train = train.withColumn('log_miles', log(F.col('trip_miles') + 1))
train = train.withColumn('log_time', log(F.col('trip_time') + 1))
train = train.withColumn('log_tolls', log(F.col('tolls') + 1))
train = train.withColumn('log_wnd', log(F.col('wnd') + 1))
train = train.withColumn('log_tmp', log(F.col('tmp') - min_tmp + 1))
train = train.withColumn('log_dew', log(F.col('dew') - min_dew + 1))
train = train.withColumn('log_slp', log(F.col('slp') + 1))
train = train.withColumn('log_base_fare', log(F.col('base_passenger_fare') + 1))


                                                                                

#### Vector of selected features

In [5]:
features = 'features'
input_cols = ['hour', 'pulocationid', 'dolocationid', 'is_weekend', 'log_miles', 
              'log_time', 'log_tolls', 'log_wnd', 'log_tmp', 'log_dew', 'log_slp']


assembler = VectorAssembler(
    # which column to combine
    inputCols=input_cols, 
    # How should the combined columns be named
    outputCol=features
)

model = assembler.transform(train)

#### Linear Regression with Lasso regularization

In [6]:
lm = LinearRegression(
    featuresCol='features', 
    labelCol='log_base_fare',
    elasticNetParam=1
).fit(model)

24/08/23 12:08:57 WARN Instrumentation: [647b0d1c] regParam is zero, which might cause numerical instability and overfitting.
24/08/23 12:09:00 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/23 12:09:33 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [8]:
print(f"RMSE on train data: {lm.summary.rootMeanSquaredError}")
print(f"R2 on train data: {lm.summary.r2}")

RMSE on train data: 0.25411668727313813
R2 on train data: 0.7885271538245844


#### Reading test data

In [7]:
test = spark.read.parquet('../data/curated/test_data')

In [8]:
min_tmp_test = test.agg(F.min('tmp')).first()[0]
min_dew_test = test.agg(F.min('dew')).first()[0]

test = test.withColumn('log_miles', log(F.col('trip_miles') + 1))
test = test.withColumn('log_time', log(F.col('trip_time') + 1))
test = test.withColumn('log_tolls', log(F.col('tolls') + 1))
test = test.withColumn('log_wnd', log(F.col('wnd') + 1))
test = test.withColumn('log_tmp', log(F.col('tmp') - min_tmp_test + 1))
test = test.withColumn('log_dew', log(F.col('dew') - min_dew_test + 1))
test = test.withColumn('log_slp', log(F.col('slp') + 1))
test = test.withColumn('log_base_fare', log(F.col('base_passenger_fare') + 1))


In [9]:
test_model = assembler.transform(test)

In [25]:
predictions_lm = lm.transform(test_model)

rmse_evaluator = RegressionEvaluator(
    labelCol='log_base_fare', predictionCol='prediction', metricName='rmse')
rmse = rmse_evaluator.evaluate(predictions_lm)

r2_evaluator = RegressionEvaluator(
    labelCol='log_base_fare', predictionCol='prediction', metricName='r2')
r2 = r2_evaluator.evaluate(predictions_lm)

print(f"RMSE on test data = {rmse}")
print(f"R2 on test data = {r2}")



RMSE on test data = 0.2697466224952964
R2 on test data = 0.7790835852393875


                                                                                

[Stage 247:>                                                        (0 + 1) / 1]

### Gradient Boost Tree

In [11]:
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='log_base_fare',
).fit(model)

24/08/23 12:11:57 WARN MemoryStore: Not enough space to cache rdd_104_6 in memory! (computed 144.4 MiB so far)
24/08/23 12:11:57 WARN BlockManager: Persisting block rdd_104_6 to disk instead.
24/08/23 12:11:57 WARN MemoryStore: Not enough space to cache rdd_104_3 in memory! (computed 144.4 MiB so far)
24/08/23 12:11:57 WARN BlockManager: Persisting block rdd_104_3 to disk instead.
24/08/23 12:11:57 WARN MemoryStore: Not enough space to cache rdd_104_11 in memory! (computed 144.4 MiB so far)
24/08/23 12:11:57 WARN BlockManager: Persisting block rdd_104_11 to disk instead.
24/08/23 12:11:57 WARN MemoryStore: Not enough space to cache rdd_104_5 in memory! (computed 144.4 MiB so far)
24/08/23 12:11:57 WARN BlockManager: Persisting block rdd_104_5 to disk instead.
24/08/23 12:11:57 WARN MemoryStore: Not enough space to cache rdd_104_7 in memory! (computed 144.4 MiB so far)
24/08/23 12:11:57 WARN BlockManager: Persisting block rdd_104_7 to disk instead.
24/08/23 12:11:57 WARN MemoryStore: No

In [13]:
predictions = gbt.transform(model)
evaluator_rmse = RegressionEvaluator(labelCol="log_base_fare", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="log_base_fare", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"RMSE on train data = {rmse}")
print(f"R2 on train data = {r2}")



RMSE on train data = 0.23685939393961208
R2 on train data = 0.8162744874810608


                                                                                

In [26]:
predictions_gbt = gbt.transform(test_model)
evaluator_rmse = RegressionEvaluator(labelCol="log_base_fare", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="log_base_fare", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predictions_gbt)
r2 = evaluator_r2.evaluate(predictions_gbt)

print(f"RMSE on test data = {rmse}")
print(f"R2 on test data = {r2}")

24/08/23 13:34:53 WARN BasicWriteTaskStatsTracker: Expected 1 files, but only saw 0. This could be due to the output format not writing empty files, or files being not immediately visible in the filesystem.
24/08/23 13:34:53 ERROR FileFormatWriter: Aborting job 238ebd14-a351-4125-adea-283fc19b252e.
java.io.FileNotFoundException: File file:/mnt/c/Users/ADMIN/Desktop/project-1-individual-MaveVu/data/curated/test_pred/_temporary/0 does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:597)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.j

RMSE on test data = 0.252767076620354
R2 on test data = 0.806019994823415


                                                                                

Adding id since spark dataframe does not guarantee the order of rows

In [27]:
test = test.withColumn('id1', monotonically_increasing_id())
predictions_gbt = predictions_gbt.withColumn('id1', monotonically_increasing_id())
predictions_lm = predictions_lm.withColumn('id1', monotonically_increasing_id())

In [28]:
predictions_gbt = predictions_gbt.select("id1", "prediction").withColumnRenamed("prediction", "pred_gbt")
predictions_lm = predictions_lm.select("id1", "prediction").withColumnRenamed("prediction", "pred_lm")

#### Merge the predictions into the dataframe and store it

In [23]:
sel_test = test.select('date', 'hour', 'pulocationid', 'dolocationid', 'is_weekend', 'trip_miles', 
              'trip_time', 'tolls', 'wnd', 'tmp', 'dew', 'slp', 'base_passenger_fare')
merged_df = sel_test.join(predictions_gbt, 'id1', 'inner') \
                .join(predictions_lm, 'id1', 'inner')

In [29]:
merged_df \
    .coalesce(1) \
    .write \
    .mode('overwrite') \
    .parquet('../data/curated/test_pred')

                                                                                

In [30]:
spark.stop()