# Modelling 

In [23]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor



In [2]:
# Build (or reuse) the notebook's Spark session.
# Settings are kept in one mapping so they are easy to scan and tune;
# driver and executor memory are both raised to 8 GB.
SPARK_SETTINGS = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
}

session_builder = SparkSession.builder.appName("ADS Project1")
for option, setting in SPARK_SETTINGS.items():
    session_builder = session_builder.config(option, setting)
spark = session_builder.getOrCreate()

24/08/29 18:28:47 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 172.16.119.20 instead (on interface en0)
24/08/29 18:28:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/29 18:28:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Dataset

In [3]:
# Set the session's Parquet compression codec to gzip.
# NOTE(review): this does not load anything (the read happens in the next cell);
# presumably it only matters when Parquet is written — confirm it is still needed here.
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [4]:
# Load the first-pass cleaned TLC dataset from the curated data folder
# (path is relative to the notebook's working directory).
df = spark.read.parquet("../data/curated/tlc_data/first_cleaned.parquet")

                                                                                

In [5]:
# Drop rows with missing values.
# `df` is rebound here, so the unfiltered frame is no longer reachable under that name.
df = df.dropna()

In [6]:
# Compare row counts before and after dropping missing values.
# `df` has already been filtered by dropna() in the cell above, so counting `df`
# for the "before" figure always reports 0% removed. The "before" count is taken
# from the source file instead, and every count is computed once and reused
# (each .count() is a full Spark job).
rows_before = spark.read.parquet("../data/curated/tlc_data/first_cleaned.parquet").count()
rows_after = df.dropna().count()
# Guard against an empty input file so the percentage never divides by zero.
pct_removed = 100 * (1 - rows_after / rows_before) if rows_before else 0.0

print(f"Number of rows before dropping missing values: {rows_before}")
print(f"Number of rows after dropping missing values: {rows_after}")
print(f"Percentage of rows removed: {pct_removed:.2f}%")

24/08/29 18:28:51 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Number of rows before dropping missing values: 21586888


                                                                                

Number of rows after dropping missing values: 21586888




Percentage of rows removed: 0.00%


                                                                                

## Feature selection

In [7]:
# Columns excluded from modelling; everything not listed here is kept.
columns_to_drop = [
    'VendorID',
    'passenger_count',
    'RatecodeID',
    'store_and_fwd_flag',
    'payment_type',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'ehail_fee',
    'DOLocationID',
    'DOBorough',
    'dropoff_hour',
    'trip_distance',
    'trip_duration',
]

# Remove them in a single drop() call by unpacking the list.
df = df.drop(*columns_to_drop)

In [8]:
# Print the schema to confirm which columns (and types) remain after the drop above.
df.printSchema()

root
 |-- PULocationID: integer (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- dropoff_date: date (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- hourly_trip_count: long (nullable = true)
 |-- daily_trip_count: long (nullable = true)
 |-- CIG: double (nullable = true)
 |-- WND: double (nullable = true)
 |-- VIS: double (nullable = true)
 |-- TMP: double (nullable = true)
 |-- DEW: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- Number of Events: long (nullable = true)



## Data split

In [9]:
# Row-level random split: 60% training, 20% validation, 20% test (fixed seed).
# NOTE(review): if several rows share the same date/hour/location and therefore the
# same hourly_trip_count, this split places near-duplicates of training rows in the
# validation and test sets — confirm that is intended.
SPLIT_WEIGHTS = [0.6, 0.2, 0.2]
train_df, validation_df, test_df = df.randomSplit(weights=SPLIT_WEIGHTS, seed=42)

In [10]:
# Sanity check: count null values per column in the training split.
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in train_df.columns
]
train_df.select(null_count_exprs).show()

                                                                                

+------------+-----------+-----------+------------+---------+-----------------+----------------+---+---+---+---+---+---+----------------+
|PULocationID|pickup_date|pickup_hour|dropoff_date|PUBorough|hourly_trip_count|daily_trip_count|CIG|WND|VIS|TMP|DEW|SLP|Number of Events|
+------------+-----------+-----------+------------+---------+-----------------+----------------+---+---+---+---+---+---+----------------+
|           0|          0|          0|           0|        0|                0|               0|  0|  0|  0|  0|  0|  0|               0|
+------------+-----------+-----------+------------+---------+-----------------+----------------+---+---+---+---+---+---+----------------+



## Linear regression

In [11]:
# Cast PULocationID to string in every split so it is handled as a categorical
# feature by the StringIndexer (DOLocationID was dropped earlier, so only the
# pickup location needs converting).
train_df, validation_df, test_df = (
    split.withColumn("PULocationID", F.col("PULocationID").cast("string"))
    for split in (train_df, validation_df, test_df)
)

In [12]:
# Feature columns, grouped by how they are pre-processed.
# Categorical columns are indexed and one-hot encoded.
categorical_columns = [
    'PUBorough',
    'PULocationID',
]
# Numerical columns go into the feature vector as-is.
numerical_columns = [
    'pickup_hour',
    'CIG',
    'WND',
    'VIS',
    'TMP',
    'DEW',
    'SLP',
    'Number of Events',
]

In [13]:
# Indexing and Encoding categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep") for col in categorical_columns]
encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_ohe") for col in categorical_columns]

In [14]:
# Assemble the feature vector.
# The inputs are derived from the column lists defined above rather than repeated
# by hand, so the assembler cannot drift out of sync with `numerical_columns`,
# `categorical_columns`, or the `<name>_ohe` outputs produced by the encoders.
# Resulting order: numerical columns first, then the one-hot encoded categoricals.
assembler = VectorAssembler(
    inputCols=numerical_columns + [name + "_ohe" for name in categorical_columns],
    outputCol="features"
)


In [15]:
# Standardization: rescale the assembled "features" vector into "scaled_features".
# No withMean/withStd arguments are passed, so the library defaults apply
# (presumably unit standard deviation without mean-centering — confirm for the installed PySpark).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")


In [16]:
# Linear Regression on the scaled features, predicting hourly_trip_count.
# No regularisation is configured: the recorded fit below warns that regParam is zero
# and that the Cholesky solver failed on a singular covariance matrix before falling
# back to Quasi-Newton.
# NOTE(review): the singularity is presumably caused by collinear one-hot features
# (PUBorough looks derivable from PULocationID) — consider a small regParam or
# dropping one of the two.
lr = LinearRegression(featuresCol="scaled_features", labelCol="hourly_trip_count")

In [17]:
# Pipeline: index categoricals -> one-hot encode -> assemble -> scale -> linear regression.
# Stage order matters because each stage reads the output column of an earlier one.
pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, lr])

In [18]:
# Fit every pipeline stage (indexers, encoders, scaler, regression) on the training split only.
# NOTE: the names `pipeline` and `model` are reused by the Gradient Boosting section further down.
model = pipeline.fit(train_df)

24/08/29 18:29:29 WARN Instrumentation: [06c2e25a] regParam is zero, which might cause numerical instability and overfitting.
24/08/29 18:29:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/29 18:29:43 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/08/29 18:29:43 WARN Instrumentation: [06c2e25a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [19]:
# Mean of the label on the training split, printed as a reference scale for the RMSE values below.
mean_row = train_df.agg(F.mean("hourly_trip_count")).head()
mean_hourly_trip_count = mean_row[0]
print(f"Mean hourly trip count: {mean_hourly_trip_count}")



Mean hourly trip count: 5005.513692548508


                                                                                

In [20]:
# RMSE evaluator comparing the "prediction" column against the hourly_trip_count label.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="hourly_trip_count",
    metricName="rmse",
)

# Score the fitted pipeline on the validation split.
validation_predictions = model.transform(validation_df)
validation_rmse = evaluator.evaluate(validation_predictions)
print(f"Root Mean Squared Error (RMSE) on validation data = {validation_rmse}")



Root Mean Squared Error (RMSE) on validation data = 1250.82091726507


                                                                                

In [21]:
# If satisfied with validation performance, evaluate on the held-out test set.
# Reuses the RMSE `evaluator` created in the validation cell above.
test_predictions = model.transform(test_df)
test_rmse = evaluator.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse}")



Root Mean Squared Error (RMSE) on test data = 1251.5906354522426


                                                                                

## Gradient Boosting


In [24]:
# Gradient Boosted Trees Regressor.
# Tree splits depend only on the ordering of each feature's values, so standardising
# the features is redundant for this model. The regressor reads the assembled
# "features" vector directly and the StandardScaler stage is left out of the pipeline,
# which avoids an extra pass over the data and an extra vector column. The recorded
# run of this cell hit cache-memory warnings and was interrupted.
gbt = GBTRegressor(featuresCol="features", labelCol="hourly_trip_count", maxIter=100, maxDepth=5)

# Pipeline: index categoricals -> one-hot encode -> assemble -> GBT.
# NOTE: `pipeline` and `model` deliberately keep their names because the evaluation
# cell below reads `model`; this overwrites the linear-regression pipeline and model.
pipeline = Pipeline(stages=indexers + encoders + [assembler, gbt])

# Fit the model on the training set
model = pipeline.fit(train_df)


24/08/29 18:42:18 WARN DAGScheduler: Broadcasting large task binary with size 1153.0 KiB
24/08/29 18:42:19 WARN MemoryStore: Not enough space to cache rdd_162_4 in memory! (computed 163.1 MiB so far)
24/08/29 18:42:21 WARN MemoryStore: Not enough space to cache rdd_162_2 in memory! (computed 244.6 MiB so far)
24/08/29 18:42:21 WARN MemoryStore: Not enough space to cache rdd_162_1 in memory! (computed 244.6 MiB so far)
24/08/29 18:42:21 WARN MemoryStore: Not enough space to cache rdd_162_5 in memory! (computed 244.6 MiB so far)
24/08/29 18:42:21 WARN MemoryStore: Not enough space to cache rdd_162_3 in memory! (computed 244.6 MiB so far)
24/08/29 18:42:21 WARN MemoryStore: Not enough space to cache rdd_162_0 in memory! (computed 244.6 MiB so far)
24/08/29 18:42:24 WARN MemoryStore: Not enough space to cache rdd_162_7 in memory! (computed 163.1 MiB so far)
24/08/29 18:42:25 WARN MemoryStore: Not enough space to cache rdd_162_6 in memory! (computed 369.4 MiB so far)
24/08/29 18:44:49 WARN 

KeyboardInterrupt: 

24/08/29 18:55:34 WARN MemoryStore: Not enough space to cache rdd_222_7 in memory! (computed 788.0 MiB so far)
24/08/29 18:55:35 WARN MemoryStore: Not enough space to cache rdd_222_7 in memory! (computed 232.5 MiB so far)
24/08/29 18:55:35 WARN MemoryStore: Not enough space to cache rdd_222_7 in memory! (computed 12.4 MiB so far)
24/08/29 18:55:35 WARN MemoryStore: Not enough space to cache rdd_292_3 in memory! (computed 29.2 MiB so far)
24/08/29 18:55:35 WARN MemoryStore: Not enough space to cache rdd_270_7 in memory! (computed 2.4 MiB so far)
24/08/29 18:55:35 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_292_7 in memory.
24/08/29 18:55:35 WARN MemoryStore: Not enough space to cache rdd_292_7 in memory! (computed 384.0 B so far)
24/08/29 18:55:35 WARN BlockManager: Persisting block rdd_292_7 to disk instead.
24/08/29 18:55:35 WARN MemoryStore: Not enough space to cache rdd_295_3 in memory! (computed 1537.3 KiB so far)
24/08/29 18:5

In [None]:
# Evaluate the Gradient Boosting pipeline with RMSE on the validation and test splits.
# NOTE: `model` must be the pipeline fitted in the Gradient Boosting cell above. If that
# fit was interrupted before finishing, `model` still refers to the linear-regression
# pipeline and the numbers printed here describe that model instead.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="hourly_trip_count",
    metricName="rmse",
)

# Validation split first ...
validation_predictions = model.transform(validation_df)
validation_rmse = evaluator.evaluate(validation_predictions)
print(f"Root Mean Squared Error (RMSE) on validation data = {validation_rmse}")

# ... then, if satisfied with validation performance, the held-out test split.
test_predictions = model.transform(test_df)
test_rmse = evaluator.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse}")

## Interpret the model

### Identify which features are most important in making predictions