# Modelling 

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType


In [5]:
# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)

24/08/30 02:10:37 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 100.94.176.147 instead (on interface en0)
24/08/30 02:10:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 02:10:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/30 02:10:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Dataset

In [6]:
# import the data
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

In [7]:
# Load the data
df = spark.read.parquet("../data/curated/tlc_data/first_cleaned.parquet")

                                                                                

In [8]:
# Drop missing values
df = df.dropna()

In [9]:
# compare the % of rows before and after dropping missing values
print(f"Number of rows before dropping missing values: {df.count()}")
print(f"Number of rows after dropping missing values: {df.dropna().count()}")
print(f"Percentage of rows removed: {100*(1 - df.dropna().count()/df.count()):.2f}%")

Number of rows before dropping missing values: 38461
Number of rows after dropping missing values: 38461
Percentage of rows removed: 0.00%


## Feature selection

In [10]:
# List of columns to drop
columns_to_drop = [
    'VendorID', 'passenger_count', 'RatecodeID', 'store_and_fwd_flag',
    'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 
    'tolls_amount', 'improvement_surcharge', 'total_amount', 
    'congestion_surcharge', 'ehail_fee', 'DOLocationID', 'DOBorough','dropoff_hour',
    'trip_distance', 'trip_duration'
]

# Dropping all specified columns at once in PySpark
df = df.drop(*columns_to_drop) 

In [11]:
# check the schema
df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- hourly_trip_count: long (nullable = true)
 |-- Number of Events: long (nullable = true)
 |-- CIG: double (nullable = true)
 |-- WND: double (nullable = true)
 |-- VIS: double (nullable = true)
 |-- TMP: double (nullable = true)
 |-- DEW: double (nullable = true)
 |-- SLP: double (nullable = true)



## Data split

In [13]:
from pyspark.sql.functions import col, month

# Filter the data by months for training, validation, and test sets
train_df = df.filter((month(col("pickup_date")).isin(7, 8, 9, 10)))
validation_df = df.filter((month(col("pickup_date")) == 11))
test_df = df.filter((month(col("pickup_date")) == 12))

In [14]:
train_df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in train_df.columns]).show()

+--------+---------+-----------+-----------+-----------------+----------------+---+---+---+---+---+---+
|datetime|PUBorough|pickup_date|pickup_hour|hourly_trip_count|Number of Events|CIG|WND|VIS|TMP|DEW|SLP|
+--------+---------+-----------+-----------+-----------------+----------------+---+---+---+---+---+---+
|       0|        0|          0|          0|                0|               0|  0|  0|  0|  0|  0|  0|
+--------+---------+-----------+-----------+-----------------+----------------+---+---+---+---+---+---+



## Linear regression

In [15]:
# List of categorical and numerical columns
categorical_columns = ['PUBorough'] 
numerical_columns = [
    'pickup_hour', 
    'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP', 'Number of Events'
]

In [16]:
# Indexing and Encoding categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep") for col in categorical_columns]
encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_ohe") for col in categorical_columns]

In [17]:
# Assemble the feature vector
assembler = VectorAssembler(
    inputCols=[
        'pickup_hour', 
        'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP', 'Number of Events',
        'PUBorough_ohe'
    ], 
    outputCol="features"
)


In [18]:
# Standardization
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")


In [19]:
# Linear Regression
lr = LinearRegression(featuresCol="scaled_features", labelCol="hourly_trip_count")

In [20]:
# Pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, lr])

In [21]:
# Fit the model on the training set
model = pipeline.fit(train_df)

24/08/30 02:10:57 WARN Instrumentation: [039ef752] regParam is zero, which might cause numerical instability and overfitting.
24/08/30 02:10:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/08/30 02:10:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/08/30 02:10:58 WARN Instrumentation: [039ef752] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


In [22]:
# Mean hourly trip count
mean_hourly_trip_count = train_df.agg(F.mean("hourly_trip_count")).head()[0]
print(f"Mean hourly trip count: {mean_hourly_trip_count}")

Mean hourly trip count: 792.0786824907984


In [23]:
# Evaluate the model on the validation set
validation_predictions = model.transform(validation_df)
evaluator = RegressionEvaluator(labelCol="hourly_trip_count", predictionCol="prediction", metricName="rmse")
validation_rmse = evaluator.evaluate(validation_predictions)
print(f"Root Mean Squared Error (RMSE) on validation data = {validation_rmse}")

Root Mean Squared Error (RMSE) on validation data = 1001.8514312623018


In [24]:
# If satisfied with validation performance, evaluate on the test set
test_predictions = model.transform(test_df)
test_rmse = evaluator.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse}")

Root Mean Squared Error (RMSE) on test data = 1056.3164933934179


### Feature Analysis 

In [31]:
coefficients = model.coefficients

AttributeError: 'PipelineModel' object has no attribute 'coefficients'

In [None]:
# List of features used
features = [
    'pickup_hour', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP', 'Number of Events', 'PUBorough_ohe'
]

# Combine features with their corresponding coefficients
feature_coefficients = [(features[i], float(coefficients[i])) for i in range(len(features))]

# Define schema explicitly
schema = StructType([
    StructField("Feature", StringType(), True),
    StructField("Coefficient", DoubleType(), True)
])

# Convert coefficients to a DataFrame with the specified schema
feature_coefficients_df = spark.createDataFrame(feature_coefficients, schema=schema)

# Show the coefficients sorted by their absolute value
feature_coefficients_df.orderBy(abs(feature_coefficients_df.Coefficient), ascending=False).show()


## Gradient Boosting


In [26]:
# Gradient Boosted Trees Regressor
gbt = GBTRegressor(featuresCol="scaled_features", labelCol="hourly_trip_count", maxIter=100, maxDepth=5)

# Pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, gbt])

# Fit the model on the training set
model = pipeline.fit(train_df)




CodeCache: size=131072Kb used=42807Kb max_used=43159Kb free=88264Kb
 bounds [0x00000001071e8000, 0x0000000109c58000, 0x000000010f1e8000]
 total_blobs=16249 nmethods=14321 adapters=1839
 compilation: disabled (not enough contiguous free space left)


In [28]:
# Evaluate the model on the validation set
validation_predictions = model.transform(validation_df)
evaluator = RegressionEvaluator(labelCol="hourly_trip_count", predictionCol="prediction", metricName="rmse")
validation_rmse = evaluator.evaluate(validation_predictions)
print(f"Root Mean Squared Error (RMSE) on validation data = {validation_rmse}")

# If satisfied with validation performance, evaluate on the test set
test_predictions = model.transform(test_df)
test_rmse = evaluator.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse}")

Root Mean Squared Error (RMSE) on validation data = 920.9909098926162
Root Mean Squared Error (RMSE) on test data = 1080.0764412742965


### Feature analysis 

In [29]:
coefficients = model.coefficients

AttributeError: 'PipelineModel' object has no attribute 'coefficients'

In [27]:
# List of features used
features = [
    'pickup_hour', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP', 'Number of Events', 'PUBorough_ohe'
]

# Combine features with their corresponding coefficients
feature_coefficients = [(features[i], float(coefficients[i])) for i in range(len(features))]

# Define schema explicitly
schema = StructType([
    StructField("Feature", StringType(), True),
    StructField("Coefficient", DoubleType(), True)
])

# Convert coefficients to a DataFrame with the specified schema
feature_coefficients_df = spark.createDataFrame(feature_coefficients, schema=schema)

# Show the coefficients sorted by their absolute value
feature_coefficients_df.orderBy(abs(feature_coefficients_df.Coefficient), ascending=False).show()


NameError: name 'coefficients' is not defined

## Interpret the model