# Modelling 

In [5]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [6]:
# Build (or reuse) the Spark session for this notebook.
# Settings are applied to the builder in the order listed below.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "8g",    # driver memory: 8GB
    "spark.executor.memory": "8g",  # executor memory: 8GB
}

session_builder = SparkSession.builder.appName("ADS Project1")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

## Dataset

In [17]:
# Select gzip as the session's Parquet compression codec
parquet_codec = "gzip"
spark.conf.set("spark.sql.parquet.compression.codec", parquet_codec)

In [18]:
# Read the first-pass cleaned TLC dataset from the curated Parquet output
cleaned_data_path = "../data/curated/tlc_data/first_cleaned.parquet"
df = spark.read.parquet(cleaned_data_path)

In [22]:
# Collect the column names of the loaded DataFrame (a plain Python list of strings)
features = df.columns
# Display the available column names
print(features)

['DOLocationID', 'PULocationID', 'pickup_date', 'pickup_hour', 'dropoff_date', 'dropoff_hour', 'VendorID', 'passenger_count', 'trip_distance', 'trip_duration', 'RatecodeID', 'store_and_fwd_flag', 'PUBorough', 'DOBorough', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'ehail_fee', 'hourly_trip_count', 'daily_trip_count', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP', 'Number of Events']


## Data split

In [19]:
# Randomly partition the rows into training (60%), validation (20%) and test (20%).
# The fixed seed is passed so the split uses the same seed on every run.
split_weights = [0.6, 0.2, 0.2]
train_df, validation_df, test_df = df.randomSplit(split_weights, seed=42)

## Linear regression

In [None]:
# Categorical predictors to one-hot encode.
# These replace the template placeholders ('categorical_column1', ...), which are
# not columns of df and would make the pipeline fail when fitted.
# NOTE(review): the borough columns are a starting selection — confirm the final
# set of categorical features for the model.
categorical_columns = ['PUBorough', 'DOBorough']

# Indexers: convert each categorical column to numeric indices.
# handleInvalid="keep" routes values not seen while fitting (possible after a
# random train/validation/test split) to an extra index instead of raising.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical_columns
]

# Encoders: convert the numeric indices to one-hot encoded vectors
encoders = [
    OneHotEncoder(inputCol=column + "_index", outputCol=column + "_ohe")
    for column in categorical_columns
]

In [None]:
# Numeric predictors fed to the model alongside the one-hot encoded categoricals.
# NOTE(review): assumes these columns are numeric and free of nulls in the cleaned
# data (VectorAssembler raises on nulls by default) — confirm and adjust the list.
numeric_columns = ['pickup_hour', 'CIG', 'WND', 'VIS', 'TMP', 'DEW', 'SLP']

# Assemble the feature vector from the encoded categoricals and the numeric columns
assembler = VectorAssembler(
    inputCols=[column + "_ohe" for column in categorical_columns] + numeric_columns,
    outputCol="features",
)

# Standardization
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Linear Regression on the scaled features, predicting the hourly trip count
lr = LinearRegression(featuresCol="scaled_features", labelCol="hourly_trip_count")

# Pipeline: index -> one-hot encode -> assemble -> scale -> regress
pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, lr])

# Fit the model on the training set
model = pipeline.fit(train_df)

# Evaluate the model on the validation set
validation_predictions = model.transform(validation_df)
evaluator = RegressionEvaluator(labelCol="hourly_trip_count", predictionCol="prediction", metricName="rmse")
validation_rmse = evaluator.evaluate(validation_predictions)
print(f"Root Mean Squared Error (RMSE) on validation data = {validation_rmse}")

# If satisfied with validation performance, evaluate on the test set
test_predictions = model.transform(test_df)
test_rmse = evaluator.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {test_rmse}")


## Random Forest 