In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, minute, dayofweek, month, sqrt, pow
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Obtain the Spark session for this notebook (getOrCreate reuses an already
# running session when there is one) under a descriptive application name.
spark = (
    SparkSession.builder
    .appName("NYCTaxiTripDurationRegression")
    .getOrCreate()
)

# Data preparation

In [4]:
# Read the raw trip records: the first CSV row provides the column names and
# the column types are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../../../data/train.csv")
)

# Preview the first five rows to sanity-check the parsed schema.
data.show(5)

                                                                                

+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1| -73.9821548461914| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|id2377394|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423828|40.738563537597656|-73.99948120117188| 40.73115158081055|                 N|          663|
|id3858529|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|    

# Data preprocessing

- **Columns**:
  - `id`: Unique identifier for each trip.
  - `vendor_id`: ID of the taxi vendor.
  - `pickup_datetime` and `dropoff_datetime`: Timestamps for the start and end of the trip.
  - `passenger_count`: Number of passengers in the taxi.
  - `pickup_longitude` and `pickup_latitude`: GPS coordinates of the pickup location.
  - `dropoff_longitude` and `dropoff_latitude`: GPS coordinates of the dropoff location.
  - `store_and_fwd_flag`: Whether the trip record was held in the vehicle's memory before sending to the server (`Y` or `N`).
  - `trip_duration`: Duration of the trip in seconds.

1. **Feature Extraction**:
  - Extracted additional features such as:
    - `pickup_minutes`: Pickup time of day, expressed as minutes elapsed since midnight (`hour * 60 + minute`).
    - `pickup_dayofweek`: Day of the week.
    - `pickup_month`: Month of the year.
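    - `distance`: Euclidean distance between pickup and dropoff locations, computed directly on the longitude/latitude values (so it is expressed in degrees, not kilometres).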

2. **Filtering Invalid Data**:
  - Removed trips with:
    - `passenger_count` less than or equal to 0.
    - `trip_duration` of 22 hours or more (extreme outliers).
    - `distance` less than or equal to 0.

3. **Feature Assembly**:
  - Combined relevant features into a single vector using `VectorAssembler`. The selected features include:
    - `passenger_count`
    - `pickup_longitude`
    - `pickup_latitude`
    - `distance`
    - `pickup_minutes`
    - `pickup_dayofweek`
    - `pickup_month`

4. **Data Transformation**:
  - Transformed the data into a format suitable for machine learning by creating a `features` column and retaining the target variable `trip_duration`.

In [5]:
# --- Feature extraction -----------------------------------------------------
# Coordinate differences between pickup and dropoff, in degrees.
lon_delta = data["pickup_longitude"] - data["dropoff_longitude"]
lat_delta = data["pickup_latitude"] - data["dropoff_latitude"]

# Time of day of the pickup as minutes elapsed since midnight.
minutes_since_midnight = hour("pickup_datetime") * 60 + minute("pickup_datetime")

data = (
    data
    .withColumn("pickup_minutes", minutes_since_midnight)
    # Spark's dayofweek numbers the days 1 (Sunday) through 7 (Saturday).
    .withColumn("pickup_dayofweek", dayofweek("pickup_datetime"))
    .withColumn("pickup_month", month("pickup_datetime"))
    # Straight-line (Euclidean) distance on raw coordinates: the result is in
    # degrees, not in metres or kilometres.
    .withColumn("distance", sqrt(pow(lon_delta, 2) + pow(lat_delta, 2)))
)

# --- Filtering --------------------------------------------------------------
# Keep only trips with at least one passenger, a duration below 22 hours
# (trip_duration is in seconds) and a non-zero travelled distance.
data = data.filter(
    "passenger_count > 0 AND trip_duration < 22 * 3600 AND distance > 0"
)

In [6]:
# Model inputs. The position of each name is its index inside the assembled
# vector, which is how the fitted tree refers to it ("feature 0", "feature 1", ...).
feature_columns = [
    "passenger_count",   # feature 0
    "pickup_longitude",  # feature 1
    "pickup_latitude",   # feature 2
    "distance",          # feature 3
    "pickup_minutes",    # feature 4
    "pickup_dayofweek",  # feature 5
    "pickup_month",      # feature 6
]

# Pack the selected columns into a single vector column named "features".
assembler = (
    VectorAssembler()
    .setInputCols(feature_columns)
    .setOutputCol("features")
)
assembled_data = assembler.transform(data)

# Model Training
- Split the data into training (80%) and testing (20%) sets.
- Train a Decision Tree Regressor model on the training set.

In [7]:
# Keep only what the regressor consumes: the feature vector and the label.
assembled_data = assembled_data.select("features", "trip_duration")

# 80/20 train/test split, seeded so the same split is drawn on every run.
train, test = assembled_data.randomSplit([0.8, 0.2], seed=42)

# Decision tree limited to 10 levels, with at least 10 training rows required
# in every child node produced by a split.
dt = (
    DecisionTreeRegressor(featuresCol="features", labelCol="trip_duration", seed=42)
    .setMaxDepth(10)
    .setMinInstancesPerNode(10)
)

# Fit the tree on the training split only.
model = dt.fit(train)

                                                                                

# Model Evaluation
  - Display the structure of the trained Decision Tree model.
  - Analyze the importance of each feature used in the model.
  - Evaluate the model's performance on the test data using metrics such as RMSE, MAE, R^2.

In [None]:
# Print the learned tree as nested if/else rules. The "feature N" indices in
# the dump follow the order of `feature_columns`.
tree_description = model.toDebugString
print(tree_description)


DecisionTreeRegressionModel: uid=DecisionTreeRegressor_11b551bbac23, depth=10, numNodes=1957, numFeatures=7
  If (feature 3 <= 0.04326144704693165)
   If (feature 3 <= 0.01964442331284995)
    If (feature 3 <= 0.012479203006089328)
     If (feature 3 <= 0.008896234091292792)
      If (feature 2 <= 40.768693923950195)
       If (feature 4 <= 487.5)
        If (feature 0 <= 4.5)
         If (feature 2 <= 40.675804138183594)
          If (feature 1 <= -74.00863647460938)
           Predict: 4341.823529411765
          Else (feature 1 > -74.00863647460938)
           If (feature 5 <= 6.5)
            Predict: 173.91772151898735
           Else (feature 5 > 6.5)
            Predict: 516.3103448275862
         Else (feature 2 > 40.675804138183594)
          If (feature 1 <= -74.0005111694336)
           If (feature 4 <= 365.5)
            Predict: 262.20385947550716
           Else (feature 4 > 365.5)
            Predict: 460.42314049586776
          Else (feature 1 > -74.0005111694336)
    

In [None]:
# Report the importance score the fitted tree assigns to each input feature.
# Scores are paired with names by position, i.e. in `feature_columns` order.
importances = model.featureImportances
for name, score in zip(feature_columns, importances):
    print(f"{name}: {score}")

passenger_count: 0.0018307729216504756
pickup_longitude: 0.014158306840187264
pickup_latitude: 0.009428885032823246
distance: 0.8539077587500385
pickup_minutes: 0.08235450270570706
pickup_dayofweek: 0.02988974396392313
pickup_month: 0.008430029785670413


In [9]:
# Score the held-out test split and preview a few predictions next to the
# true trip durations.
predictions = model.transform(test)
predictions.show(10)

# The three evaluators read the same label/prediction columns and differ only
# in the metric they compute.
evaluator_columns = {"labelCol": "trip_duration", "predictionCol": "prediction"}
mae_evaluator = RegressionEvaluator(metricName="mae", **evaluator_columns)
rmse_evaluator = RegressionEvaluator(metricName="rmse", **evaluator_columns)
r2_evaluator = RegressionEvaluator(metricName="r2", **evaluator_columns)

# Mean absolute error (same unit as trip_duration, i.e. seconds).
mae = mae_evaluator.evaluate(predictions)
print("MAE on test data =", mae)

# Root mean squared error (also in seconds; penalises large misses more).
rmse = rmse_evaluator.evaluate(predictions)
print("RMSE on test data =", rmse)

# Coefficient of determination.
r2 = r2_evaluator.evaluate(predictions)
print("R² on test data =", r2)

                                                                                

+--------------------+-------------+------------------+
|            features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,-74.34706878...|           22|262.20385947550716|
|[1.0,-74.15783691...|           97|269.85399594677943|
|[1.0,-74.07962036...|          912| 656.8158499234303|
|[1.0,-74.03789520...|          290| 309.1060063643596|
|[1.0,-74.02536010...|         1582| 2862.211155378486|
|[1.0,-74.01790618...|          253| 367.7642677751797|
|[1.0,-74.01776885...|         2964|2699.3333333333335|
|[1.0,-74.01737976...|         1171|1694.4221635883905|
|[1.0,-74.01728057...|          705| 859.3492103410119|
|[1.0,-74.01727294...|          849| 549.1835645677695|
+--------------------+-------------+------------------+
only showing top 10 rows



                                                                                

MAE on test data = 234.37723035998286


                                                                                

RMSE on test data = 615.5684958134535


[Stage 32:===>                                                    (1 + 15) / 16]

R² on test data = 0.44791151771176485


                                                                                