In [3]:
from pyspark.sql import SparkSession
from math import sqrt, pow

from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import RegressionMetrics

from datetime import datetime

# Build (or reuse) the Spark session for this notebook: local master,
# log level set to ERROR. The SparkContext is kept in `sc` because the
# RDD-based MLlib API used below is driven from it.
spark = (
    SparkSession.builder
    .appName("NYCTaxiTripDurationRegression")
    .master("local[*]")
    .config("spark.log.level", "ERROR")
    .getOrCreate()
)
sc = spark.sparkContext

# Data preparation

In [4]:
# Read the training CSV as an RDD of raw text lines
raw_lines = sc.textFile("../../../data/train.csv")

# The first line is the column header; keep every line that differs from it
header = raw_lines.first()
data_rdd = raw_lines.filter(lambda line: line != header)

                                                                                

# Data preprocessing

- **Columns**:
  - `id`: Unique identifier for each trip.
  - `vendor_id`: ID of the taxi vendor.
  - `pickup_datetime` and `dropoff_datetime`: Timestamps for the start and end of the trip.
  - `passenger_count`: Number of passengers in the taxi.
  - `pickup_longitude` and `pickup_latitude`: GPS coordinates of the pickup location.
  - `dropoff_longitude` and `dropoff_latitude`: GPS coordinates of the dropoff location.
  - `store_and_fwd_flag`: Whether the trip record was held in the vehicle's memory before sending to the server (`Y` or `N`).
  - `trip_duration`: Duration of the trip in seconds.

1. **Feature Extraction**:
  - Extracted additional features such as:
    - `pickup_minutes`: Total minutes from the start of the day.
    - `pickup_dayofweek`: Day of the week (1 = Monday, ..., 7 = Sunday).
    - `pickup_month`: Month of the year.
    - `distance`: Euclidean distance between the pickup and dropoff coordinates, computed directly on longitude/latitude (i.e. in degrees).

2. **Filtering Invalid Data**:
  - Removed trips with:
    - `passenger_count` less than or equal to 0.
    - `trip_duration` of 22 hours or more (extreme outliers).
    - `distance` less than or equal to 0.

3. **Feature Assembly**:
  - Combined relevant features into a single list per trip, which is later wrapped in an MLlib `LabeledPoint`. The selected features include:
    - `passenger_count`
    - `pickup_latitude`
    - `pickup_longitude`
    - `distance`
    - `pickup_minutes`
    - `pickup_dayofweek`
    - `pickup_month`

4. **Data Transformation**:
  - Transformed the data into a format suitable for machine learning by wrapping each trip in a `LabeledPoint` whose features are the values listed above and whose label is the target variable `trip_duration`.


In [5]:
def process_trip_line(line):
    """Turn one comma-separated trip record into a flat feature row.

    The line is split on "," and the following positions are read:
    2 (pickup timestamp, "%Y-%m-%d %H:%M:%S"), 4 (passenger count),
    5-8 (pickup longitude, pickup latitude, dropoff longitude,
    dropoff latitude) and 10 (trip duration).

    Returns:
        A list ``[passenger_count, pickup_latitude, pickup_longitude,
        distance, pickup_minutes, pickup_dayofweek, pickup_month,
        trip_duration]`` where ``distance`` is the Euclidean distance
        between the pickup and dropoff coordinates, ``pickup_minutes`` is
        minutes since midnight, and ``pickup_dayofweek`` runs from
        1 (Monday) to 7 (Sunday).

    Raises:
        ValueError: if a field cannot be parsed as a timestamp or number.
        IndexError: if the line has too few fields.
    """
    cols = line.split(",")

    picked_up_at = datetime.strptime(cols[2], "%Y-%m-%d %H:%M:%S")
    n_passengers = int(cols[4])
    # Columns 5..8 hold the four coordinates, parsed in file order
    src_lon, src_lat, dst_lon, dst_lat = (float(cols[i]) for i in range(5, 9))
    duration = int(cols[10])

    minutes_into_day = picked_up_at.hour * 60 + picked_up_at.minute
    weekday_number = picked_up_at.weekday() + 1  # weekday() is 0-based from Monday
    month_number = picked_up_at.month

    delta_lon = src_lon - dst_lon
    delta_lat = src_lat - dst_lat
    straight_line = sqrt(pow(delta_lon, 2) + pow(delta_lat, 2))

    # Features first, target (trip duration) last
    row = [n_passengers, src_lat, src_lon, straight_line]
    row += [minutes_into_day, weekday_number, month_number]
    row.append(duration)
    return row

# Parse every CSV line into [features..., trip_duration], then keep only
# rows with at least one passenger (index 0), a duration under 22 hours
# (last index) and a strictly positive distance (index 3).
parsed_rdd = data_rdd.map(process_trip_line)
data_rdd = parsed_rdd.filter(
    lambda row: row[0] > 0 and row[-1] < 22 * 3600 and row[3] > 0
)

data_rdd.take(1)

[[1,
  40.76793670654297,
  -73.9821548461914,
  0.01767953949959892,
  1044,
  1,
  3,
  455]]

In [6]:
# Wrap each row as a LabeledPoint: the last element (trip_duration) is the
# label and everything before it is the feature vector.
labeled_rdd = data_rdd.map(
    lambda row: LabeledPoint(label=row[-1], features=row[:-1])
)

# Model Training:
  - Split the data into training and testing sets.
  - Train a Decision Tree Regressor using the training data.

In [7]:

# 80/20 train/test split with a fixed seed
split_weights = [0.8, 0.2]
train_rdd, test_rdd = labeled_rdd.randomSplit(split_weights, seed=42)

# Fit a regression tree on the training portion. No categorical feature
# arities are supplied, so categoricalFeaturesInfo is an empty dict.
tree_params = {
    "categoricalFeaturesInfo": {},
    "maxDepth": 10,
    "minInstancesPerNode": 10,
}
model = DecisionTree.trainRegressor(train_rdd, **tree_params)

                                                                                

# Model Evaluation:
  - Evaluate the model's performance on the test data using metrics such as RMSE, MAE, R^2.

In [15]:
# Score the held-out features, then pair each prediction with its true label
test_features = test_rdd.map(lambda lp: lp.features)
test_labels = test_rdd.map(lambda lp: lp.label)
predictions = model.predict(test_features)
predictions_and_labels = predictions.zip(test_labels)

# Regression metrics over the (prediction, label) pairs
metrics = RegressionMetrics(predictions_and_labels)

# Report each metric; the attribute is read just before it is printed
for title, attribute in (
    ("Mean Absolute Error (MAE):", "meanAbsoluteError"),
    ("Root Mean Squared Error (RMSE):", "rootMeanSquaredError"),
    ("R-squared (R²):", "r2"),
):
    print(title, getattr(metrics, attribute))


# Show a sample of predictions next to the actual durations
predictions_and_labels_df = predictions_and_labels.toDF(["prediction", "actual"])
predictions_and_labels_df.show(10)

                                                                                

Mean Absolute Error (MAE): 233.88415546499564
Root Mean Squared Error (RMSE): 640.9679652269182
R-squared (R²): 0.4303176661849354


[Stage 46:>                                                         (0 + 1) / 1]

+------------------+------+
|        prediction|actual|
+------------------+------+
|1072.0188804239815|1225.0|
| 1036.780043383948|1128.0|
|403.74746445382635| 211.0|
| 512.8794032648598| 251.0|
| 891.5253633123115| 652.0|
|2604.0903819918144|2485.0|
| 711.0153256704981| 694.0|
|1779.6998284734134|2331.0|
| 657.1538461538462| 559.0|
|1212.9560975609756|1647.0|
+------------------+------+
only showing top 10 rows



                                                                                