In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NYCTaxiTripDurationRegression").getOrCreate()

In [18]:
data = spark.read.csv("train.csv", header=True, inferSchema=True)
data.show(5)

[Stage 94:>                                                       (0 + 12) / 12]

+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|id2875421|        2|2016-03-14 17:24:55|2016-03-14 17:32:30|              1| -73.9821548461914| 40.76793670654297|-73.96463012695312|40.765602111816406|                 N|          455|
|id2377394|        1|2016-06-12 00:43:35|2016-06-12 00:54:38|              1|-73.98041534423828|40.738563537597656|-73.99948120117188| 40.73115158081055|                 N|          663|
|id3858529|        2|2016-01-19 11:35:24|2016-01-19 12:10:48|    

                                                                                

In [19]:
from pyspark.sql.functions import hour, minute, dayofweek, month, sqrt, pow

data = data.withColumn("pickup_minutes", hour("pickup_datetime") * 60 + minute("pickup_datetime")) \
            .withColumn("pickup_dayofweek", dayofweek("pickup_datetime")) \
            .withColumn("pickup_month", month("pickup_datetime")) \
            .withColumn("distance", sqrt(
                pow(data["pickup_longitude"] - data["dropoff_longitude"], 2) +
                pow(data["pickup_latitude"] - data["dropoff_latitude"], 2)
            ))

data = data.filter("passenger_count > 0") \
            .filter("trip_duration < 22 * 3600") \
            .filter("distance > 0")

In [23]:
from pyspark.ml.feature import VectorAssembler

feature_columns = [
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "distance",
    "pickup_minutes",
    "pickup_dayofweek",
    "pickup_month",
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_data = assembler.transform(data)

In [24]:
from pyspark.ml.regression import DecisionTreeRegressor

train, test = assembled_data.randomSplit([0.8, 0.2], seed=42)

train = train.select("features", "trip_duration")

test = test.select("features", "trip_duration")

dt = DecisionTreeRegressor(
    featuresCol="features", 
    labelCol="trip_duration", 
    maxDepth=10,
    minInstancesPerNode=4, 
    seed=42)

model = dt.fit(train)

                                                                                

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

predictions = model.transform(test)
predictions.show(5)

# Evaluate RMSE
rmse_evaluator = RegressionEvaluator(labelCol="trip_duration", predictionCol="prediction", metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)
print("RMSE on test data =", rmse)

# Evaluate R²
r2_evaluator = RegressionEvaluator(labelCol="trip_duration", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)
print("R² on test data =", r2)

                                                                                

+--------------------+-------------+------------------+
|            features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,-73.97711944...|         1134|1050.1267576221635|
|[1.0,-73.97360992...|          592| 535.0468597461763|
|[1.0,-73.96556091...|         1677|1396.4377101143023|
|[1.0,-73.95832824...|          303|253.82171809212593|
|[1.0,-73.98651123...|          189| 351.0723589001447|
+--------------------+-------------+------------------+
only showing top 5 rows



                                                                                

RMSE on test data = 641.355633375002


[Stage 152:====>                                                  (1 + 11) / 12]

R² on test data = 0.4316665906009668


                                                                                