# Linear regression vs SVR vs Decision Trees on NYC Taxi dataset

In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import time
import joblib
import os

In [16]:
#Load dataset offline. dataset can be downloaded from: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
df = pd.read_parquet('datasets/nyc_ytaxi_ride.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3475226 entries, 0 to 3475225
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [17]:
print(df.head())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2025-01-01 00:18:38   2025-01-01 00:26:59              1.0   
1         1  2025-01-01 00:32:40   2025-01-01 00:35:13              1.0   
2         1  2025-01-01 00:44:04   2025-01-01 00:46:01              1.0   
3         2  2025-01-01 00:14:27   2025-01-01 00:20:01              3.0   
4         2  2025-01-01 00:21:34   2025-01-01 00:25:06              3.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.60         1.0                  N           229           237   
1           0.50         1.0                  N           236           237   
2           0.60         1.0                  N           141           141   
3           0.52         1.0                  N           244           244   
4           0.66         1.0                  N           244           116   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


## Feature engineering and data cleaning

In [18]:
# Convert the datetime columns to pandas datetime format if not already
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Calculate trip duration in minutes (or your preferred unit)
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

In [19]:

# Ensure datetime columns are in the correct format
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Verify trip_duration is correct (in minutes)
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Data cleaning: Remove invalid durations (e.g., negative or zero)
df = df[df['trip_duration'] > 0]

# Feature engineering: Extract time-based features
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_day_of_week'] = df['tpep_pickup_datetime'].dt.dayofweek  # 0=Mon, 6=Sun
df['is_rush_hour'] = df['pickup_hour'].apply(lambda x: 1 if x in [7, 8, 16, 17, 18] else 0)

# List of features to remove
features_to_remove = [
    'VendorID',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'store_and_fwd_flag',
    'payment_type',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount'
]

# Drop irrelevant features
df = df.drop(columns=features_to_remove)

# Final features
print("Remaining features:", df.columns.tolist())

# Optional: Save the processed dataset
# df.to_csv('processed_nyc_taxi_data.csv', index=False)

Remaining features: ['passenger_count', 'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID', 'congestion_surcharge', 'Airport_fee', 'cbd_congestion_fee', 'trip_duration', 'pickup_hour', 'pickup_day_of_week', 'is_rush_hour']


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3473175 entries, 0 to 3475225
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   passenger_count       float64
 1   trip_distance         float64
 2   RatecodeID            float64
 3   PULocationID          int32  
 4   DOLocationID          int32  
 5   congestion_surcharge  float64
 6   Airport_fee           float64
 7   cbd_congestion_fee    float64
 8   trip_duration         float64
 9   pickup_hour           int32  
 10  pickup_day_of_week    int32  
 11  is_rush_hour          int64  
dtypes: float64(7), int32(4), int64(1)
memory usage: 291.5 MB


In [21]:
# Remove extreme outliers (e.g., durations > 3 hours or distance > 100 miles)
df = df[(df['trip_duration'] <= 180) & (df['trip_distance'] <= 100)]

In [22]:
df = pd.get_dummies(df, columns=['RatecodeID'], prefix='Ratecode')

In [23]:
scaler = StandardScaler()
df[['trip_distance']] = scaler.fit_transform(df[['trip_distance']])

In [24]:
df.head()

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,congestion_surcharge,Airport_fee,cbd_congestion_fee,trip_duration,pickup_hour,pickup_day_of_week,is_rush_hour,Ratecode_1.0,Ratecode_2.0,Ratecode_3.0,Ratecode_4.0,Ratecode_5.0,Ratecode_6.0,Ratecode_99.0
0,1.0,-0.362362,229,237,2.5,0.0,0.0,8.35,0,2,0,True,False,False,False,False,False,False
1,1.0,-0.629109,236,237,2.5,0.0,0.0,2.55,0,2,0,True,False,False,False,False,False,False
2,1.0,-0.604859,141,141,2.5,0.0,0.0,1.95,0,2,0,True,False,False,False,False,False,False
3,3.0,-0.624259,244,244,0.0,0.0,0.0,5.566667,0,2,0,True,False,False,False,False,False,False
4,3.0,-0.590309,244,116,0.0,0.0,0.0,3.533333,0,2,0,True,False,False,False,False,False,False


In [25]:
df = df.dropna()

In [26]:
X = df.drop(columns=['trip_duration'])
y = df['trip_duration']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

## Training and Inference
Support vector regressor takes substantial amount of time, I waited for 8 hours, the training wasn't complete. So I abandoned the task. But feel free to run it, caveat is that you may have to wait for a loooong time.

In [27]:
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

In [28]:
# Define models
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Bagging Trees': BaggingRegressor(estimator=DecisionTreeRegressor(random_state=42), n_estimators=100, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    # 'XG Boost': XGBRegressor(n_estimators=100, random_state=42)
}


results = []
dataset_prefix = 'nyc_taxi'

# Train and evaluate each model
for name, model in models.items():
    # Measure training time
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = {
        'Model': name,
        'Train R²': r2_score(y_train, y_train_pred),
        'Test R²': r2_score(y_test, y_test_pred),
        'Train RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'Train MAE': mean_absolute_error(y_train, y_train_pred),
        'Test MAE': mean_absolute_error(y_test, y_test_pred),
        'Training Time (s)': training_time
    }
    results.append(metrics)
    # Save the models
    model_filename = os.path.join(models_dir, f"{dataset_prefix}_{name.lower().replace(' ', '_')}.pkl")
    joblib.dump(model, model_filename)

# Display results
results_df = pd.DataFrame(results)
print(results_df.round(3))

                   Model  Train R²  Test R²  Train RMSE  Test RMSE  Train MAE  \
0          Decision Tree     0.997    0.708       0.619      6.355      0.141   
1  Bagging Trees (deg=2)     0.976    0.845       1.811      4.632      1.094   
2          Random Forest     0.976    0.845       1.811      4.632      1.094   

   Test MAE  Training Time (s)  
0     3.736              8.001  
1     2.784            579.849  
2     2.784          13328.463  


In [32]:
#save results
os.makedirs('output', exist_ok=True)
results_df.to_csv('output/nyctaxi_trees_results.csv', index=False)