In [1]:
!pip install pandas scikit-learn pyarrow


Collecting pandas
  Downloading pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting pyarrow
  Downloading pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl.metadata (3.2 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl (10.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd

# Yellow-taxi trip records for January 2023, read as parquet straight from the URL.
jan_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
df_jan = pd.read_parquet(path=jan_url)

# (rows, columns) of the raw January frame.
df_jan.shape


(3066766, 19)

In [3]:
# Trip length in minutes: (dropoff - pickup) expressed in seconds, then divided by 60.
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime']
    .sub(df_jan['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Standard deviation of the raw (still unfiltered) January durations.
df_jan.duration.std()


np.float64(42.59435124195457)

In [4]:
# Keep only trips lasting between 1 and 60 minutes, both bounds included.
df_clean = df_jan.loc[df_jan['duration'].between(1, 60)]

# Share of the January records that survive the duration filter.
df_clean.shape[0] / df_jan.shape[0]


0.9812202822125979

In [5]:
from sklearn.feature_extraction import DictVectorizer

# Cast the pickup/dropoff location IDs to strings so DictVectorizer encodes each
# distinct value as its own indicator feature rather than one numeric column.
train_dicts = (
    df_clean
    .loc[:, ['PULocationID', 'DOLocationID']]
    .astype(str)
    .to_dict(orient='records')
)

# Learn the feature vocabulary from the training records and encode them in one step.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# (n_trips, n_features) of the encoded training matrix.
X_train.shape


(3009173, 515)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Regression target: duration (minutes) of the filtered January trips.
y_train = df_clean.duration.values

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used only to measure the training error below.
preds = model.predict(X_train)

# Training RMSE: square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=preds))
rmse_train


np.float64(7.6492619593358615)

In [7]:
# February 2023 trips, prepared with the same steps as January:
# compute duration in minutes, then keep trips of 1 to 60 minutes inclusive.
feb_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"
df_feb = pd.read_parquet(path=feb_url)

df_feb['duration'] = (
    df_feb['tpep_dropoff_datetime']
    .sub(df_feb['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)
df_feb_clean = df_feb.loc[df_feb['duration'].between(1, 60)]


In [8]:
# Encode February with the already-fitted vectorizer (transform only, no refit),
# so X_val uses the same feature columns the model was trained on.
val_dicts = (
    df_feb_clean
    .loc[:, ['PULocationID', 'DOLocationID']]
    .astype(str)
    .to_dict(orient='records')
)
X_val = dv.transform(val_dicts)

# Validation target: duration (minutes) of the filtered February trips.
y_val = df_feb_clean.duration.values


In [9]:
# Predict February durations with the model fitted on January.
pred_val = model.predict(X_val)

# Validation RMSE: square root of the mean squared error on February.
rmse_val = np.sqrt(mean_squared_error(y_true=y_val, y_pred=pred_val))
rmse_val


np.float64(7.811816898533222)