In [10]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [11]:
yellow_taxi_january_2023_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
yellow_taxi_february_2023_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

df = pd.read_parquet(yellow_taxi_january_2023_url)

print(len(df.columns))

19


Download the data for January and February 2023.

Q: Read the data for January. How many columns are there?

A: 19

In [12]:
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
df.duration.std()

np.float64(42.59435124195458)

Q2. Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.

Q: What's the standard deviation of the trips duration in January?

A: 42.59

In [13]:
# Convert duration to minutes first
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime

# Calculate total number of records before filtering
total_records = len(df)

# Filter records between 1 and 60 minutes (inclusive)
# Convert duration to minutes
df_filtered = df[(df['duration'].dt.total_seconds()/60 >= 1) &
                 (df['duration'].dt.total_seconds()/60 <= 60)]

# Calculate number of records after filtering
filtered_records = len(df_filtered)


# Calculate the fraction
fraction = filtered_records / total_records

print(f"Original number of records: {total_records}")
print(f"Records after filtering: {filtered_records}")
print(f"Fraction remaining: {fraction:.4f}")
print(f"Percentage remaining: {fraction * 100:.2f}%")


Original number of records: 3066766
Records after filtering: 3009173
Fraction remaining: 0.9812
Percentage remaining: 98.12%


Q3. Dropping outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

Q: What fraction of the records left after you dropped the outliers?

A: 98%

In [14]:
# Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
# Fit a dictionary vectorizer
# Get a feature matrix from it
df = pd.read_parquet(yellow_taxi_january_2023_url)
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

df = df[(df.duration >= 1) & (df.duration <= 60)]

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df[categorical] = df[categorical].astype(str)
train_dicts = df[categorical + numerical].to_dict(orient='records')
# Create and fit the DictVectorizer
dv = DictVectorizer(sparse=True)
X = dv.fit_transform(train_dicts)

# Get number of columns (dimensionality)
n_features = X.shape[1]
print(f"Number of features (columns): {n_features}")

Number of features (columns): 516


Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
Fit a dictionary vectorizer
Get a feature matrix from it

Q: What's the dimensionality of this matrix (number of columns)?

A: 515

In [15]:
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X, y_train)

y_pred = lr.predict(X)

root_mean_squared_error(y_train, y_pred)

7.658403836937678

Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

Train a plain linear regression model with default parameters, where duration is the response variable
Calculate the RMSE of the model on the training data

Q: What's the RMSE on train?

A: 7.64

In [16]:
feb_df = pd.read_parquet(yellow_taxi_february_2023_url)

feb_df['duration'] = feb_df.tpep_dropoff_datetime - feb_df.tpep_pickup_datetime
feb_df.duration = feb_df.duration.apply(lambda td: td.total_seconds() / 60)

feb_df = feb_df[(feb_df.duration >= 1) & (feb_df.duration <= 60)]

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

feb_df[categorical] = feb_df[categorical].astype(str)
val_dicts = feb_df[categorical + numerical].to_dict(orient='records')
# Use the same DictVectorizer that was fit on the training data
X_val = dv.transform(val_dicts)

target = 'duration'
feb_y_val = feb_df[target].values

feb_y_pred = lr.predict(X_val)

root_mean_squared_error(feb_y_val, feb_y_pred)

7.8201754113565425

Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2023).

Q: What's the RMSE on validation?

A: 7.81
