In [1]:
import gc
gc.collect()

4

In [2]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer

In [3]:

# Caminho para a pasta scripts
os.path.join(os.path.abspath(os.path.join(os.getcwd(), '.')), 'scripts')


'/home/mlflow/Documents/mlops-zoomcamp/cohorts/2025/01-intro/scripts'

In [4]:
from data import read_data, get_data, optimize_dtypes, filter_vect_data

<div class="alert alert-block alert-info">
    <p><b>Q1. </b> Downloading the data</p>
</div>

In [5]:
data_files = [get_data(month, 2023, "yellow") for month in range(1,3)]

data/yellow_tripdata_2023-01.parquet: 100%|██████████| 45.5M/45.5M [00:04<00:00, 10.2MiB/s]
data/yellow_tripdata_2023-02.parquet: 100%|██████████| 45.5M/45.5M [00:04<00:00, 9.91MiB/s]


In [6]:
file_path = f'data'
file_name = f'yellow_tripdata_2023-01'
df = optimize_dtypes(read_data(file_path, file_name))

In [7]:
df.head();

In [8]:
print(f'DataFrame with {df.shape[0]} rows and {df.shape[1]} columns.')

DataFrame with 3066766 rows and 19 columns.


<div class="alert alert-block alert-info">
    <p><b>Q2. </b>Computing duration</p>
</div>

In [9]:
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'] = df['duration'].dt.total_seconds() / 60

In [10]:
print(round(df['duration'].mean(), 2), round(df['duration'].std(), 2))

15.67 42.59


<div class="alert alert-block alert-info">
    <p><b>Q3. </b>Dropping outliers</p>
</div>

In [11]:
# Calculate the fraction of records left
fraction_left = len(df[(df.duration >= 1) & (df.duration <= 60)]) / len(df) * 100
print(f'Fraction of records left: {fraction_left:.4f}')

Fraction of records left: 98.1220


In [12]:
# Filter records with duration between 1 and 60 minutes (inclusive)
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy(deep=True)

<div class="alert alert-block alert-info">
    <p><b>Q4. </b>One-hot encoding</p>
</div>

In [13]:
df.head();

In [14]:
categorical = ['PULocationID','DOLocationID']
df[categorical] = df[categorical].astype(str)

In [15]:
train_dicts = df[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [16]:
# df_dummies = pd.get_dummies(
#     df,
#     columns=columns,
#     prefix_sep='_',
#     drop_first=True,
#     dtype=np.int8,
#     sparse=True) # for large datasets)

In [17]:
print(f'Feature matrix size: {X_train.shape}')

Feature matrix size: (3009173, 515)


<div class="alert alert-block alert-info">
    <p><b>Q5. </b>Training a model</p>
</div>

In [18]:
target = 'duration'
y_train = np.asarray(df[target])

In [19]:
from data import training_data

In [20]:
y_pred, out = training_data(X_train, y=y_train)

Train RMSE: 7.649262223700304


<div class="alert alert-block alert-info">
    <p><b>Q6. </b>Evaluating the model</p>
</div>

In [21]:
file_path = f'data'
file_name = f'yellow_tripdata_2023-02'
dx = optimize_dtypes(read_data(file_path, file_name))

In [22]:
dx["duration"] = dx.tpep_dropoff_datetime - dx.tpep_pickup_datetime
dx.duration = dx.duration.dt.total_seconds() / 60
dx = dx[(dx["duration"] >= 1) & (dx["duration"] <= 60)].copy(deep=True)

In [23]:
X_val = filter_vect_data(dx)

In [24]:
target = 'duration'
y_val = np.asarray(dx[target])

In [25]:
X_val.shape[0], y_val.shape[0]

(2855951, 2855951)

In [26]:
y_pred, out = training_data(X_val, y=y_val)

Train RMSE: 7.778948571135243
