In [5]:
# pd.read_parquet needs a parquet engine; if it is missing, run: %pip install pyarrow

In [6]:
# Import necessary libraries
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [7]:
def read_dataframe(filename, min_duration=1, max_duration=60):
    """Load a taxi-trip parquet file and derive the trip duration in minutes.

    Parameters
    ----------
    filename : str or path-like
        Parquet file handed to ``pd.read_parquet``. It must provide the
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID`` columns.
    min_duration, max_duration : float, optional
        Inclusive bounds (in minutes) of the trips that are kept.
        Defaults are 1 and 60.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration lies within the bounds, with an added
        ``duration`` column (minutes) and both location ID columns cast to
        the ``category`` dtype.
    """
    df = pd.read_parquet(filename)
    # Drop-off minus pick-up is a timedelta; convert seconds to minutes.
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    keep = (df['duration'] >= min_duration) & (df['duration'] <= max_duration)
    # Take an explicit copy so the column assignments below write to an
    # independent frame instead of a slice of the unfiltered one (this is what
    # used to trigger pandas' SettingWithCopyWarning).
    df = df[keep].copy()

    for column in ('PULocationID', 'DOLocationID'):
        df[column] = df[column].astype('category')
    return df

In [8]:
# Modelling setup: the regression target and the categorical input columns
target = 'duration'
categorical_columns = ['PULocationID', 'DOLocationID']

# The 2023-01 file is used for training and the 2023-02 file for validation
df_train, df_val = map(
    read_dataframe,
    ('data/yellow_tripdata_2023-01.parquet', 'data/yellow_tripdata_2023-02.parquet'),
)

<div style="background-color: green; color: white; padding: 10px;">

This talk focuses purely on MLOps, so I have selected only a few columns to keep the model and the training process simple.

</div>



In [18]:
# Summary statistics of the filtered training trip durations (in minutes)
df_train.loc[:, 'duration'].describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration, dtype: float64

In [11]:
# Regression targets: the trip durations of each split as NumPy arrays
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

# Turn the location ID columns into a dense feature matrix. The vectorizer is
# fitted on the training records only and then reused for the validation set.
# NOTE(review): the resulting matrix has just 2 columns, so the IDs appear to
# be passed through as plain numbers rather than one-hot encoded — confirm
# this is intended before relying on the model quality.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[categorical_columns].to_dict('records'))
X_val = dv.transform(df_val[categorical_columns].to_dict('records'))

In [16]:
# Report how many feature columns the vectorizer produced
n_columns = X_val.shape[1]
print("Dimensionality (number of columns):", n_columns)

Dimensionality (number of columns): 2


In [12]:
# Fit a linear regression baseline and score it on the validation set
lr = LinearRegression()
lr.fit(X_train, y_train)

# The reported metric is the RMSE (root of the MSE), in the same unit as the
# target. The root is taken explicitly because the `squared=False` argument is
# deprecated and no longer accepted by newer scikit-learn releases.
lr_rmse = mean_squared_error(y_val, lr.predict(X_val)) ** 0.5
lr_mse = lr_rmse  # legacy name, kept so any cell that still reads it keeps working
print("Linear Regression RMSE:", lr_rmse)

Linear Regression MSE: 9.963607595829973




In [13]:
# Fit a decision tree regressor and score it on the validation set
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

# The reported metric is the RMSE (root of the MSE), in the same unit as the
# target. The root is taken explicitly because the `squared=False` argument is
# deprecated and no longer accepted by newer scikit-learn releases.
tree_rmse = mean_squared_error(y_val, tree.predict(X_val)) ** 0.5
tree_mse = tree_rmse  # legacy name, kept so any cell that still reads it keeps working
print("Decision Tree RMSE:", tree_rmse)

Decision Tree MSE: 5.2199528727357345




In [14]:
# Persist the fitted DictVectorizer together with the linear regression model
# as one pickled (dv, lr) tuple. The decision tree is not written here.
with open('models/lin_reg.bin', mode='wb') as model_file:
    pickle.dump((dv, lr), model_file)

In [None]:
# Define a PyTorch model for deep learning
class TaxiTripModel(nn.Module):
    """Fully connected regressor: three ReLU hidden layers and a linear output.

    The hidden layers have 128, 64 and 32 units; the output layer produces a
    single value per input row.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the
        notebook-level ``X_train`` matrix is used, which is what the class
        did before this parameter existed.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible fallback: requires X_train to be defined
            # in the notebook before the model is instantiated.
            input_dim = X_train.shape[1]
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one prediction per row of ``x`` (last dimension of size 1)."""
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        # No activation on the output: this is an unbounded regression value.
        return self.output(x)


In [None]:
# Copy the NumPy feature matrices and targets into float32 tensors
X_train_tensor, y_train_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X_train, y_train)
)
X_val_tensor, y_val_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (X_val, y_val)
)

# Pair features with targets so each dataset item is an (x, y) tuple
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)

# Mini-batches of 64 rows; only the training batches are shuffled
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=64, shuffle=False)

In [None]:
# Set up the pieces needed for training: the loss (mean squared error),
# the network itself, and an Adam optimizer over all of its parameters.
loss_fn = nn.MSELoss()
model = TaxiTripModel()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10):
    """Train ``model`` and print the training/validation loss after each epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise in place.
    train_loader, val_loader : iterable of (features, targets) batches
        Targets are unsqueezed to a trailing dimension of 1 before being
        passed to ``loss_fn`` together with the model output.
    loss_fn : callable
        Called as ``loss_fn(predictions, targets)``; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the parameters of ``model``.
    num_epochs : int, default 10
        Number of passes over ``train_loader``.

    Returns
    -------
    list of dict
        One entry per epoch with the keys ``epoch`` (1-based), ``train_loss``
        (loss of the last training batch of the epoch) and ``val_loss`` (mean
        of the per-batch validation losses). A loss is NaN when the
        corresponding loader yielded no batches.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        loss = None
        for X_batch, y_batch in train_loader:
            preds = model(X_batch)
            loss = loss_fn(preds, y_batch.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_loss = loss.item() if loss is not None else float('nan')

        model.eval()
        val_total = 0.0
        val_batches = 0
        # Validation needs no gradients. Disabling autograd and accumulating
        # plain floats avoids keeping a computation graph alive for every
        # validation batch.
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                val_total += loss_fn(model(X_batch), y_batch.unsqueeze(1)).item()
                val_batches += 1
        val_loss = val_total / val_batches if val_batches else float('nan')

        print(f'Epoch {epoch+1}, Loss: {train_loss}, Validation Loss: {val_loss}')
        history.append({'epoch': epoch + 1, 'train_loss': train_loss, 'val_loss': val_loss})
    return history

In [None]:
# Run the training loop for 10 epochs
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=10,
)

# Persist the fitted DictVectorizer together with the trained network as one
# pickled (dv, model) tuple.
with open('models/TaxiTripModel.bin', mode='wb') as model_file:
    pickle.dump((dv, model), model_file)