In [5]:
#pip install pyarrow


In [1]:
# Import necessary libraries
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import mlflow


In [42]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1', creation_time=1716364782674, experiment_id='1', last_update_time=1716364782674, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [46]:
#experiment_id = mlflow.create_experiment("New Experiment Name")
#mlflow.set_experiment(experiment_id)

<Experiment: artifact_location='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1', creation_time=1716364782674, experiment_id='1', last_update_time=1716364782674, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
# Function to read and preprocess data
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
    df['PULocationID'] = df['PULocationID'].astype('category')
    df['DOLocationID'] = df['DOLocationID'].astype('category')
    return df

In [4]:
# Load and prepare training and validation data
df_train = read_dataframe('data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('data/yellow_tripdata_2023-02.parquet')
target = 'duration'
categorical_columns = ['PULocationID', 'DOLocationID']

<div style="background-color: green; color: white; padding: 10px;">

The purpose of the talk is purely focused on MLOps. I have selected only a few columns in order to keep the model and training process simple.

</div>



In [22]:
df_train['duration'].describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration, dtype: float64

In [5]:
# Vectorize categorical features
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[categorical_columns].to_dict(orient='records'))
X_val = dv.transform(df_val[categorical_columns].to_dict(orient='records'))
y_train = df_train[target].values
y_val = df_val[target].values

In [24]:
print("Dimensionality (number of columns):", X_val.shape[1])

Dimensionality (number of columns): 2


## LinearRegression

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
print("Linear Regression MSE:", lr_rmse)

Linear Regression MSE: 9.963607595829973




In [10]:
from sklearn import linear_model
with mlflow.start_run():
    mlflow.set_tag('developer','Huseyn')
    mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
    alpha=0.01
    ls = linear_model.Lasso(alpha=alpha)
    ls.fit(X_train, y_train)
    ls_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
    mlflow.log_metric('rmse',ls_rmse)
    print("Linear Regression MSE:", ls_rmse)


Linear Regression MSE: 9.963607595829973




In [11]:

alpha_values = [0.01, 0.05, 0.1, 0.5, 1.0]
for alpha in alpha_values:
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Huseyn')
        mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
        mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
        mlflow.log_param('alpha', alpha)
        ls = linear_model.Lasso(alpha=alpha)
        ls.fit(X_train, y_train)
        ls_rmse = mean_squared_error(y_val, ls.predict(X_val), squared=False)
        mlflow.log_metric('rmse', ls_rmse)
        print(f"Linear Regression RMSE with alpha={alpha}: {ls_rmse}")




Linear Regression RMSE with alpha=0.01: 9.96360781849841




Linear Regression RMSE with alpha=0.05: 9.963608750857988




Linear Regression RMSE with alpha=0.1: 9.963610009929864




Linear Regression RMSE with alpha=0.5: 9.963623827395615
Linear Regression RMSE with alpha=1.0: 9.96365046141962




In [16]:
import os
model_folder = 'models'
model_path = os.path.join(model_folder, 'lasso_model.pkl')
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
joblib.dump(ls, model_path)
print(f"Model saved successfully in {model_path}!")


Model saved successfully in models/lasso_model.pkl!


In [20]:
# Save the models and DictVectorizer
with open('mlruns/models/Lasso_model.pkl', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

## Xgboost model

In [None]:
### xgboost model

In [38]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import root_mean_squared_error

def objective(params):
    model = xgb.XGBRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        gamma=params['gamma'],
        colsample_bytree=params['colsample_bytree']
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, preds)
    mlflow.log_metric("rmse", rmse)
    return {'loss': -rmse, 'status': STATUS_OK}


In [39]:
space = {
    'max_depth': hp.choice('max_depth', range(1, 3)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.02),
    'n_estimators': hp.choice('n_estimators', range(100, 101)),
    'subsample': hp.uniform('subsample', 0.7, 0.8),
    'gamma': hp.uniform('gamma', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}


In [40]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
print("Best hyperparameters:", best)


100%|██████████| 100/100 [10:41<00:00,  6.41s/trial, best loss: -9.834427286428758]
Best hyperparameters: {'colsample_bytree': 0.4542285431353108, 'gamma': 0.042327342471586406, 'learning_rate': 0.010014085853014157, 'max_depth': 0, 'n_estimators': 0, 'subsample': 0.7184586782650518}


In [None]:
## Tree models

In [21]:
# Train a decision tree model
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_mse = mean_squared_error(y_val, tree.predict(X_val), squared=False)
print("Decision Tree MSE:", tree_mse)

Decision Tree MSE: 5.220232344881768




NameError: name 'best_params' is not defined

In [None]:

class TaxiTripModel(nn.Module):
    def __init__(self):
        super(TaxiTripModel, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        return self.output(x)


In [None]:
# Convert data to PyTorch tensors and create DataLoaders
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

In [None]:
# Train the PyTorch model
model = TaxiTripModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            preds = model(X_batch)
            loss = loss_fn(preds, y_batch.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        val_loss = sum(loss_fn(model(X_val), y_val.unsqueeze(1)) for X_val, y_val in val_loader) / len(val_loader)
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

In [None]:
train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10)

# Save the trained PyTorch model and DictVectorizer
with open('models/TaxiTripModel.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)