In [15]:
#pip install pyarrow


In [16]:
# Import necessary libraries
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import mlflow


In [17]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1', creation_time=1716364782674, experiment_id='1', last_update_time=1716364782674, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [18]:
#experiment_id = mlflow.create_experiment("New Experiment Name")
#mlflow.set_experiment(experiment_id)

In [19]:
# Function to read and preprocess data
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
    df['PULocationID'] = df['PULocationID'].astype('category')
    df['DOLocationID'] = df['DOLocationID'].astype('category')
    return df

In [20]:
# Load and prepare training and validation data
df_train = read_dataframe('data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('data/yellow_tripdata_2023-02.parquet')
target = 'duration'
categorical_columns = ['PULocationID', 'DOLocationID']

<div style="background-color: green; color: white; padding: 10px;">

The purpose of the talk is purely focused on MLOps. I have selected only a few columns in order to keep the model and training process simple.

</div>



In [21]:
df_train['duration'].describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration, dtype: float64

In [22]:
# Vectorize categorical features
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[categorical_columns].to_dict(orient='records'))
X_val = dv.transform(df_val[categorical_columns].to_dict(orient='records'))
y_train = df_train[target].values
y_val = df_val[target].values

In [23]:
print("Dimensionality (number of columns):", X_val.shape[1])

Dimensionality (number of columns): 2


## LinearRegression

In [24]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
print("Linear Regression MSE:", lr_rmse)

Linear Regression MSE: 9.963607595829973




In [25]:
from sklearn import linear_model
with mlflow.start_run():
    mlflow.set_tag('developer','Huseyn')
    mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
    alpha=0.01
    ls = linear_model.Lasso(alpha=alpha)
    ls.fit(X_train, y_train)
    ls_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
    mlflow.log_metric('rmse',ls_rmse)
    print("Linear Regression MSE:", ls_rmse)


Linear Regression MSE: 9.963607595829973




In [26]:

alpha_values = [0.01, 0.05, 0.1, 0.5, 1.0]
for alpha in alpha_values:
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Huseyn')
        mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
        mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
        mlflow.log_param('alpha', alpha)
        ls = linear_model.Lasso(alpha=alpha)
        ls.fit(X_train, y_train)
        ls_rmse = mean_squared_error(y_val, ls.predict(X_val), squared=False)
        mlflow.log_metric('rmse', ls_rmse)
        print(f"Linear Regression RMSE with alpha={alpha}: {ls_rmse}")




Linear Regression RMSE with alpha=0.01: 9.96360781849841




Linear Regression RMSE with alpha=0.05: 9.963608750857988




Linear Regression RMSE with alpha=0.1: 9.963610009929864




Linear Regression RMSE with alpha=0.5: 9.963623827395615
Linear Regression RMSE with alpha=1.0: 9.96365046141962




In [27]:
import os
import joblib
model_folder = 'models'
model_path = os.path.join(model_folder, 'lasso_model.pkl')
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
joblib.dump(ls, model_path)
print(f"Model saved successfully in {model_path}!")


Model saved successfully in models/lasso_model.pkl!


In [28]:
# Save the models and DictVectorizer
with open('mlruns/models/Lasso_model.pkl', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [29]:
 Xgboost model

SyntaxError: invalid syntax (626458882.py, line 1)

In [None]:
### xgboost model

In [32]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import root_mean_squared_error


In [33]:
import mlflow
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        model = xgb.XGBRegressor(
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            learning_rate=params['learning_rate'],
            subsample=params['subsample'],
            gamma=params['gamma'],
            colsample_bytree=params['colsample_bytree']
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        mlflow.log_metric("rmse", rmse)
    return {'loss': -rmse, 'status': STATUS_OK}


In [34]:
space = {
    'max_depth': hp.choice('max_depth', range(1, 3)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.02),
    'n_estimators': hp.choice('n_estimators', range(100, 101)),
    'subsample': hp.uniform('subsample', 0.7, 0.8),
    'gamma': hp.uniform('gamma', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}


In [35]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print("Best hyperparameters:", best)


  2%|▏         | 1/50 [00:06<05:04,  6.21s/trial, best loss: -9.810796785956194]




  4%|▍         | 2/50 [00:13<05:31,  6.91s/trial, best loss: -9.810796785956194]




  6%|▌         | 3/50 [00:19<05:02,  6.44s/trial, best loss: -9.810796785956194]




  8%|▊         | 4/50 [00:25<04:51,  6.33s/trial, best loss: -9.814717837920595]




 10%|█         | 5/50 [00:33<05:06,  6.82s/trial, best loss: -9.814717837920595]




 12%|█▏        | 6/50 [00:41<05:14,  7.14s/trial, best loss: -9.814717837920595]




 14%|█▍        | 7/50 [00:46<04:45,  6.64s/trial, best loss: -9.814717837920595]




 16%|█▌        | 8/50 [00:52<04:30,  6.45s/trial, best loss: -9.814717837920595]




 18%|█▊        | 9/50 [00:59<04:34,  6.70s/trial, best loss: -9.814717837920595]




 20%|██        | 10/50 [01:07<04:40,  7.01s/trial, best loss: -9.814717837920595]




 22%|██▏       | 11/50 [01:13<04:22,  6.73s/trial, best loss: -9.814717837920595]




 24%|██▍       | 12/50 [01:21<04:28,  7.07s/trial, best loss: -9.814717837920595]




 26%|██▌       | 13/50 [01:29<04:27,  7.22s/trial, best loss: -9.814717837920595]




 28%|██▊       | 14/50 [01:36<04:22,  7.28s/trial, best loss: -9.814717837920595]




 30%|███       | 15/50 [01:42<04:02,  6.93s/trial, best loss: -9.814717837920595]




 32%|███▏      | 16/50 [01:50<04:00,  7.06s/trial, best loss: -9.814717837920595]




 34%|███▍      | 17/50 [01:57<03:59,  7.26s/trial, best loss: -9.814717837920595]




 36%|███▌      | 18/50 [02:05<03:55,  7.35s/trial, best loss: -9.814717837920595]




 38%|███▊      | 19/50 [02:13<03:51,  7.47s/trial, best loss: -9.814717837920595]




 40%|████      | 20/50 [02:19<03:31,  7.06s/trial, best loss: -9.81481083965742] 




 42%|████▏     | 21/50 [02:25<03:17,  6.80s/trial, best loss: -9.834179751099503]




 44%|████▍     | 22/50 [02:31<03:05,  6.63s/trial, best loss: -9.834179751099503]




 46%|████▌     | 23/50 [02:37<02:55,  6.50s/trial, best loss: -9.834440126760905]




 48%|████▊     | 24/50 [02:44<02:46,  6.41s/trial, best loss: -9.834440126760905]




 50%|█████     | 25/50 [02:50<02:36,  6.28s/trial, best loss: -9.834440126760905]




 52%|█████▏    | 26/50 [02:56<02:29,  6.21s/trial, best loss: -9.834440126760905]




 54%|█████▍    | 27/50 [03:02<02:23,  6.25s/trial, best loss: -9.834440126760905]




 56%|█████▌    | 28/50 [03:08<02:15,  6.16s/trial, best loss: -9.834440126760905]




 58%|█████▊    | 29/50 [03:14<02:09,  6.15s/trial, best loss: -9.834440126760905]




 60%|██████    | 30/50 [03:20<02:02,  6.15s/trial, best loss: -9.834440126760905]




 62%|██████▏   | 31/50 [03:26<01:57,  6.18s/trial, best loss: -9.834440126760905]




 64%|██████▍   | 32/50 [03:32<01:49,  6.10s/trial, best loss: -9.834440126760905]




 66%|██████▌   | 33/50 [03:38<01:42,  6.03s/trial, best loss: -9.834440126760905]




 68%|██████▊   | 34/50 [03:44<01:35,  5.96s/trial, best loss: -9.834440126760905]




 70%|███████   | 35/50 [03:50<01:29,  6.00s/trial, best loss: -9.834440126760905]




 72%|███████▏  | 36/50 [03:56<01:24,  6.05s/trial, best loss: -9.834440126760905]




 74%|███████▍  | 37/50 [04:02<01:18,  6.07s/trial, best loss: -9.834440126760905]




 76%|███████▌  | 38/50 [04:08<01:13,  6.08s/trial, best loss: -9.834440126760905]




 78%|███████▊  | 39/50 [04:14<01:05,  5.96s/trial, best loss: -9.834440126760905]




 80%|████████  | 40/50 [04:20<00:59,  6.00s/trial, best loss: -9.834440126760905]




 82%|████████▏ | 41/50 [04:26<00:54,  6.05s/trial, best loss: -9.834440126760905]




 84%|████████▍ | 42/50 [04:33<00:49,  6.16s/trial, best loss: -9.834440126760905]




 86%|████████▌ | 43/50 [04:39<00:42,  6.12s/trial, best loss: -9.834440126760905]




 88%|████████▊ | 44/50 [04:45<00:36,  6.08s/trial, best loss: -9.834440126760905]




 90%|█████████ | 45/50 [04:51<00:30,  6.05s/trial, best loss: -9.834440126760905]




 92%|█████████▏| 46/50 [04:59<00:26,  6.58s/trial, best loss: -9.834440126760905]




 94%|█████████▍| 47/50 [05:05<00:19,  6.45s/trial, best loss: -9.834440126760905]




 96%|█████████▌| 48/50 [05:12<00:13,  6.76s/trial, best loss: -9.834440126760905]




 98%|█████████▊| 49/50 [05:18<00:06,  6.56s/trial, best loss: -9.834440126760905]




100%|██████████| 50/50 [05:26<00:00,  6.53s/trial, best loss: -9.834440126760905]
Best hyperparameters: {'colsample_bytree': 0.3779536158461567, 'gamma': 0.09901277402577013, 'learning_rate': 0.010012443610260855, 'max_depth': 0, 'n_estimators': 0, 'subsample': 0.7002289555139877}





In [None]:
## Tree models

In [None]:
# Train a decision tree model
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_mse = mean_squared_error(y_val, tree.predict(X_val), squared=False)
print("Decision Tree MSE:", tree_mse)

Decision Tree MSE: 5.220358942859578




In [None]:
import xgboost as xgb

# Preparing training and validation datasets
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)


In [None]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster=xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            early_stopping_rounds=50,
            evals=[(valid, "validation")]
        
        )
        y_pred=booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
space = {
    'max_depth': hp.choice('max_depth', range(1, 3)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.02),
    'n_estimators': hp.choice('n_estimators', range(100, 101)),
    'subsample': hp.uniform('subsample', 0.7, 0.8),
    'gamma': hp.uniform('gamma', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}


In [None]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print("Best hyperparameters:", best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06474                          
[1]	validation-rmse:10.05663                          
[2]	validation-rmse:10.05380                          
[3]	validation-rmse:10.04595                          
[4]	validation-rmse:10.04321                          
[5]	validation-rmse:10.04045                          
[6]	validation-rmse:10.03286                          
[7]	validation-rmse:10.03025                          
[8]	validation-rmse:10.02291                          
[9]	validation-rmse:10.02029                          
[10]	validation-rmse:10.01780                         
[11]	validation-rmse:10.01528                         
[12]	validation-rmse:10.00818                         
[13]	validation-rmse:10.00133                         
[14]	validation-rmse:9.99466                          
[15]	validation-rmse:9.98821                          
[16]	validation-rmse:9.98196                          
[17]	validation-rmse:9.97954                          
[18]	valid

KeyboardInterrupt: 

In [None]:

class TaxiTripModel(nn.Module):
    def __init__(self):
        super(TaxiTripModel, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        return self.output(x)


In [None]:
# Convert data to PyTorch tensors and create DataLoaders
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

In [None]:
# Train the PyTorch model
model = TaxiTripModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            preds = model(X_batch)
            loss = loss_fn(preds, y_batch.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        val_loss = sum(loss_fn(model(X_val), y_val.unsqueeze(1)) for X_val, y_val in val_loader) / len(val_loader)
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

In [None]:
train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10)

# Save the trained PyTorch model and DictVectorizer
with open('models/TaxiTripModel.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)