In [2]:
#pip install pyarrow


In [3]:
# Import necessary libraries
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import mlflow


In [4]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1', creation_time=1716364782674, experiment_id='1', last_update_time=1716364782674, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [5]:
#experiment_id = mlflow.create_experiment("New Experiment Name")
#mlflow.set_experiment(experiment_id)

In [6]:
# Function to read and preprocess data
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
    df['PULocationID'] = df['PULocationID'].astype('category')
    df['DOLocationID'] = df['DOLocationID'].astype('category')
    return df

In [7]:
# Load and prepare training and validation data
df_train = read_dataframe('data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('data/yellow_tripdata_2023-02.parquet')
target = 'duration'
categorical_columns = ['PULocationID', 'DOLocationID']

<div style="background-color: green; color: white; padding: 10px;">

The purpose of the talk is purely focused on MLOps. I have selected only a few columns in order to keep the model and training process simple.

</div>



In [8]:
df_train['duration'].describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration, dtype: float64

In [9]:
# Vectorize categorical features
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[categorical_columns].to_dict(orient='records'))
X_val = dv.transform(df_val[categorical_columns].to_dict(orient='records'))
y_train = df_train[target].values
y_val = df_val[target].values

In [10]:
print("Dimensionality (number of columns):", X_val.shape[1])

Dimensionality (number of columns): 2


In [11]:
##LinearRegression

In [12]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
print("Linear Regression MSE:", lr_rmse)

Linear Regression MSE: 9.963607595829973




In [13]:
 mlflow.end_run()

In [14]:
from sklearn import linear_model
with mlflow.start_run():
    mlflow.set_tag('developer','Huseyn')
    mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
    alpha=0.01
    ls = linear_model.Lasso(alpha=alpha)
    ls.fit(X_train, y_train)
    ls_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
    mlflow.log_metric('rmse',ls_rmse)
    print("Linear Regression MSE:", ls_rmse)


Linear Regression MSE: 9.963607595829973




In [15]:

alpha_values = [0.01, 0.05, 0.1, 0.5, 1.0]
for alpha in alpha_values:
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Huseyn')
        mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
        mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
        mlflow.log_param('alpha', alpha)
        ls = linear_model.Lasso(alpha=alpha)
        ls.fit(X_train, y_train)
        ls_rmse = mean_squared_error(y_val, ls.predict(X_val), squared=False)
        mlflow.log_metric('rmse', ls_rmse)
        print(f"Linear Regression RMSE with alpha={alpha}: {ls_rmse}")




Linear Regression RMSE with alpha=0.01: 9.96360781849841




Linear Regression RMSE with alpha=0.05: 9.963608750857988




Linear Regression RMSE with alpha=0.1: 9.963610009929864




Linear Regression RMSE with alpha=0.5: 9.963623827395615
Linear Regression RMSE with alpha=1.0: 9.96365046141962




In [16]:
import os
import joblib
model_folder = 'models'
model_path = os.path.join(model_folder, 'lasso_model.pkl')
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
joblib.dump(ls, model_path)
print(f"Model saved successfully in {model_path}!")


Model saved successfully in models/lasso_model.pkl!


In [17]:
mlflow.log_artifact(local_path="models/lasso_model.pkl", artifact_path='model')

In [18]:
# Save the models and DictVectorizer
with open('mlruns/models/Lasso_model.pkl', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

 #Xgboost model

In [19]:
### xgboost model

In [20]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import root_mean_squared_error


In [21]:
import mlflow
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        model = xgb.XGBRegressor(
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            learning_rate=params['learning_rate'],
            subsample=params['subsample'],
            gamma=params['gamma'],
            colsample_bytree=params['colsample_bytree']
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        mlflow.log_metric("rmse", rmse)
    return {'loss': -rmse, 'status': STATUS_OK}


In [22]:
space = {
    'max_depth': hp.choice('max_depth', range(1, 3)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.02),
    'n_estimators': hp.choice('n_estimators', range(100, 101)),
    'subsample': hp.uniform('subsample', 0.7, 0.8),
    'gamma': hp.uniform('gamma', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}


In [23]:
mlflow.end_run()

In [24]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print("Best hyperparameters:", best)


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  2%|▏         | 1/50 [00:09<07:21,  9.00s/trial, best loss: -9.722181648132677]




  4%|▍         | 2/50 [00:18<07:15,  9.07s/trial, best loss: -9.722181648132677]




  6%|▌         | 3/50 [00:23<05:57,  7.61s/trial, best loss: -9.800986957011128]




  8%|▊         | 4/50 [00:30<05:22,  7.02s/trial, best loss: -9.8165301322897]  




 10%|█         | 5/50 [00:38<05:30,  7.35s/trial, best loss: -9.8165301322897]




 12%|█▏        | 6/50 [00:44<05:02,  6.88s/trial, best loss: -9.8165301322897]




 14%|█▍        | 7/50 [00:49<04:41,  6.54s/trial, best loss: -9.8165301322897]




 16%|█▌        | 8/50 [00:55<04:27,  6.36s/trial, best loss: -9.8165301322897]




 18%|█▊        | 9/50 [01:01<04:12,  6.16s/trial, best loss: -9.8165301322897]




 20%|██        | 10/50 [01:07<04:04,  6.11s/trial, best loss: -9.8165301322897]




 22%|██▏       | 11/50 [01:13<03:54,  6.02s/trial, best loss: -9.8165301322897]




 24%|██▍       | 12/50 [01:19<03:52,  6.11s/trial, best loss: -9.82966305861353]




 26%|██▌       | 13/50 [01:25<03:42,  6.01s/trial, best loss: -9.82966305861353]




 28%|██▊       | 14/50 [01:31<03:32,  5.91s/trial, best loss: -9.82966305861353]




 30%|███       | 15/50 [01:36<03:26,  5.90s/trial, best loss: -9.82966305861353]




 32%|███▏      | 16/50 [01:44<03:41,  6.51s/trial, best loss: -9.82966305861353]




 34%|███▍      | 17/50 [01:50<03:26,  6.26s/trial, best loss: -9.82966305861353]




 36%|███▌      | 18/50 [01:58<03:32,  6.63s/trial, best loss: -9.82966305861353]




 38%|███▊      | 19/50 [02:05<03:33,  6.88s/trial, best loss: -9.82966305861353]




 40%|████      | 20/50 [02:11<03:16,  6.54s/trial, best loss: -9.82966305861353]




 42%|████▏     | 21/50 [02:17<03:07,  6.48s/trial, best loss: -9.83217969657972]




 44%|████▍     | 22/50 [02:23<02:59,  6.41s/trial, best loss: -9.83217969657972]




 46%|████▌     | 23/50 [02:30<02:51,  6.35s/trial, best loss: -9.83239958800605]




 48%|████▊     | 24/50 [02:36<02:43,  6.28s/trial, best loss: -9.83239958800605]




 50%|█████     | 25/50 [02:42<02:36,  6.24s/trial, best loss: -9.83239958800605]




 50%|█████     | 25/50 [02:44<02:44,  6.57s/trial, best loss: -9.83239958800605]


KeyboardInterrupt: 

In [None]:
## Trainin model with the best hyperparamters 

In [None]:
import xgboost as xgb

# Preparing training and validation datasets
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)


In [None]:
hyperparams ={'colsample_bytree':0.9320267530659203,
            'gamma':0.0976458782129943,
            'learning_rate':0.019905481508947585,
            'max_depth':2,
            'n_estimators':100,
            'subsample':0.7513506613595762}

In [None]:


with mlflow.start_run():
    mlflow.set_tag("Developer", "Huseyn")
    mlflow.log_params(hyperparams)
    booster=xgb.train(
                params=hyperparams,
                dtrain=train,
                num_boost_round=100,
                early_stopping_rounds=50,
                evals=[(valid, "validation")])
    y_pred=booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv,f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")
   

[0]	validation-rmse:10.05955


Parameters: { "n_estimators" } are not used.



[1]	validation-rmse:10.02448
[2]	validation-rmse:10.01670
[3]	validation-rmse:9.98290
[4]	validation-rmse:9.97543
[5]	validation-rmse:9.96895
[6]	validation-rmse:9.93640
[7]	validation-rmse:9.93023
[8]	validation-rmse:9.89888
[9]	validation-rmse:9.89297
[10]	validation-rmse:9.88730
[11]	validation-rmse:9.88183
[12]	validation-rmse:9.85163
[13]	validation-rmse:9.82257
[14]	validation-rmse:9.79453
[15]	validation-rmse:9.76754
[16]	validation-rmse:9.74151
[17]	validation-rmse:9.73636
[18]	validation-rmse:9.73141
[19]	validation-rmse:9.70633
[20]	validation-rmse:9.68216
[21]	validation-rmse:9.67741
[22]	validation-rmse:9.67286
[23]	validation-rmse:9.64962
[24]	validation-rmse:9.62718
[25]	validation-rmse:9.62286
[26]	validation-rmse:9.60133
[27]	validation-rmse:9.58055
[28]	validation-rmse:9.56054
[29]	validation-rmse:9.55460
[30]	validation-rmse:9.55045
[31]	validation-rmse:9.54645
[32]	validation-rmse:9.52719
[33]	validation-rmse:9.52155
[34]	validation-rmse:9.50304
[35]	validation-rmse:



In [None]:
##Autolog mlfow 

In [None]:

mlflow.xgboost.autolog()
booster=xgb.train(
            params=hyperparams,
            dtrain=train,
            num_boost_round=1000,
            early_stopping_rounds=50,
            evals=[(valid, "validation")])
y_pred=booster.predict(valid)
rmse = mean_squared_error(y_val, y_pred, squared=False)
mlflow.log_metric("rmse", rmse)

Parameters: { "n_estimators" } are not used.



[0]	validation-rmse:10.05955
[1]	validation-rmse:10.02448
[2]	validation-rmse:10.01670
[3]	validation-rmse:9.98290
[4]	validation-rmse:9.97543
[5]	validation-rmse:9.96895
[6]	validation-rmse:9.93640
[7]	validation-rmse:9.93023
[8]	validation-rmse:9.89888
[9]	validation-rmse:9.89297
[10]	validation-rmse:9.88730
[11]	validation-rmse:9.88183
[12]	validation-rmse:9.85163
[13]	validation-rmse:9.82257
[14]	validation-rmse:9.79453
[15]	validation-rmse:9.76754
[16]	validation-rmse:9.74151
[17]	validation-rmse:9.73636
[18]	validation-rmse:9.73141
[19]	validation-rmse:9.70633
[20]	validation-rmse:9.68216
[21]	validation-rmse:9.67741
[22]	validation-rmse:9.67286
[23]	validation-rmse:9.64962
[24]	validation-rmse:9.62718
[25]	validation-rmse:9.62286
[26]	validation-rmse:9.60133
[27]	validation-rmse:9.58055
[28]	validation-rmse:9.56054
[29]	validation-rmse:9.55460
[30]	validation-rmse:9.55045
[31]	validation-rmse:9.54645
[32]	validation-rmse:9.52719
[33]	validation-rmse:9.52155
[34]	validation-rmse:



In [None]:
import mlflow
logged_model = 'runs:/f057b62d35924722b03678f9001d984e/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: f057b62d35924722b03678f9001d984e

In [None]:
loaded_model = mlflow.pyfunc.load_model(logged_model)
import mlflow.xgboost
xgboost_model = mlflow.xgboost.load_model(logged_model)

In [None]:
xgboost_model

<xgboost.core.Booster at 0x73f1cd82f880>

In [None]:
xgboost_model.predict(valid)

array([12.872271, 26.821747, 12.919162, ..., 12.919162, 13.479717,
       12.919162], dtype=float32)

In [None]:
stop here 

In [None]:
# Deep Learning

In [None]:

class TaxiTripModel(nn.Module):
    def __init__(self):
        super(TaxiTripModel, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        return self.output(x)

In [None]:
# Convert data to PyTorch tensors and create DataLoaders
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

In [None]:
# Train the PyTorch model
model = TaxiTripModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            preds = model(X_batch)
            loss = loss_fn(preds, y_batch.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        val_loss = sum(loss_fn(model(X_val), y_val.unsqueeze(1)) for X_val, y_val in val_loader) / len(val_loader)
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

In [None]:
train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10)

# Save the trained PyTorch model and DictVectorizer
with open('models/TaxiTripModel.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

## Deploying model

In [None]:
from mlflow.tracking import MlflowClient
mlflow_tracking_uri="sqlite:///mlflow.db"
client= MlflowClient(tracking_uri=mlflow_tracking_uri)

In [None]:
import mlflow

mlflow.search_experiments ()

[<Experiment: artifact_location='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1', creation_time=1716364782674, experiment_id='1', last_update_time=1716364782674, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1716361841500, experiment_id='0', last_update_time=1716361841500, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results= 5,
)

In [None]:
for run in runs:
    if 'rmse' in run.data.metrics:
        print(f"Run ID is {run.info.run_id}, RMSE: {run.data.metrics['rmse']:.4f}")
    else:
        print(f"Run ID is {run.info.run_id}, RMSE: Not available")


Run ID is f057b62d35924722b03678f9001d984e, RMSE: 8.9794
Run ID is b151d8b09fb54022873cb16b0f9102d0, RMSE: 8.9794
Run ID is 800c1181925a4115896d68daddc19086, RMSE: 8.9794
Run ID is 972e2ddbbf6b4f4bbf3b912844899036, RMSE: Not available
Run ID is 1d050bfba2d84cab915482be291d471f, RMSE: Not available


In [None]:
import mlflow

In [None]:
run_id="f057b62d35924722b03678f9001d984e"
model_uri=f"runs:/{run_id}/model"

In [None]:
model_uri

'runs:/f057b62d35924722b03678f9001d984e/model'

In [None]:
mlflow.set_tracking_uri(mlflow_tracking_uri)

In [None]:
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-experiment")

Successfully registered model 'nyc-taxi-experiment'.
Created version '1' of model 'nyc-taxi-experiment'.


<ModelVersion: aliases=[], creation_timestamp=1716930879755, current_stage='None', description=None, last_updated_timestamp=1716930879755, name='nyc-taxi-experiment', run_id='f057b62d35924722b03678f9001d984e', run_link=None, source='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1/f057b62d35924722b03678f9001d984e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [None]:
latest_version =client.get_latest_versions(name= "nyc-taxi-experiment")

  latest_version =client.get_latest_versions(name= "nyc-taxi-experiment")


In [None]:
for version in latest_version:
    print(f'version: {version.version} , stage : {version.current_stage}')

version: 1 , stage : None


In [None]:
client.transition_model_version_stage(name="nyc-taxi-experiment",
 version=1,
 stage="Staging")

  client.transition_model_version_stage(name="nyc-taxi-experiment",


<ModelVersion: aliases=[], creation_timestamp=1716930879755, current_stage='Staging', description=None, last_updated_timestamp=1716931408409, name='nyc-taxi-experiment', run_id='f057b62d35924722b03678f9001d984e', run_link=None, source='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1/f057b62d35924722b03678f9001d984e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [None]:
client = mlflow.tracking.MlflowClient()


In [None]:
client.set_model_version_tag(
    name="nyc-taxi-experiment",
    version=1,
    key="validation_status",
    value="approved"
)

In [None]:
client.update_registered_model
(
    name = model_name,
    version =4, 
    description = f"The model version {version} was"
)

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (3400907645.py, line 3)