In [18]:
from mlflow.tracking import MlflowClient


MLFLOW_TRACKING_URI = 'sqlite:///mlflow.db'

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [3]:
client.create_experiment('my-cool-experiment')

'2'

In [9]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string='metrics.rmse < 6.31',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC']
)

In [10]:
for run in runs:
    print(f'run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}')

run id: 73f01e2d8b37462abc0f48724a75cfc8, rmse: 6.3000
run id: 0b54d63d337c4092ad57c59d709bf358, rmse: 6.3023
run id: 93e1da1eab244a0c9e4f7f056d5eae54, rmse: 6.3033
run id: b3e46d7540754ec99a62fdcad3bfc674, rmse: 6.3056


In [19]:
import mlflow

mlflow.set_experiment(experiment_name='nyc-taxi-experiment')
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

Traceback (most recent call last):
  File "/opt/conda/envs/experiment-tracking-env/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 315, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/experiment-tracking-env/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 408, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/experiment-tracking-env/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 1336, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/experiment-tracking-env/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 1329, in _read_hel

In [21]:

run_id = '6afd6575ebab4049bc74c39824daa787'

model_uri = f'runs/{run_id}/model'
mlflow.register_model(model_uri=model_uri, name='nyc-taxi-xgboost')

Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...
Created version '3' of model 'nyc-taxi-xgboost'.


<ModelVersion: aliases=[], creation_timestamp=1716827385293, current_stage='None', description=None, last_updated_timestamp=1716827385293, name='nyc-taxi-xgboost', run_id=None, run_link=None, source='runs/6afd6575ebab4049bc74c39824daa787/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [23]:
model_name = 'nyc-taxi-xgboost'

latest_versions = client.get_latest_versions(name=model_name)

for v in latest_versions:
     print(f'version: {v.version}, aliases: {v.aliases}')


version: 3, aliases: []


  latest_versions = client.get_latest_versions(name=model_name)


In [26]:
client.set_registered_model_alias(model_name, alias='aboba', version=3)

In [34]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, version, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{version}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [35]:
df = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-03.parquet')


In [36]:
run_id = '1ab54a355c1a411e93d82c233f033b9e'
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/mlops-zoomcamp/02-experiment-tracking/preprocessor'

In [37]:
import pickle

with open('preprocessor/preprocessor.b', 'rb') as f_in:
    dv = pickle.load(f_in)

In [38]:
X_test = preprocess(df, dv)

In [39]:
target = 'duration'
y_test = df[target].values

In [41]:
%time test_model(name=model_name, version=2, X_test=X_test, y_test=y_test)



CPU times: user 16.6 s, sys: 0 ns, total: 16.6 s
Wall time: 9.46 s




{'rmse': 6.2702965482607915}

In [43]:
%time test_model(name=model_name, version=1, X_test=X_test, y_test=y_test)



CPU times: user 16.6 s, sys: 295 µs, total: 16.6 s
Wall time: 9.37 s




{'rmse': 6.2702965482607915}