In [1]:
from mlflow.tracking import MlflowClient


MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

In [3]:
# Client bound to the tracking store configured in MLFLOW_TRACKING_URI;
# it is reused by the later cells for experiment, run and registry calls.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Bare last expression: the list of experiments is shown as the cell output.
client.search_experiments()

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp-exercises/02-experiment-tracking/mlruns/1', creation_time=1768915942617, experiment_id='1', last_update_time=1768915942617, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1768914136245, experiment_id='0', last_update_time=1768914136245, lifecycle_stage='active', name='Default', tags={}>]

In [4]:
# Reuse the experiment when it already exists so that re-running this cell
# (e.g. Restart & Run All) does not fail on the duplicate experiment name.
experiment_name = "my-cool-experiment"
existing_experiment = client.get_experiment_by_name(experiment_name)
# Last expression: the experiment id is displayed either way.
(
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name=experiment_name)
)


'2'

In [5]:
from mlflow.entities import ViewType

# Search experiment '1' for active runs with rmse below 7,
# sorted by ascending rmse and capped at five results.
search_options = {
    "experiment_ids": '1',
    "filter_string": "metrics.rmse < 7",
    "run_view_type": ViewType.ACTIVE_ONLY,
    "max_results": 5,
    "order_by": ["metrics.rmse ASC"],
}
runs = client.search_runs(**search_options)

In [6]:
# One line per run: its id and its rmse metric rounded to four decimals.
for run in runs:
    rmse_value = run.data.metrics['rmse']
    print(f"run id: {run.info.run_id}, rmse: {rmse_value:.4f}")

run id: 8b6aaab33f7542bcbccc14319698ef83, rmse: 6.3134
run id: c84066919bb04be29c6ae6b4850438ed, rmse: 6.3138
run id: 99f4ad0d2be94dc3bf2b06459742abf2, rmse: 6.3164
run id: e14b9220b29c4804ad04f165556fbbf4, rmse: 6.3231
run id: 1febb45b6864435f90feaefceafff80b, rmse: 6.3266


In [7]:
import mlflow

# Point the module-level mlflow API (used below by mlflow.register_model and
# mlflow.pyfunc.load_model) at the same store as the MlflowClient above.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [8]:
# Register the model logged by the chosen run under the name "nyc-taxi-regressor".
# The recorded output shows that the name already existed, so this call created
# a new version of it; re-running the cell will presumably create yet another one.
run_id = "8b6aaab33f7542bcbccc14319698ef83" # Your own "Zebra" (presumably the run's name - confirm)
model_uri = f"runs:/{run_id}/models_mlflow" # Your own artifact path
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

2026/01/23 16:33:47 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/23 16:33:47 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '2' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1769186027263, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1769186027263, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='8b6aaab33f7542bcbccc14319698ef83', run_link=None, source='models:/m-951d6e6f895c46e3802e65eb66f669ca', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error # The new standard method for RMSE
import mlflow

def read_dataframe(filename):
    """Load taxi trip records and derive each trip's duration in minutes.

    Args:
        filename: Path to the input file. Names ending in ``.parquet`` are
            read with ``pd.read_parquet``; anything else with ``pd.read_csv``.

    Returns:
        A new DataFrame holding only the trips whose duration is between 1 and
        60 minutes (inclusive), with an added ``duration`` column and with
        ``PULocationID`` / ``DOLocationID`` converted to strings.
    """
    # Parquet is supported as well (the 2021 data is now published as parquet).
    if filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        df = pd.read_csv(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorized timedelta -> minutes conversion instead of a per-row lambda.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered result an independent frame, so the dtype
    # assignment below never writes into a view of the unfiltered data
    # (the pattern that triggers pandas' SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def preprocess(df, dv):
    """Turn trip records into the feature matrix produced by ``dv``.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` (string-typed so
            they can be concatenated) and ``trip_distance`` columns.
        dv: Fitted object exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the per-row feature dicts.

    The input frame is left unmodified: the combined pickup/drop-off feature
    is built on a derived frame rather than written back into ``df``.
    """
    # Add the PU_DO feature (pickup and drop-off ids joined with an underscore).
    features = df.assign(PU_DO=df['PULocationID'] + '_' + df['DOLocationID'])
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    dicts = features[categorical + numerical].to_dict(orient='records')
    return dv.transform(dicts)

def test_model(name, stage, X_test, y_test):
    """Score a registered model on held-out data.

    Args:
        name: Registered model name.
        stage: Registry stage used to build the ``models:/<name>/<stage>`` URI.
        X_test: Features passed to the loaded model's ``predict``.
        y_test: True target values compared against the predictions.

    Returns:
        A dict with a single key, ``"rmse"``.
    """
    # Load the model through its Registry stage.
    registry_model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    predictions = registry_model.predict(X_test)

    # Use root_mean_squared_error (scikit-learn 1.4+).
    return {"rmse": root_mean_squared_error(y_test, predictions)}

In [11]:
# Print the version number and current stage of each version returned for the model.
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    number, stage_label = version.version, version.current_stage
    print(f"version: {number}, stage: {stage_label}")

version: 1, stage: None
version: 2, stage: Staging


  latest_versions = client.get_latest_versions(name=model_name)


In [10]:
# Move one version of the registered model into a new registry stage.
# model_version and new_stage are reused by the description cell that follows.
# NOTE(review): the recorded output shows a warning attributed to this call -
# presumably MLflow's deprecation of model stages; confirm against the
# installed MLflow version.
model_version = 2
new_stage = "Staging"

client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # presumably leaves versions already in the target stage untouched - confirm
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1769186027263, current_stage='Staging', deployment_job_state=None, description=None, last_updated_timestamp=1769186274143, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='8b6aaab33f7542bcbccc14319698ef83', run_link=None, source='models:/m-951d6e6f895c46e3802e65eb66f669ca', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [12]:
from datetime import datetime

# Record on the model version when (today's date) and where it was transitioned.
date = datetime.today().date()
transition_note = f"The model version {model_version} was transitioned to {new_stage} on {date}"
client.update_model_version(
    description=transition_note,
    version=model_version,
    name=model_name,
)

<ModelVersion: aliases=[], creation_timestamp=1769186027263, current_stage='Staging', deployment_job_state=None, description='The model version 2 was transitioned to Staging on 2026-01-23', last_updated_timestamp=1769186368757, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='8b6aaab33f7542bcbccc14319698ef83', run_link=None, source='models:/m-951d6e6f895c46e3802e65eb66f669ca', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [16]:
def read_dataframe(filename):
    """Load parquet trip records and derive each trip's duration in minutes.

    NOTE(review): this redefines ``read_dataframe`` from an earlier cell and
    silently shadows it; unlike that version, this one reads parquet only.

    Args:
        filename: Path to a parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new DataFrame holding only the trips whose duration is between 1 and
        60 minutes (inclusive), with an added ``duration`` column and with
        ``PULocationID`` / ``DOLocationID`` converted to strings.
    """
    # Changed here to read parquet.
    df = pd.read_parquet(filename)

    # Parquet files already carry proper datatypes, so the pd.to_datetime
    # conversions may be redundant, but keeping them does no harm.
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorized timedelta -> minutes conversion instead of a per-row lambda.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered result an independent frame, so the dtype
    # assignment below never writes into a view of the unfiltered data
    # (the pattern that triggers pandas' SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [17]:
# Load the March 2021 file; df is the evaluation data used by the cells below.
df = read_dataframe("data/green_tripdata_2021-03.parquet")


In [18]:
# Bare expression: shows the frame as the cell output (the recorded output
# elides the middle rows and columns with "...").
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2021-03-01 00:05:42,2021-03-01 00:14:03,N,1.0,83,129,1.0,1.56,7.50,...,0.5,0.00,0.0,,0.3,8.80,1.0,1.0,0.0,8.350000
1,2,2021-03-01 00:21:03,2021-03-01 00:26:17,N,1.0,243,235,1.0,0.96,6.00,...,0.5,0.00,0.0,,0.3,7.30,2.0,1.0,0.0,5.233333
2,2,2021-03-01 00:02:06,2021-03-01 00:22:26,N,1.0,75,242,1.0,9.93,28.00,...,0.5,2.00,0.0,,0.3,31.30,1.0,1.0,0.0,20.333333
3,2,2021-03-01 00:24:03,2021-03-01 00:31:43,N,1.0,242,208,1.0,2.57,9.50,...,0.5,0.00,0.0,,0.3,10.80,2.0,1.0,0.0,7.666667
4,1,2021-03-01 00:11:10,2021-03-01 00:14:46,N,1.0,41,151,1.0,0.80,5.00,...,0.5,1.85,0.0,,0.3,8.15,1.0,1.0,0.0,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83822,2,2021-03-31 22:07:00,2021-03-31 22:13:00,,,41,75,,1.48,8.46,...,0.0,1.44,0.0,,0.3,10.20,,,,6.000000
83823,2,2021-03-31 22:56:00,2021-03-31 23:13:00,,,95,95,,0.09,54.25,...,0.0,0.00,0.0,,0.3,57.30,,,,17.000000
83824,2,2021-03-31 22:36:00,2021-03-31 22:45:00,,,95,95,,0.66,8.11,...,0.0,0.00,0.0,,0.3,8.41,,,,9.000000
83825,2,2021-03-31 23:35:00,2021-04-01 00:00:00,,,37,14,,9.58,36.83,...,0.0,0.00,0.0,,0.3,39.88,,,,25.000000


In [21]:
# Look at up to 5 runs of experiment '1' and the artifacts each one has
# (presumably the most recent runs, per the store's default ordering - confirm).
runs = client.search_runs(experiment_ids='1', max_results=5)
for recent_run in runs:
    current_id = recent_run.info.run_id
    print(f"Run ID: {current_id}")
    for artifact in client.list_artifacts(current_id):
        print(f" - Artifact: {artifact.path}")

Run ID: 72d9ce41c68d4079b7ebad193612752c
 - Artifact: preprocessor
Run ID: 49bd0abb3329468fa68c31eeb503e21c
Run ID: 5eb20604fde34d0891a274889dba1404
 - Artifact: models_pickle
Run ID: 692e237b671a46f3b3964c2c864a433b
Run ID: 8b6aaab33f7542bcbccc14319698ef83


In [22]:
# Use the run ID that actually has the 'preprocessor' artifact
# (see the artifact listing printed by the previous cell).
correct_run_id = "72d9ce41c68d4079b7ebad193612752c"
# Downloads into the current directory; the recorded output shows the
# resulting local path ending in '/preprocessor'.
client.download_artifacts(run_id=correct_run_id, path='preprocessor', dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/workspaces/mlops-zoomcamp-exercises/02-experiment-tracking/preprocessor'

In [23]:
import pickle

# Mind the path: if download_artifacts created a 'preprocessor' folder,
# the file will be inside it. Check with !ls preprocessor
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts that come from a store you trust.
with open("preprocessor/preprocessor.b", "rb") as f_in: # or just 'preprocessor.b'
    dv = pickle.load(f_in)

In [24]:
# Build the test feature matrix from the loaded frame using the unpickled preprocessor.
X_test = preprocess(df, dv)


In [25]:

# Ground-truth values for evaluation: the duration column (in minutes) as an array.
target = "duration"
y_test = df[target].values

In [27]:
# Put version 1 of the registered model into "Production" so it can be
# compared against the "Staging" version in the timing cells below.
# NOTE(review): the recorded output shows a warning attributed to this call -
# presumably MLflow's deprecation of model stages; confirm.
client.transition_model_version_stage(
    name="nyc-taxi-regressor",
    version=1,
    stage="Production",
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1769183719659, current_stage='Production', deployment_job_state=None, description='', last_updated_timestamp=1769187171502, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='', run_link='', source='/workspaces/mlops-zoomcamp-exercises/02-experiment-tracking/mlruns/1/models/m-e415e4f52d2a4d748d0606cfaf088048/artifacts', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [30]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)


CPU times: user 594 ms, sys: 241 µs, total: 594 ms
Wall time: 359 ms


{'rmse': 6.476283621104021}

In [31]:
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)


CPU times: user 7.51 s, sys: 21.9 ms, total: 7.54 s
Wall time: 5.47 s


{'rmse': 6.269308210569653}

In [33]:
# Promote version 2 to "Production"; it scored the lower rmse of the two
# versions in the recorded outputs above (6.2693 vs 6.4763).
# NOTE(review): archive_existing_versions=True presumably archives the version
# currently in Production - confirm against the MLflow documentation.
client.transition_model_version_stage(
    name=model_name,
    version=2,  # <--- THIS is your own version for the "Zebra" run
    stage="Production",
    archive_existing_versions=True
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1769186027263, current_stage='Production', deployment_job_state=None, description='The model version 2 was transitioned to Staging on 2026-01-23', last_updated_timestamp=1769187424490, metrics=None, model_id=None, name='nyc-taxi-regressor', params=None, run_id='8b6aaab33f7542bcbccc14319698ef83', run_link=None, source='models:/m-951d6e6f895c46e3802e65eb66f669ca', status='READY', status_message=None, tags={}, user_id=None, version=2>