In [1]:
# !pip install catboost -q

In [2]:
# pip install mlflow -q

In [3]:
# !pip install lightgbm

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_extraction import DictVectorizer

import pickle
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

# from mlflow import MlflowClient
import optuna

In [8]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = 'housing-price'

In [9]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/home/azureuser/MLOps_zoomcamp/mlruns/1', creation_time=1691845828281, experiment_id='1', last_update_time=1691845828281, lifecycle_stage='active', name='housing-price', tags={}>

In [10]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [11]:
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [12]:
for run in runs:
    print(f'run_id:{run.info.run_id}, rmse:{run.data.metrics["rmse"]}')

run_id:332da12b29be4a7fb4a05ce3e9e9d5ff, rmse:530561.7354094362
run_id:433821303c054110996d53ec9cabd49b, rmse:535803.966266819
run_id:b3e066b9248d4db09e13e13df9d44051, rmse:535803.966266819
run_id:6a51bd0a4d2f4912b4b710f376e03cc0, rmse:538548.3420322725
run_id:e15d499f882e4df6b0fc1d651486a009, rmse:542456.0409498757


In [13]:
run_id = "332da12b29be4a7fb4a05ce3e9e9d5ff"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="housing_price")

Registered model 'housing_price' already exists. Creating a new version of this model...
2023/08/16 13:48:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: housing_price, version 4
Created version '4' of model 'housing_price'.


<ModelVersion: aliases=[], creation_timestamp=1692190105402, current_stage='None', description=None, last_updated_timestamp=1692190105402, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [14]:
model_name = "housing_price"
latest_version = client.get_latest_versions(name=model_name)

In [15]:
latest_version[0]

<ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='', source='/home/azureuser/MLOps_zoomcamp/mlruns/1/b3de1b48f0f7480c85468b6fb837bd97/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [17]:
# client.transition_model_version_stage(
#     name=model_name,
#     version=latest_version[0].version,
#     stage="Production",
#     archive_existing_versions=False
# )

In [18]:
client.get_model_version_stages(
    name=model_name,
    version=latest_version[0].version
)

['None', 'Staging', 'Production', 'Archived']

In [19]:
registered_model = client.get_registered_model(
    name=model_name
)

In [20]:
registered_model.latest_versions

[<ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='', source='/home/azureuser/MLOps_zoomcamp/mlruns/1/b3de1b48f0f7480c85468b6fb837bd97/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>,
 <ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>,
 <ModelVersion: aliases=[], creation_timestamp=1692190105402, current_stage='None', description=None, last_updated_timestamp=1692190105402, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', 

In [21]:
for model in registered_model:
    print(model)

('aliases', {})
('creation_timestamp', 1691853852199)
('description', '')
('last_updated_timestamp', 1692190105402)
('latest_versions', [<ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='', source='/home/azureuser/MLOps_zoomcamp/mlruns/1/b3de1b48f0f7480c85468b6fb837bd97/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>, <ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>, <ModelVersion: aliases=[], creation_timestamp=1692190105402, current_s

In [22]:
data_path = 'data/Housing_dataset_train.csv'

## Modelling

In [23]:
state_to_zone = {
    "Abia": "South-East",
    "Adamawa": "North-East",
    "Akwa Ibom": "South-South",
    "Anambra": "South-East",
    "Bauchi": "North-East",
    "Bayelsa": "South-South",
    "Benue": "North-Central",
    "Borno": "North-East",
    "Cross River": "South-South",
    "Delta": "South-South",
    "Ebonyi": "South-East",
    "Edo": "South-South",
    "Ekiti": "South-West",
    "Enugu": "South-East",
    "Gombe": "North-East",
    "Imo": "South-East",
    "Jigawa": "North-West",
    "Kaduna": "North-West",
    "Kano": "North-West",
    "Katsina": "North-West",
    "Kebbi": "North-West",
    "Kogi": "North-Central",
    "Kwara": "North-Central",
    "Lagos": "South-West",
    "Nasarawa": "North-Central",
    "Niger": "North-Central",
    "Ogun": "South-West",
    "Ondo": "South-West",
    "Osun": "South-West",
    "Oyo": "South-West",
    "Plateau": "North-Central",
    "Rivers": "South-South",
    "Sokoto": "North-West",
    "Taraba": "North-East",
    "Yobe": "North-East",
    "Zamfara": "North-West",
}

In [24]:
house_type_ranks = {
    'Cottage': 1,
    'Bungalow': 2,
    'Townhouse': 3,
    'Terrace duplex': 4,
    'Detached duplex': 5,
    'Semi-detached duplex': 6,
    'Flat': 7,
    'Penthouse': 8,
    'Apartment': 9,
    'Mansion': 10
}

In [25]:
def preprocess(data_path):
    data = pd.read_csv(data_path)

    print(data.columns.tolist())
    
    data['zone'] = data['loc'].map(state_to_zone)
    data['title'] = data['title'].map(house_type_ranks)

    category_frequencies = data['loc'].value_counts(normalize=True)
    loc_frequency_mapping = category_frequencies.to_dict()
    data['loc'] = data['loc'].map(loc_frequency_mapping)

    data['rooms'] = data['bathroom'] + data['bedroom']
    data['bathroom_ratio'] = data['bathroom']/(data['bathroom'] + data['bedroom'])

    data['zone'] = data['zone'].astype('category').cat.codes

    print("_____________________________________________________________________________")
    print(data.head())

    X = data.drop(columns=['price'], axis=0)
    y = data.price

    return X, y



In [26]:
X_, y_ = preprocess(data_path)

['ID', 'loc', 'title', 'bedroom', 'bathroom', 'parking_space', 'price']
_____________________________________________________________________________
      ID       loc  title  bedroom  bathroom  parking_space        price  \
0   3583  0.028309    6.0      2.0       2.0            1.0  1149999.565   
1   2748  0.028227    9.0      NaN       2.0            4.0  1672416.689   
2   9261  0.027570    NaN      7.0       5.0            NaN  3364799.814   
3   2224  0.029786    5.0      5.0       2.0            4.0  2410306.756   
4  10300  0.026340    4.0      NaN       5.0            6.0  2600700.898   

   zone  rooms  bathroom_ratio  
0     2    4.0        0.500000  
1     5    NaN             NaN  
2     5   12.0        0.416667  
3     3    7.0        0.285714  
4     0    NaN             NaN  


In [27]:
X_dicts = X_.to_dict(orient='records')

In [28]:
dv = DictVectorizer()


In [29]:
mlflow.lightgbm.autolog(disable=True)

In [35]:
with mlflow.start_run():

    params = {
        'max_depth': 10,
        'n_estimators': 2000,
        'learning_rate': 0.002712819361612371,
        'colsample_bytree': 0.9484547548287134,
        'subsample': 0.8490126211976283
        }

    mlflow.log_params(params)

    fold_pred = []
    splits = 2
    fold = KFold(n_splits=splits)

    for data_index, test_index in fold.split(X_, y_):
        X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
        y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

        model = LGBMRegressor(**params, objective='rmse')
        model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
        model_preds = model.predict(X_test)

        rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
        print(f'err: {rmse}')
        fold_pred.append(rmse)

    RMSE = np.mean(fold_pred)

    mlflow.log_param("splits", splits)
    mlflow.log_metric("rmse", RMSE)
    
    with open('models/lgb.bin', 'wb') as f:
        pickle.dump(model, f)

    mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
    mlflow.lightgbm.log_model(model, artifact_path="models_mlflow")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 9
[LightGBM] [Info] Start training from score 1425.058519


err: 529324.2298639654
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 9
[LightGBM] [Info] Start training from score 1423.011279
err: 571620.4886453262


In [41]:
with mlflow.start_run():

    params = {
        'max_depth': 10,
        'n_estimators': 2000,
        'subsample': 0.84,
        'learning_rate': 0.01,
        'n_estimators' : 2000
        }

    mlflow.log_params(params)

    fold_pred_1 = []
    splits = 10
    fold = KFold(n_splits=splits)

    for data_index, test_index in fold.split(X_, y_):
        X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
        y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

        model = CatBoostRegressor(**params)
        model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)], verbose=0)
        model_preds = model.predict(X_test)

        rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
        print(f'err: {rmse}')
        fold_pred_1.append(rmse)

    RMSE = np.mean(fold_pred_1)

    mlflow.log_param("splits", splits)
    mlflow.log_metric("rmse", RMSE)
    
    with open('models/cat.bin', 'wb') as f:
        pickle.dump(model, f)

    mlflow.log_artifact(local_path="models/cat.bin", artifact_path="models_pickle")
    mlflow.catboost.log_model(model, artifact_path="models_mlflow")

err: 466743.0169847345
err: 524069.56855255977
err: 638941.7498937332
err: 470119.679511906
err: 476696.0579938225
err: 588413.1371192657
err: 552106.7342063364
err: 494535.4359102119
err: 513112.49038561864
err: 633301.7921100028


## Model Registry

In [9]:
client.list_experiments()

AttributeError: 'MlflowClient' object has no attribute 'list_experiments'

# Hyperparameter Tuning

In [None]:
import optuna

In [None]:
# def objective(trial):

#     max_depth = trial.suggest_int('rf_max_depth', 2, 32)
#     n_estimators = trial.suggest_int('n_estimators', 100, 4000)
#     learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
#     colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
#     subsample = trial.suggest_float('subsample', 0, 1)

#     params = {
#         'max_depth':max_depth,
#         'colsample_bytree': colsample_bytree,
#         'learning_rate': learning_rate,
#         'n_estimators': n_estimators,
#         'subsample': subsample,
#     }

#     X_data, X_val, y_data, y_val = data_test_split(X, y, random_state=RANDOM_STATE)

#     LGB = CatBoostRegressor(**params)
#     LGB.fit(X_data, y_data)
#     y_pred = LGB.predict(X_val)

#     error = mean_squared_error(y_val, y_pred, squared=False)

#     return error  # An objective value linked with the Trial object.

#  # Invoke optimization of the objective function.

In [33]:
with mlflow.start_run():

    params = {
        'max_depth': 10,
        'n_estimators': 2000,
        'learning_rate': 0.002712819361612371,
        'colsample_bytree': 0.9484547548287134,
        'subsample': 0.8490126211976283
        }

    mlflow.log_params(params)

    fold_pred = []
    splits = 2
    fold = KFold(n_splits=splits)

    for data_index, test_index in fold.split(X_, y_):
        X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
        y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

        model = LGBMRegressor(**params, objective='rmse')
        model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
        model_preds = model.predict(X_test)

        rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
        print(f'err: {rmse}')
        fold_pred.append(rmse)

    RMSE = np.mean(fold_pred)

    mlflow.log_param("splits", splits)
    mlflow.log_metric("rmse", RMSE)
    
    with open('models/lgb.bin', 'wb') as f:
        pickle.dump(model, f)

    # mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
    mlflow.lightgbm.log_model(model, artifact_path="models_mlflow")

[1]	training's rmse: 335.771	valid_1's rmse: 2.38095e+06
[2]	training's rmse: 335.171	valid_1's rmse: 2.38095e+06
[3]	training's rmse: 334.573	valid_1's rmse: 2.38095e+06
[4]	training's rmse: 333.971	valid_1's rmse: 2.38095e+06
[5]	training's rmse: 333.378	valid_1's rmse: 2.38095e+06
[6]	training's rmse: 332.781	valid_1's rmse: 2.38095e+06
[7]	training's rmse: 332.187	valid_1's rmse: 2.38095e+06
[8]	training's rmse: 331.595	valid_1's rmse: 2.38095e+06
[9]	training's rmse: 331.004	valid_1's rmse: 2.38095e+06
[10]	training's rmse: 330.416	valid_1's rmse: 2.38095e+06
[11]	training's rmse: 329.831	valid_1's rmse: 2.38095e+06
[12]	training's rmse: 329.246	valid_1's rmse: 2.38095e+06
[13]	training's rmse: 328.665	valid_1's rmse: 2.38095e+06
[14]	training's rmse: 328.085	valid_1's rmse: 2.38095e+06
[15]	training's rmse: 327.507	valid_1's rmse: 2.38095e+06
[16]	training's rmse: 326.932	valid_1's rmse: 2.38095e+06
[17]	training's rmse: 326.359	valid_1's rmse: 2.38095e+06
[18]	training's rmse: 3

PermissionError: [Errno 13] Permission denied: '/home/azureuser'

In [30]:
def objective(trial):

    with mlflow.start_run():

        max_depth = trial.suggest_int('rf_max_depth', 2, 16)
        n_estimators = trial.suggest_int('n_estimators', 100, 4000)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
        subsample = trial.suggest_float('subsample', 0, 1)

        params = {
            'max_depth':max_depth,
            'colsample_bytree': colsample_bytree,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
        }

        mlflow.log_params(params)

        fold_pred = []
        splits = 5
        fold = KFold(n_splits=splits)

        for data_index, test_index in fold.split(X_, y_):
            X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
            y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

            model = LGBMRegressor(**params, objective='rmse')
            model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
            model_preds = model.predict(X_test)

            rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
            print(f'err: {rmse}')
            fold_pred.append(rmse)

        RMSE = np.mean(fold_pred)

        mlflow.log_param("splits", splits)
        mlflow.log_metric("rmse", RMSE)
        
        with open('models/lgb.bin', 'wb') as f:
            pickle.dump(model, f)

        mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
        mlflow.lightgbm.log_model(model, artifact_path="models_mlflow")

    return RMSE  # An objective value linked with the Trial object.

In [31]:
study = optuna.create_study(direction='minimize')  # Create a new study.
study.optimize(objective, n_trials=10)

[I 2023-08-16 13:55:25,270] A new study created in memory with name: no-name-0c51e86a-9655-48e4-94be-0b7ee8581761


[1]	training's rmse: 332.504	valid_1's rmse: 2.39469e+06
[2]	training's rmse: 332.307	valid_1's rmse: 2.39469e+06
[3]	training's rmse: 332.022	valid_1's rmse: 2.39469e+06
[4]	training's rmse: 331.737	valid_1's rmse: 2.39469e+06
[5]	training's rmse: 331.467	valid_1's rmse: 2.39469e+06
[6]	training's rmse: 331.183	valid_1's rmse: 2.39469e+06
[7]	training's rmse: 331.011	valid_1's rmse: 2.39469e+06
[8]	training's rmse: 330.728	valid_1's rmse: 2.39469e+06
[9]	training's rmse: 330.46	valid_1's rmse: 2.39469e+06
[10]	training's rmse: 330.192	valid_1's rmse: 2.39469e+06
[11]	training's rmse: 329.926	valid_1's rmse: 2.39469e+06
[12]	training's rmse: 329.659	valid_1's rmse: 2.39469e+06
[13]	training's rmse: 329.486	valid_1's rmse: 2.39469e+06
[14]	training's rmse: 329.24	valid_1's rmse: 2.39469e+06
[15]	training's rmse: 328.961	valid_1's rmse: 2.39469e+06
[16]	training's rmse: 328.716	valid_1's rmse: 2.39469e+06
[17]	training's rmse: 328.439	valid_1's rmse: 2.39469e+06
[18]	training's rmse: 328

[W 2023-08-16 13:55:55,209] Trial 0 failed with parameters: {'rf_max_depth': 3, 'n_estimators': 2602, 'learning_rate': 0.0016598694139360833, 'colsample_bytree': 0.6681739879195017, 'subsample': 0.5066062445982322} because of the following error: PermissionError(13, 'Permission denied').
Traceback (most recent call last):
  File "/home/gbotemi/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_16754/1676777337.py", line 45, in objective
    mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
  File "/home/gbotemi/.local/lib/python3.10/site-packages/mlflow/tracking/fluent.py", line 877, in log_artifact
    MlflowClient().log_artifact(run_id, local_path, artifact_path)
  File "/home/gbotemi/.local/lib/python3.10/site-packages/mlflow/tracking/client.py", line 1091, in log_artifact
    self._tracking_client.log_artifact(run_id, local_path, artifact_path)
  File "/home/g

PermissionError: [Errno 13] Permission denied: '/home/azureuser'

In [None]:
study.best_trial

FrozenTrial(number=95, state=TrialState.COMPLETE, values=[551678.5994224877], datetime_start=datetime.datetime(2023, 8, 6, 14, 17, 1, 888573), datetime_complete=datetime.datetime(2023, 8, 6, 14, 17, 7, 916277), params={'rf_max_depth': 26, 'n_estimators': 3679, 'learning_rate': 0.002712819361612371, 'colsample_bytree': 0.9484547548287134, 'subsample': 0.8490126211976283}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'rf_max_depth': IntDistribution(high=32, log=False, low=2, step=1), 'n_estimators': IntDistribution(high=4000, log=False, low=100, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=1e-05, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=95, value=None)

In [None]:
trial = study.best_trial

In [None]:
trial.value

551678.5994224877

In [None]:
trial.params

{'rf_max_depth': 26,
 'n_estimators': 3679,
 'learning_rate': 0.002712819361612371,
 'colsample_bytree': 0.9484547548287134,
 'subsample': 0.8490126211976283}

In [41]:
housing_details = {
    "ID": 343,
    "loc": "lagos",
    "title": "Mansion",
    "bedroom": 2.0,
    "bathroom": 1.0,
    "parking_space" : 2.0
}


In [42]:
x = pd.DataFrame([housing_details])

In [43]:
from prepare_features import prepare

In [44]:
prepare(x)

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,zone,rooms,bathroom_ratio
0,343,1.0,10,2.0,1.0,2.0,-1,3.0,0.333333


In [45]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = 'housing-price'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

model_registry_name = "housing_price"

client = MlflowClient()
prod_model = client.get_latest_versions(model_registry_name, stages=["Production"])[0]

In [46]:
prod_model

<ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [47]:
prod_model.version

2

In [48]:
run_id = prod_model.run_id
run_id

'332da12b29be4a7fb4a05ce3e9e9d5ff'

In [49]:
logged_model = f'runs:/{run_id}/model'
logged_model

'runs:/332da12b29be4a7fb4a05ce3e9e9d5ff/model'

In [50]:
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [58]:
float(loaded_model.predict(prepare(x))[0])

1246.8361280347074

In [21]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.lightgbm
  run_id: 332da12b29be4a7fb4a05ce3e9e9d5ff

In [None]:
model_name = "housing_price"
latest_version = client.get_latest_versions(name=model_name)


latest_version[0].version

client.transition_model_version_stage(
    name=model_name,
    version=latest_version[0].version,
    stage="Staging",
    archive_existing_versions=False
)

In [93]:
client.search_model_versions()

[<ModelVersion: aliases=[], creation_timestamp=1691952653200, current_stage='None', description=None, last_updated_timestamp=1691952653200, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>,
 <ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>,
 <ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_li

In [66]:
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

In [75]:
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"]
)[0]

In [76]:
for run in runs:
    print(f'run_id:{run.info.run_id}, rmse:{run.data.metrics["rmse"]}')

AttributeError: 'tuple' object has no attribute 'info'

In [81]:
best_run_id = runs.info.run_id
best_run_id

'332da12b29be4a7fb4a05ce3e9e9d5ff'

In [82]:
best_run_metric = runs.data.metrics["rmse"]
best_run_metric

530561.7354094362

In [83]:
prod_model = client.get_latest_versions(model_registry_name, stages=["Production"])[0]
prod_model_run_id = prod_model.run_id
prod_model_run_id

'332da12b29be4a7fb4a05ce3e9e9d5ff'

In [84]:
prod_model

<ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [89]:
client.get_run(run_id=prod_model_run_id).data.metrics["rmse"]

530561.7354094362