In [1]:
# !pip install catboost -q

In [2]:
# pip install mlflow -q

In [3]:
# !pip install lightgbm

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_extraction import DictVectorizer

import pickle
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

# from mlflow import MlflowClient
import optuna

In [5]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = 'housing-price'

In [6]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/home/azureuser/MLOps_zoomcamp/mlruns/1', creation_time=1691845828281, experiment_id='1', last_update_time=1691845828281, lifecycle_stage='active', name='housing-price', tags={}>

In [7]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [8]:
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [9]:
for run in runs:
    print(f'run_id:{run.info.run_id}, rmse:{run.data.metrics["rmse"]}')

run_id:289a7bfd5fb34d3e99f9177259853741, rmse:511932.5641417451
run_id:d5e764b7c11a47909f54d6381d2f2718, rmse:513014.6579161583
run_id:aafb1d5434d5498e9cfd01575f050d5c, rmse:515120.72860757157
run_id:ed6dfd34401941dca87f607eaa2478c1, rmse:515166.62504973076
run_id:9bfb782e3c9e4b1d93a0d61f75998d79, rmse:515423.8397249803


In [7]:
run_id = "332da12b29be4a7fb4a05ce3e9e9d5ff"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="housing_price")

Registered model 'housing_price' already exists. Creating a new version of this model...
2023/08/16 13:07:59 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: housing_price, version 4
Created version '4' of model 'housing_price'.


<ModelVersion: aliases=[], creation_timestamp=1692191279513, current_stage='None', description=None, last_updated_timestamp=1692191279513, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [14]:
model_name = "housing_price"
latest_version = client.get_latest_versions(name=model_name)

In [15]:
latest_version[0]

<ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='', source='/home/azureuser/MLOps_zoomcamp/mlruns/1/b3de1b48f0f7480c85468b6fb837bd97/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [17]:
# client.transition_model_version_stage(
#     name=model_name,
#     version=latest_version[0].version,
#     stage="Production",
#     archive_existing_versions=False
# )

In [18]:
client.get_model_version_stages(
    name=model_name,
    version=latest_version[0].version
)

['None', 'Staging', 'Production', 'Archived']

In [19]:
registered_model = client.get_registered_model(
    name=model_name
)

In [20]:
registered_model.latest_versions

[<ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='', source='/home/azureuser/MLOps_zoomcamp/mlruns/1/b3de1b48f0f7480c85468b6fb837bd97/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>,
 <ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>,
 <ModelVersion: aliases=[], creation_timestamp=1692190105402, current_stage='None', description=None, last_updated_timestamp=1692190105402, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', 

In [21]:
for model in registered_model:
    print(model)

('aliases', {})
('creation_timestamp', 1691853852199)
('description', '')
('last_updated_timestamp', 1692190105402)
('latest_versions', [<ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='', source='/home/azureuser/MLOps_zoomcamp/mlruns/1/b3de1b48f0f7480c85468b6fb837bd97/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>, <ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>, <ModelVersion: aliases=[], creation_timestamp=1692190105402, current_s

In [10]:
data_path = 'data/Housing_dataset_train.csv'

In [11]:
df = pd.read_csv(data_path)

In [12]:
X = df.drop(columns=['price'], axis=0)
y = df.price

In [13]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2)

In [14]:
y_test

1715     1.313872e+06
10221    8.243404e+05
13572    5.041279e+06
4356     3.593519e+06
4458     4.304868e+06
             ...     
10219    9.495268e+05
3157     1.784090e+06
10101    8.965089e+05
10871    1.239437e+06
1238     1.976153e+06
Name: price, Length: 2800, dtype: float64

In [15]:
pd.merge(X_test, y_test, left_index=True, right_index=True, how='inner')

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price
1715,669,Cross River,Apartment,2.0,1.0,,1.313872e+06
10221,1164,Borno,Cottage,3.0,1.0,3.0,8.243404e+05
13572,11666,Niger,,9.0,6.0,6.0,5.041279e+06
4356,9278,Rivers,Semi-detached duplex,7.0,7.0,,3.593519e+06
4458,7545,Cross River,Penthouse,9.0,1.0,,4.304868e+06
...,...,...,...,...,...,...,...
10219,215,,,1.0,1.0,3.0,9.495268e+05
3157,689,Osun,Penthouse,2.0,,2.0,1.784090e+06
10101,2363,Gombe,Bungalow,4.0,1.0,3.0,8.965089e+05
10871,12806,Imo,Flat,2.0,,4.0,1.239437e+06


## Modelling

In [16]:
state_to_zone = {
    "Abia": "South-East",
    "Adamawa": "North-East",
    "Akwa Ibom": "South-South",
    "Anambra": "South-East",
    "Bauchi": "North-East",
    "Bayelsa": "South-South",
    "Benue": "North-Central",
    "Borno": "North-East",
    "Cross River": "South-South",
    "Delta": "South-South",
    "Ebonyi": "South-East",
    "Edo": "South-South",
    "Ekiti": "South-West",
    "Enugu": "South-East",
    "Gombe": "North-East",
    "Imo": "South-East",
    "Jigawa": "North-West",
    "Kaduna": "North-West",
    "Kano": "North-West",
    "Katsina": "North-West",
    "Kebbi": "North-West",
    "Kogi": "North-Central",
    "Kwara": "North-Central",
    "Lagos": "South-West",
    "Nasarawa": "North-Central",
    "Niger": "North-Central",
    "Ogun": "South-West",
    "Ondo": "South-West",
    "Osun": "South-West",
    "Oyo": "South-West",
    "Plateau": "North-Central",
    "Rivers": "South-South",
    "Sokoto": "North-West",
    "Taraba": "North-East",
    "Yobe": "North-East",
    "Zamfara": "North-West",
}

In [17]:
house_type_ranks = {
    'Cottage': 1,
    'Bungalow': 2,
    'Townhouse': 3,
    'Terrace duplex': 4,
    'Detached duplex': 5,
    'Semi-detached duplex': 6,
    'Flat': 7,
    'Penthouse': 8,
    'Apartment': 9,
    'Mansion': 10
}

In [18]:
def preprocess(data_path):
    data = pd.read_csv(data_path)

    print(data.columns.tolist())
    
    data['zone'] = data['loc'].map(state_to_zone)
    data['title'] = data['title'].map(house_type_ranks)

    category_frequencies = data['loc'].value_counts(normalize=True)
    loc_frequency_mapping = category_frequencies.to_dict()
    data['loc'] = data['loc'].map(loc_frequency_mapping)

    data['rooms'] = data['bathroom'] + data['bedroom']
    data['bathroom_ratio'] = data['bathroom']/(data['bathroom'] + data['bedroom'])

    data['zone'] = data['zone'].astype('category').cat.codes

    print("_____________________________________________________________________________")
    print(data.head())

    X = data.drop(columns=['price'], axis=0)
    y = data.price

    return X, y



In [19]:
X_, y_ = preprocess(data_path)

['ID', 'loc', 'title', 'bedroom', 'bathroom', 'parking_space', 'price']
_____________________________________________________________________________
      ID       loc  title  bedroom  bathroom  parking_space        price  \
0   3583  0.028309    6.0      2.0       2.0            1.0  1149999.565   
1   2748  0.028227    9.0      NaN       2.0            4.0  1672416.689   
2   9261  0.027570    NaN      7.0       5.0            NaN  3364799.814   
3   2224  0.029786    5.0      5.0       2.0            4.0  2410306.756   
4  10300  0.026340    4.0      NaN       5.0            6.0  2600700.898   

   zone  rooms  bathroom_ratio  
0     2    4.0        0.500000  
1     5    NaN             NaN  
2     5   12.0        0.416667  
3     3    7.0        0.285714  
4     0    NaN             NaN  


In [20]:
X_dicts = X_.to_dict(orient='records')

In [21]:
dv = DictVectorizer()


In [22]:
mlflow.lightgbm.autolog(disable=True)

In [35]:
with mlflow.start_run():

    params = {
        'max_depth': 10,
        'n_estimators': 2000,
        'learning_rate': 0.002712819361612371,
        'colsample_bytree': 0.9484547548287134,
        'subsample': 0.8490126211976283
        }

    mlflow.log_params(params)

    fold_pred = []
    splits = 2
    fold = KFold(n_splits=splits)

    for data_index, test_index in fold.split(X_, y_):
        X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
        y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

        model = LGBMRegressor(**params, objective='rmse')
        model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
        model_preds = model.predict(X_test)

        rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
        print(f'err: {rmse}')
        fold_pred.append(rmse)

    RMSE = np.mean(fold_pred)

    mlflow.log_param("splits", splits)
    mlflow.log_metric("rmse", RMSE)
    
    with open('models/lgb.bin', 'wb') as f:
        pickle.dump(model, f)

    mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
    mlflow.lightgbm.log_model(model, artifact_path="models_mlflow")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 9
[LightGBM] [Info] Start training from score 1425.058519


err: 529324.2298639654
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 9
[LightGBM] [Info] Start training from score 1423.011279
err: 571620.4886453262


In [41]:
with mlflow.start_run():

    params = {
        'max_depth': 10,
        'n_estimators': 2000,
        'subsample': 0.84,
        'learning_rate': 0.01,
        'n_estimators' : 2000
        }

    mlflow.log_params(params)

    fold_pred_1 = []
    splits = 10
    fold = KFold(n_splits=splits)

    for data_index, test_index in fold.split(X_, y_):
        X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
        y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

        model = CatBoostRegressor(**params)
        model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)], verbose=0)
        model_preds = model.predict(X_test)

        rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
        print(f'err: {rmse}')
        fold_pred_1.append(rmse)

    RMSE = np.mean(fold_pred_1)

    mlflow.log_param("splits", splits)
    mlflow.log_metric("rmse", RMSE)
    
    with open('models/cat.bin', 'wb') as f:
        pickle.dump(model, f)

    mlflow.log_artifact(local_path="models/cat.bin", artifact_path="models_pickle")
    mlflow.catboost.log_model(model, artifact_path="models_mlflow")

err: 466743.0169847345
err: 524069.56855255977
err: 638941.7498937332
err: 470119.679511906
err: 476696.0579938225
err: 588413.1371192657
err: 552106.7342063364
err: 494535.4359102119
err: 513112.49038561864
err: 633301.7921100028


## Model Registry

In [9]:
client.list_experiments()

AttributeError: 'MlflowClient' object has no attribute 'list_experiments'

# Hyperparameter Tuning

In [None]:
import optuna

In [None]:
# def objective(trial):

#     max_depth = trial.suggest_int('rf_max_depth', 2, 32)
#     n_estimators = trial.suggest_int('n_estimators', 100, 4000)
#     learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
#     colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
#     subsample = trial.suggest_float('subsample', 0, 1)

#     params = {
#         'max_depth':max_depth,
#         'colsample_bytree': colsample_bytree,
#         'learning_rate': learning_rate,
#         'n_estimators': n_estimators,
#         'subsample': subsample,
#     }

#     X_data, X_val, y_data, y_val = data_test_split(X, y, random_state=RANDOM_STATE)

#     LGB = CatBoostRegressor(**params)
#     LGB.fit(X_data, y_data)
#     y_pred = LGB.predict(X_val)

#     error = mean_squared_error(y_val, y_pred, squared=False)

#     return error  # An objective value linked with the Trial object.

#  # Invoke optimization of the objective function.

In [8]:
# with mlflow.start_run():

#     params = {
#         'max_depth': 10,
#         'n_estimators': 2000,
#         'learning_rate': 0.002712819361612371,
#         'colsample_bytree': 0.9484547548287134,
#         'subsample': 0.8490126211976283
#         }

#     mlflow.log_params(params)

#     fold_pred = []
#     splits = 2
#     fold = KFold(n_splits=splits)

#     for data_index, test_index in fold.split(X_, y_):
#         X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
#         y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

#         model = LGBMRegressor(**params, objective='rmse')
#         model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
#         model_preds = model.predict(X_test)

#         rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
#         print(f'err: {rmse}')
#         fold_pred.append(rmse)

#     RMSE = np.mean(fold_pred)

#     mlflow.log_param("splits", splits)
#     mlflow.log_metric("rmse", RMSE)
    
#     with open('models/lgb.bin', 'wb') as f:
#         pickle.dump(model, f)

#     # mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
#     mlflow.lightgbm.log_model(model, artifact_path="models_mlflow")

In [23]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "lgboost")
        mlflow.log_params(params)
        
        fold_pred = []
        splits = 5
        fold = KFold(n_splits=splits)

        for data_index, test_index in fold.split(X_, y_):
            X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
            y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

            model = LGBMRegressor(**params, objective='rmse')
            model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
            model_preds = model.predict(X_test)

            rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
            print(f'err: {rmse}')
            fold_pred.append(rmse)

        RMSE = np.mean(fold_pred)

        mlflow.log_param("splits", splits)
        mlflow.log_metric("rmse", RMSE)

    return RMSE

In [24]:
max_depth = trial.suggest_int('rf_max_depth', 2, 16)
n_estimators = trial.suggest_int('n_estimators', 100, 4000)
learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
subsample = trial.suggest_float('subsample', 0, 1)

params = {
    'max_depth':max_depth,
    'colsample_bytree': colsample_bytree,
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'subsample': subsample,
}

NameError: name 'trial' is not defined

In [25]:
def objective(trial):

    with mlflow.start_run():

        max_depth = trial.suggest_int('rf_max_depth', 2, 16)
        n_estimators = trial.suggest_int('n_estimators', 100, 4000)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0, 1)
        subsample = trial.suggest_float('subsample', 0, 1)

        params = {
            'max_depth':max_depth,
            'colsample_bytree': colsample_bytree,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
        }

        mlflow.log_params(params)

        fold_pred = []
        splits = 5
        fold = KFold(n_splits=splits)

        for data_index, test_index in fold.split(X_, y_):
            X_data, X_test = X_.iloc[data_index], X_.iloc[test_index]
            y_data, y_test = np.sqrt(y_.iloc[data_index]), y_.iloc[test_index]

            model = LGBMRegressor(**params, objective='rmse')
            model.fit(X_data, y_data, eval_set=[(X_data, y_data), (X_test, y_test)])
            model_preds = model.predict(X_test)

            rmse = mean_squared_error(y_test, np.square(model_preds), squared=False)
            print(f'err: {rmse}')
            fold_pred.append(rmse)

        RMSE = np.mean(fold_pred)

        mlflow.log_param("splits", splits)
        mlflow.log_metric("rmse", RMSE)
        
        with open('models/lgb.bin', 'wb') as f:
            pickle.dump(model, f)

        mlflow.log_artifact(local_path="models/lgb.bin", artifact_path="models_pickle")
        mlflow.lightgbm.log_model(model, artifact_path="models_mlflow")

    return RMSE  # An objective value linked with the Trial object.

In [26]:
study = optuna.create_study(direction='minimize')  # Create a new study.
study.optimize(objective, n_trials=10)

[I 2023-08-21 12:32:26,198] A new study created in memory with name: no-name-cdf7e8ef-2c7c-40bc-9405-0cca64d5e09e


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 995879.5173730514
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1002478.3572151061
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 991242.5808691392
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start traini

[I 2023-08-21 12:32:42,818] Trial 0 finished with value: 999640.0664854965 and parameters: {'rf_max_depth': 10, 'n_estimators': 1284, 'learning_rate': 0.00015295561820655756, 'colsample_bytree': 0.6289654466753144, 'subsample': 0.7266524465234102}. Best is trial 0 with value: 999640.0664854965.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 1063371.7517184794
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1068647.7429515899
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 1058861.039896482
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start train

[I 2023-08-21 12:33:25,799] Trial 1 finished with value: 1066429.6293254145 and parameters: {'rf_max_depth': 16, 'n_estimators': 3641, 'learning_rate': 1.1689290611641335e-05, 'colsample_bytree': 0.7593772955524063, 'subsample': 0.3201467610332336}. Best is trial 0 with value: 999640.0664854965.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 606293.2555713283
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 635586.6446375854
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 608651.7272673202
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start trainin

[I 2023-08-21 12:34:09,691] Trial 2 finished with value: 626158.8801000996 and parameters: {'rf_max_depth': 10, 'n_estimators': 3867, 'learning_rate': 0.0005446742835420265, 'colsample_bytree': 0.6953907303147544, 'subsample': 0.5478418730971762}. Best is trial 2 with value: 626158.8801000996.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 1078085.1082066805
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1082356.8367775546
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 1073298.7671595083
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start trai

[I 2023-08-21 12:34:16,727] Trial 3 finished with value: 1080675.834401279 and parameters: {'rf_max_depth': 3, 'n_estimators': 552, 'learning_rate': 7.149335168432028e-05, 'colsample_bytree': 0.23861554961383014, 'subsample': 0.5204752093673649}. Best is trial 2 with value: 626158.8801000996.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 1070026.076798823
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1075433.7991277804
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 1065721.3372176327
You can set `force_r

[I 2023-08-21 12:34:28,642] Trial 4 finished with value: 1073208.6784717583 and parameters: {'rf_max_depth': 4, 'n_estimators': 874, 'learning_rate': 3.315093396827668e-05, 'colsample_bytree': 0.9567973820655036, 'subsample': 0.5681274424498042}. Best is trial 2 with value: 626158.8801000996.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 532018.5881969255
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 564689.7309207144
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 538321.462629652
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training

[I 2023-08-21 12:34:54,239] Trial 5 finished with value: 553496.8006874073 and parameters: {'rf_max_depth': 6, 'n_estimators': 3498, 'learning_rate': 0.010977018891337974, 'colsample_bytree': 0.06068828486605282, 'subsample': 0.44624487607471774}. Best is trial 5 with value: 553496.8006874073.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 502220.2860718648
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 561737.1771291043
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 508540.3458056217
You can set `force_row

[I 2023-08-21 12:35:21,318] Trial 6 finished with value: 535102.5659365132 and parameters: {'rf_max_depth': 11, 'n_estimators': 2062, 'learning_rate': 0.011580191656404183, 'colsample_bytree': 0.8585992596560469, 'subsample': 0.25712758370429256}. Best is trial 6 with value: 535102.5659365132.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 1032059.500762419
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1038823.7078971636
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 1027801.3516592025
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

[I 2023-08-21 12:36:01,243] Trial 7 finished with value: 1035759.3385744098 and parameters: {'rf_max_depth': 11, 'n_estimators': 2545, 'learning_rate': 3.8392320773774175e-05, 'colsample_bytree': 0.8457769474836591, 'subsample': 0.269130673074513}. Best is trial 6 with value: 535102.5659365132.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 1073211.7701665354
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1078002.9723134502
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 1068607.3993571284
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total B

[I 2023-08-21 12:36:23,197] Trial 8 finished with value: 1076056.1791491085 and parameters: {'rf_max_depth': 8, 'n_estimators': 1363, 'learning_rate': 2.088262038531234e-05, 'colsample_bytree': 0.512851859532893, 'subsample': 0.07518794723752475}. Best is trial 6 with value: 535102.5659365132.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.804020
err: 1075910.7377297597
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1423.806812
err: 1080453.9331831643
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start training from score 1425.268171
err: 1071169.0168984798
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 11200, number of used features: 9
[LightGBM] [Info] Start trai

[I 2023-08-21 12:36:51,249] Trial 9 finished with value: 1078619.9698041747 and parameters: {'rf_max_depth': 5, 'n_estimators': 3751, 'learning_rate': 1.946594026297741e-05, 'colsample_bytree': 0.07118161174081294, 'subsample': 0.7288067586485617}. Best is trial 6 with value: 535102.5659365132.


In [22]:
trial = study.best_trial

In [23]:
trial.value

533861.2215643647

In [24]:
trial.params

{'rf_max_depth': 6,
 'n_estimators': 3914,
 'learning_rate': 0.014025985859548822,
 'colsample_bytree': 0.21383255211038665,
 'subsample': 0.3887821186692507}

In [43]:
runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=10,
        # order_by=["metrics.rmse"]
    )

In [49]:
best_run = float('inf')
run_id = None
for run in runs:
    print(run.info.run_id, run.data.metrics["rmse"])
    if run.data.metrics["rmse"] < best_run:
        best_run = run.data.metrics["rmse"]
        run_id = run.info.run_id
best_run, run_id

d1cecb4a90e549b0b464757cdc6d8fb9 911095.5193719994
f32f96a98ed84446be15a4a93017c66f 1049363.0640975821
48096c6d543e475d91523345274140c4 1072370.9909723853
58dd1fdb99ce4f26b9b60fe02129a74a 533861.2215643647
c9ed38cab16f465aa47d590fe1abb0b9 960717.9550722272
5e8b8ef4976643f7952b1697f212d68a 1044373.7589869536
976b4f32b5fa4f77aa62d9f6862af98d 1071792.6109700385
5b63e490a3a04b6b88f663d6ecaab255 1040556.517829923
2d83141fdc604498bc9337bda0fd7083 540141.3350229821
3d1a37e45e7b48798cdc1a923c2021ae 583052.301024107


(533861.2215643647, '58dd1fdb99ce4f26b9b60fe02129a74a')

In [53]:
model_registry_name = "housing_price"
version_name = 1

In [61]:
mlflow.register_model(
    model_uri=f"runs:/{run_id}/models",
    name=model_registry_name
)

Registered model 'housing_price' already exists. Creating a new version of this model...
2023/08/16 14:00:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: housing_price, version 6
Created version '6' of model 'housing_price'.


<ModelVersion: aliases=[], creation_timestamp=1692194442478, current_stage='None', description=None, last_updated_timestamp=1692194442478, name='housing_price', run_id='58dd1fdb99ce4f26b9b60fe02129a74a', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/58dd1fdb99ce4f26b9b60fe02129a74a/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=6>

In [63]:
client.search_model_versions()

[<ModelVersion: aliases=[], creation_timestamp=1692194442478, current_stage='None', description=None, last_updated_timestamp=1692194442478, name='housing_price', run_id='58dd1fdb99ce4f26b9b60fe02129a74a', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/58dd1fdb99ce4f26b9b60fe02129a74a/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=6>,
 <ModelVersion: aliases=[], creation_timestamp=1692194365444, current_stage='None', description=None, last_updated_timestamp=1692194365444, name='housing_price', run_id='58dd1fdb99ce4f26b9b60fe02129a74a', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/58dd1fdb99ce4f26b9b60fe02129a74a/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=5>,
 <ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1692193821967, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_link='

In [72]:
latest_version = client.get_latest_versions(name=model_registry_name, stages=['None'])[0].version

In [73]:
client.transition_model_version_stage(
            name=model_registry_name,
            version=latest_version,
            stage="Staging",
            archive_existing_versions=False,
        )

<ModelVersion: aliases=[], creation_timestamp=1692194442478, current_stage='Staging', description=None, last_updated_timestamp=1692195196465, name='housing_price', run_id='58dd1fdb99ce4f26b9b60fe02129a74a', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/58dd1fdb99ce4f26b9b60fe02129a74a/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=6>

In [41]:
housing_details = {
    "ID": 343,
    "loc": "lagos",
    "title": "Mansion",
    "bedroom": 2.0,
    "bathroom": 1.0,
    "parking_space" : 2.0
}


In [42]:
x = pd.DataFrame([housing_details])

In [43]:
from prepare_features import prepare

In [44]:
prepare(x)

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,zone,rooms,bathroom_ratio
0,343,1.0,10,2.0,1.0,2.0,-1,3.0,0.333333


In [45]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = 'housing-price'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

model_registry_name = "housing_price"

client = MlflowClient()
prod_model = client.get_latest_versions(model_registry_name, stages=["Production"])[0]

In [46]:
prod_model

<ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [47]:
prod_model.version

2

In [48]:
run_id = prod_model.run_id
run_id

'332da12b29be4a7fb4a05ce3e9e9d5ff'

In [49]:
logged_model = f'runs:/{run_id}/model'
logged_model

'runs:/332da12b29be4a7fb4a05ce3e9e9d5ff/model'

In [50]:
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [58]:
float(loaded_model.predict(prepare(x))[0])

1246.8361280347074

In [21]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.lightgbm
  run_id: 332da12b29be4a7fb4a05ce3e9e9d5ff

In [None]:
model_name = "housing_price"
latest_version = client.get_latest_versions(name=model_name)


latest_version[0].version

client.transition_model_version_stage(
    name=model_name,
    version=latest_version[0].version,
    stage="Staging",
    archive_existing_versions=False
)

In [93]:
client.search_model_versions()

[<ModelVersion: aliases=[], creation_timestamp=1691952653200, current_stage='None', description=None, last_updated_timestamp=1691952653200, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>,
 <ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>,
 <ModelVersion: aliases=[], creation_timestamp=1691853856691, current_stage='Staging', description='', last_updated_timestamp=1691951162335, name='housing_price', run_id='b3de1b48f0f7480c85468b6fb837bd97', run_li

In [66]:
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

In [75]:
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"]
)[0]

In [76]:
for run in runs:
    print(f'run_id:{run.info.run_id}, rmse:{run.data.metrics["rmse"]}')

AttributeError: 'tuple' object has no attribute 'info'

In [81]:
best_run_id = runs.info.run_id
best_run_id

'332da12b29be4a7fb4a05ce3e9e9d5ff'

In [82]:
best_run_metric = runs.data.metrics["rmse"]
best_run_metric

530561.7354094362

In [83]:
prod_model = client.get_latest_versions(model_registry_name, stages=["Production"])[0]
prod_model_run_id = prod_model.run_id
prod_model_run_id

'332da12b29be4a7fb4a05ce3e9e9d5ff'

In [84]:
prod_model

<ModelVersion: aliases=[], creation_timestamp=1691950576764, current_stage='Production', description=None, last_updated_timestamp=1691952104895, name='housing_price', run_id='332da12b29be4a7fb4a05ce3e9e9d5ff', run_link=None, source='/home/azureuser/MLOps_zoomcamp/mlruns/1/332da12b29be4a7fb4a05ce3e9e9d5ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [89]:
client.get_run(run_id=prod_model_run_id).data.metrics["rmse"]

530561.7354094362