In [1]:
import os
import pickle

In [2]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Regression-Project\\research'

In [3]:
# Step up from research/ to the project root so the relative paths used by the
# cells below (config/config.yaml, params.yaml, artifacts/...) resolve.
# Guarded on the directory name so that re-running this cell, or starting the
# kernel from the project root, does not climb above the project.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Regression-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of settings used by the model-evaluation step.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the loaded config, params and schema YAML files.
    """
    # Output directory for this step; created when the config is built.
    root_dir: Path
    # Path the computed metrics (rmse / mae / r2) are saved to as JSON.
    metric_file_name: Path
    # URI handed to ``mlflow.set_registry_uri``.
    mlflow_uri: str
    # Transformed test / train data files, passed to ``read_transformed_data``.
    transformed_data_test: Path
    transformed_data_train: Path
    # Pickled trained model, loaded with ``load_object_pkl``.
    model_path: Path
    # Full contents of the params file as loaded by ``read_yaml``.
    all_params: dict
    # Name of the target column, taken from ``schema.TARGET_COLUMN.name``.
    target_column: str

@dataclass(frozen=True)
class DagsHubConfig:
    """Immutable settings forwarded as keyword arguments to ``dagshub.init``."""
    # Owner of the DagsHub repository (``repo_owner=`` argument).
    repo_owner: str
    # Name of the DagsHub repository (``repo_name=`` argument).
    repo_name: str
    # Value of the ``mlflow=`` argument of ``dagshub.init``.
    mlflow: bool


In [6]:
from RegressionProject.constants import *
from RegressionProject.utils.common import read_yaml, create_directories, save_json, load_object_pkl, read_transformed_data

In [7]:
class ConfigurationManager:
    """Reads the config, params and schema YAML files and builds typed configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_dags_hub_config(self) -> DagsHubConfig:
        """Build a ``DagsHubConfig`` from the ``dags_hub`` section of the config."""
        section = self.config.dags_hub
        return DagsHubConfig(
            repo_owner=section.repo_owner,
            repo_name=section.repo_name,
            mlflow=section.mlflow,
        )

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build a ``ModelEvaluationConfig`` and create its output directory.

        Values are gathered from the ``model_evaluation``, ``model_trainer`` and
        ``data_transformation`` config sections, the params file and the
        schema's ``TARGET_COLUMN`` entry.
        """
        evaluation_section = self.config.model_evaluation
        trainer_section = self.config.model_trainer
        transformation_section = self.config.data_transformation

        all_params = self.params
        target = self.schema.TARGET_COLUMN

        create_directories([evaluation_section.root_dir])

        return ModelEvaluationConfig(
            root_dir=evaluation_section.root_dir,
            metric_file_name=evaluation_section.metric_file_name,
            mlflow_uri=evaluation_section.mlflow_uri,
            transformed_data_test=transformation_section.transformed_data_test,
            transformed_data_train=transformation_section.transformed_data_train,
            model_path=trainer_section.trained_model_file_path,
            target_column=target.name,
            all_params=all_params,
        )

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from urllib.parse import urlparse
import dagshub
import mlflow.sklearn
import numpy as np

In [33]:
import subprocess
from RegressionProject.utils.common import load_best_model_from_json

class ModelEvaluation:
    """Scores the trained model on the test split and records the run in MLflow.

    The run is tracked through DagsHub (``dagshub.init``) and carries the
    evaluation metrics, the best model's hyper-parameters, git provenance,
    a set of project files as artifacts, and the model itself.
    """

    # Training summary JSON: read for the best model's name and parameters and
    # also uploaded as a run artifact. Kept in one place so both uses agree.
    TRAIN_RESULTS_PATH = "artifacts/model_trainer/train_results.json"

    # (local file, artifact sub-directory) pairs uploaded with every run.
    # Paths are relative to the current working directory.
    RUN_ARTIFACTS = (
        ('params.yaml', "all_models_params/all_models_and_params"),
        (TRAIN_RESULTS_PATH, "all_models_params/best_models_params"),
        ('schema.yaml', "schema"),
        ('requirements.txt', "training_requirements"),
        ('config/config.yaml', "configuration"),
        ('logs/running_logs.log', "training_logs"),
    )

    def __init__(self, evaluation_config: ModelEvaluationConfig , dags_hub_config: DagsHubConfig):
        """Store the evaluation settings and the DagsHub connection settings."""
        self.evaluation_config = evaluation_config
        self.dags_hub_config = dags_hub_config

    def eval_metrics(self, actual, pred):
        """Compute regression metrics for a set of predictions.

        Args:
            actual: Ground-truth target values.
            pred: Predicted target values.

        Returns:
            Tuple ``(rmse, mae, r2)``.
        """
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    @staticmethod
    def _run_git(*args):
        """Run ``git <args>`` in the current directory and return its stripped stdout."""
        return subprocess.check_output(["git", *args]).strip().decode()

    def get_git_info(self):
        """Return ``(repo_url, commit_hash, branch_name)`` for the current checkout.

        Raises:
            subprocess.CalledProcessError: If a git command exits with a
                non-zero status.
        """
        repo_url = self._run_git("config", "--get", "remote.origin.url")
        commit_hash = self._run_git("rev-parse", "HEAD")
        branch_name = self._run_git("rev-parse", "--abbrev-ref", "HEAD")
        return repo_url, commit_hash, branch_name

    def log_into_mlflow(self):
        """Evaluate the saved model on the test split and log everything to MLflow.

        Steps: collect git provenance, initialise DagsHub tracking, load the
        test data and the pickled model, compute and save the metrics locally,
        then open an MLflow run and log artifacts, parameters, metrics and the
        model (registering it when the tracking store is not file-based).
        """
        repo_url, commit_hash, branch_name = self.get_git_info()

        dagshub.init(
            repo_owner=self.dags_hub_config.repo_owner,
            repo_name=self.dags_hub_config.repo_name,
            mlflow=self.dags_hub_config.mlflow,
        )

        # Only the test split is needed for evaluation.
        _, _, test_x, test_y = read_transformed_data(
            self.evaluation_config.transformed_data_train,
            self.evaluation_config.transformed_data_test,
        )

        # Load the model from the pickle file
        model = load_object_pkl(self.evaluation_config.model_path)

        # NOTE(review): this sets the *registry* URI, while the check below
        # reads the *tracking* URI -- confirm this asymmetry is intended.
        mlflow.set_registry_uri(self.evaluation_config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        predicted_math_scores = model.predict(test_x)
        rmse, mae, r2 = self.eval_metrics(test_y, predicted_math_scores)

        # Save the metrics locally as well as logging them to MLflow below.
        scores = {"rmse": rmse, "mae": mae, "r2": r2}
        save_json(path=Path(self.evaluation_config.metric_file_name), data=scores)

        best_model_name, results = load_best_model_from_json(self.TRAIN_RESULTS_PATH)
        best_params = dict(results[best_model_name]['best_params'])

        with mlflow.start_run():
            for local_path, artifact_path in self.RUN_ARTIFACTS:
                mlflow.log_artifact(local_path, artifact_path=artifact_path)

            # Batched calls: one request per group instead of one per value.
            mlflow.log_params({
                "best_model_name": best_model_name,
                "repo_url": repo_url,
                "branch_name": branch_name,
                "commit_hash": commit_hash,
            })
            # Logged separately so a hyper-parameter whose name collides with a
            # provenance key above is still reported by MLflow, not overwritten.
            if best_params:
                mlflow.log_params(best_params)

            mlflow.log_metrics(scores)

            # Model registry does not work with file store
            if tracking_url_type_store != "file":
                # Register the model
                # There are other ways to use the Model Registry, which depends on the use case,
                # please refer to the doc for more information:
                # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                mlflow.sklearn.log_model(model, "model", registered_model_name="best_regression_model")
            else:
                mlflow.sklearn.log_model(model, "model")



In [34]:
# Build both configs, then evaluate the trained model and log the run to MLflow.
# Any exception propagates unchanged; the former `except ...: raise e` wrapper
# added nothing. The evaluator gets its own name instead of overwriting the
# config object it was built from.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
dags_hub_config = config.get_dags_hub_config()
model_evaluation = ModelEvaluation(model_evaluation_config, dags_hub_config)
model_evaluation.log_into_mlflow()

[2024-07-05 12:48:24,395: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-05 12:48:24,404: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 12:48:24,410: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 12:48:24,413: INFO: common: created directory at: artifacts]
[2024-07-05 12:48:24,416: INFO: common: created directory at: artifacts/model_evaluation]
[2024-07-05 12:48:25,909: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/iheb.aamrii/Mlops-Regression-Project "HTTP/1.1 200 OK"]


[2024-07-05 12:48:25,920: INFO: helpers: Initialized MLflow to track repo "iheb.aamrii/Mlops-Regression-Project"]


[2024-07-05 12:48:25,925: INFO: helpers: Repository iheb.aamrii/Mlops-Regression-Project initialized!]
[2024-07-05 12:48:25,957: INFO: common: json file saved at: artifacts\model_evaluation\metrics.json]


Registered model 'best_regression_model' already exists. Creating a new version of this model...
2024/07/05 12:48:41 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: best_regression_model, version 26
Created version '26' of model 'best_regression_model'.


In [None]:
# Version control: use version control for datasets. Each version of a dataset should be immutable and identifiable via a unique version ID or timestamp.
# This covers the dataset used for training and the one used for testing. What about the transformed data?
# Log data checksums or hashes:
# Calculate and log hashes of the split datasets to verify their integrity.
# Log metadata about the split datasets:
# Log metadata such as the number of records in each split, basic statistics, etc.


# Log split parameters:
# Parameters such as the split ratio, random seed, and stratification details should be logged, so that the train/test data used (transformed or not) can be reproduced or retrieved.
# Log artifacts such as the trained model, plots, and any files generated during the experiment. Log the artifacts of each step, or pointers to them.
# Log environment details such as the versions of the libraries used. What else?




phone notes
read the last conversation
https://chatgpt.com/c/9c12fd3e-7eb8-468b-89ed-fd43b435603e


compare model evaluation to the 3 projects
compare model prediction and app.py to the 3 projects
use the notebooks' code to update the modularized code: test the notebooks first

add DVC
add Airflow, ZenML, or Kubeflow
Vault
feature store
dockerize the solution
Kubernetes
CI/CD
Grafana and Prometheus for monitoring, and ELK for logs


In [None]:
# Predict from a logged model
import mlflow
import pandas as pd

# Only the test split is used below; the train split is discarded.
_, _, test_x, test_y = read_transformed_data(
    'artifacts/data_transformation/transformed_train_data.csv',
    'artifacts/data_transformation/transformed_test_data.csv',
)

# MLflow run URI of the model artifact to load.
logged_model = 'runs:/3803771e0c3248debaa97c8ae4aa7412/model'

# Load the model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a pandas DataFrame built from the test features.
loaded_model.predict(pd.DataFrame(test_x))