In [1]:
import os
%pwd

'd:\\Data Science\\END to END Proj\\Fertilizer_Pred_MLOPS\\research'

In [2]:
os.chdir("../")

In [3]:
import dagshub
dagshub.init(repo_owner='gowtham-dd', repo_name='Fertilizer_Pred_MLOPS', mlflow=True)

import mlflow
with mlflow.start_run():
  mlflow.log_param('parameter name', 'value')
  mlflow.log_metric('metric name', 1)

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    test_data_path: Path
    model_dir: Path  # Directory containing xgb_fold0.bin, xgb_fold1.bin, etc.
    metric_file_name: Path
    target_column: str
    mlflow_uri: str
    label_encoder_path: Path
    all_params: dict

In [5]:
from src.Fertilizer_Pred.utils.common import read_yaml, create_directories
from src.Fertilizer_Pred.constant import *
from pathlib import Path

class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation
        params = self.params.XGBoost  
        return ModelEvaluationConfig(
            root_dir=Path(config.root_dir),
            test_data_path=Path(config.test_data_path),
            model_dir=Path(config.model_dir),
            metric_file_name=Path(config.metric_file_name),
            target_column=self.schema.TARGET_COLUMN.name,
            mlflow_uri=config.mlflow_uri,
            label_encoder_path=Path(config.label_encoder_path),
            all_params=params
        )


In [6]:
import json

def save_json(path: Path, data: dict):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)



In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
from urllib.parse import urlparse
from sklearn.preprocessing import OrdinalEncoder
from typing import List
import joblib
from pathlib import Path
from src.Fertilizer_Pred.utils.common import save_json  # Adjust import to your actual utility location

class ModelEvaluation:
    def __init__(self, config):
        self.config = config

    def map_at_k(self, actual: np.ndarray, predicted: np.ndarray, k: int = 3) -> float:
        total_score = 0.0
        for true_idx, pred_top_k in zip(actual, predicted):
            if true_idx in pred_top_k[:k]:
                rank = np.where(pred_top_k[:k] == true_idx)[0][0] + 1
                total_score += 1.0 / rank
        return total_score / len(actual)

    def _load_models(self) -> List[xgb.Booster]:
        model_paths = list(self.config.model_dir.glob("xgb_fold*.bin"))
        models = []
        for path in model_paths:
            model = xgb.Booster()
            model.load_model(str(path))
            models.append(model)
        return models

    def evaluate_ensemble(self):
        test_data = pd.read_csv(self.config.test_data_path)

        # Load label encoder
        label_encoder_path = self.config.label_encoder_path
        if not label_encoder_path.exists():
            raise FileNotFoundError(f"Label encoder not found at: {label_encoder_path}")
        le = joblib.load(label_encoder_path)

        # Extract X_test and optionally y_test
        if self.config.target_column in test_data.columns:
            y_test = test_data[self.config.target_column]
            y_test_encoded = le.transform(y_test)
            X_test = test_data.drop(columns=[self.config.target_column])
        else:
            y_test_encoded = None
            X_test = test_data.copy()

        # Ordinal encode object columns
        obj_cols = X_test.select_dtypes(include='object').columns
        if len(obj_cols) > 0:
            X_test[obj_cols] = OrdinalEncoder().fit_transform(X_test[obj_cols])

        dtest = xgb.DMatrix(X_test)

        models = self._load_models()
        pred_probs = np.mean([model.predict(dtest) for model in models], axis=0)
        top3_preds = np.argsort(-pred_probs, axis=1)[:, :3]

        if y_test_encoded is not None:
            map3 = self.map_at_k(y_test_encoded, top3_preds)
            acc = np.mean(y_test_encoded == top3_preds[:, 0])
            return {"MAP@3": map3, "accuracy": acc}
        else:
            pred_labels = le.inverse_transform(np.argmax(pred_probs, axis=1))
            pd.DataFrame(pred_labels, columns=["Predicted Fertilizer"]).to_csv(
                self.config.root_dir / "predictions.csv", index=False
            )
            return {"status": "Prediction complete. Evaluation skipped (no true labels)."}

    def log_into_mlflow(self):
        mlflow.set_tracking_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            # Evaluation
            metrics = self.evaluate_ensemble()
            save_json(path=self.config.metric_file_name, data=metrics)

            # Log hyperparameters
            mlflow.log_params(self.config.all_params)

            # Log metrics
            for key, value in metrics.items():
                try:
                    mlflow.log_metric(key, float(value))
                except Exception as e:
                    print(f"[MLflow] Failed to log metric {key}: {e}")

            # Log label encoder
            if self.config.label_encoder_path.exists():
                mlflow.log_artifact(str(self.config.label_encoder_path), artifact_path="label_encoder")

            # Log each model
            if tracking_url_type_store != "file":
                for i, model_path in enumerate(self.config.model_dir.glob("xgb_fold*.bin")):
                    booster_model = xgb.Booster()
                    booster_model.load_model(str(model_path))

                    mlflow.xgboost.log_model(
                        xgb_model=booster_model,
                        artifact_path=f"model_fold_{i}",
                        registered_model_name=f"XGBoost_Fertilizer_Fold_{i}"
                    )


In [12]:
try:
    # Initialize configuration manager
    config = ConfigurationManager()
    
    # Get evaluation config
    model_evaluation_config = config.get_model_evaluation_config()
    
    # Initialize and run model evaluation
    model_evaluator = ModelEvaluation(config=model_evaluation_config)
    model_evaluator.log_into_mlflow()

except Exception as e:
    raise e


[2025-07-10 15:12:14,657: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-10 15:12:14,660: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-10 15:12:14,672: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-07-10 15:12:14,675: INFO: common: created directory at: artifacts]
[2025-07-10 15:20:39,584: INFO: common: json file saved at: artifacts\model_evaluation\metrics.json]
[MLflow] Failed to log metric MAP@3: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}


Successfully registered model 'XGBoost_Fertilizer_Fold_0'.
2025/07/10 15:24:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: XGBoost_Fertilizer_Fold_0, version 1
Created version '1' of model 'XGBoost_Fertilizer_Fold_0'.
Successfully registered model 'XGBoost_Fertilizer_Fold_1'.
2025/07/10 15:28:15 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: XGBoost_Fertilizer_Fold_1, version 1
Created version '1' of model 'XGBoost_Fertilizer_Fold_1'.
