In [1]:
import os

In [2]:
pwd%%

'f:\\Files\\DS&ML\\FraudGuard\\notebooks'

In [3]:
os.chdir("../")

In [4]:
pwd%%

'f:\\Files\\DS&ML\\FraudGuard'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_preprocess: Path
    test_preprocess: Path
    model_name: str
    target_column: str
    n_iter: int     
    cv_folds: int
    scoring: str 
    n_jobs: int
    mlflow_username: str
    mlflow_password: str

In [7]:
from FraudGuard.constants import *
from FraudGuard.utils.helpers import *
from FraudGuard.utils.exceptions import *

In [8]:
import os
import joblib   
import numpy as np
from FraudGuard import logger

class ConfigurationManager:
    def __init__(self,
                config_filepath=CONFIG_PATH,
                schema_filepath=SCHEMA_PATH,
                params_filepath=PARAMS_PATH):
        
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        schema = self.schema
        cv_params = self.params.cross_validation
        mlflow_params = self.params.mlflow

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_preprocess=config.train_preprocess,
            test_preprocess=config.test_preprocess,
            model_name=config.model_name,
            target_column=schema.target_column.name,
            cv_folds=cv_params.cv_folds,            
            scoring=cv_params.scoring,             
            n_jobs=cv_params.n_jobs,
            n_iter=cv_params.n_iter,
            mlflow_username=mlflow_params.mlflow_username,
            mlflow_password=mlflow_params.mlflow_username          
        )
        
        return model_trainer_config 

In [None]:
import os
import numpy as np
import optuna
import mlflow
import dagshub
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from FraudGuard import logger
from FraudGuard.utils.helpers import save_bin, save_json
from FraudGuard.entity.config_entity import ModelTrainerConfig

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

        os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.mlflow_username
        os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.mlflow_password

        dagshub.init(repo_owner="JavithNaseem-J", repo_name="FraudGuard")
        mlflow.set_tracking_uri("https://dagshub.com/JavithNaseem-J/FraudGuard.mlflow")
        mlflow.set_experiment("Fraud-Detection")


        # Supported models and their Optuna search spaces
        self.models = {
            "XGBoost": {
                "class": XGBClassifier,
                "search_space": lambda t: {
                    "n_estimators": t.suggest_int("n_estimators", 50, 300),
                    "max_depth": t.suggest_int("max_depth", 3, 10),
                    "learning_rate": t.suggest_float("learning_rate", 0.01, 0.3),
                    "verbosity": 0,
                    "use_label_encoder": False,
                },
                "mlflow_module": mlflow.xgboost,
            },
            "CatBoost": {
                "class": CatBoostClassifier,
                "search_space": lambda t: {
                    "n_estimators": t.suggest_int("n_estimators", 50, 300),
                    "max_depth": t.suggest_int("max_depth", 5, 10),
                    "learning_rate": t.suggest_float("learning_rate", 0.01, 0.3),
                    "verbose": 0,
                    "allow_writing_files": False,
                },
                "mlflow_module": mlflow.catboost,
            },
            "LightGBM": {
                "class": LGBMClassifier,
                "search_space": lambda t: {
                    "num_leaves": t.suggest_int("num_leaves", 20, 150),
                    "learning_rate": t.suggest_float("learning_rate", 0.01, 0.3),
                    "n_estimators": t.suggest_int("n_estimators", 50, 300),
                    "verbosity": -1,
                },
                "mlflow_module": mlflow.lightgbm,
            },
        }

    def train(self):
        train_arr = np.load(self.config.train_preprocess, allow_pickle=True)
        X, y = train_arr[:, :-1], train_arr[:, -1]

        best_overall = {"model": None, "score": 0, "std": 0, "params": None}

        for name, info in self.models.items():
            logger.info(f"🔍  Starting HPO for {name}")

            def objective(trial):
                params = info["search_space"](trial)
                model = info["class"](**params)
                cv = StratifiedKFold(
                    n_splits=self.config.cv_folds,
                    shuffle=True,
                    random_state=42,
                )
                scores = cross_val_score(
                    model,
                    X,
                    y,
                    scoring=self.config.scoring,
                    cv=cv,
                    n_jobs=self.config.n_jobs,
                )
                mlflow.log_metric("cv_score", scores.mean())
                mlflow.log_metric("cv_std", scores.std())
                mlflow.log_params(params)
                return scores.mean()

            study = optuna.create_study(direction="maximize")
            with mlflow.start_run(run_name=f"{name}_HPO"):
                study.optimize(objective, n_trials=self.config.n_iter)

            best_params = study.best_params
            best_score = study.best_value
            best_std = study.best_trial.user_attrs.get("cv_std", 0)

            if (best_score > best_overall["score"]) or (
                best_score == best_overall["score"] and best_std < best_overall["std"]
            ):
                best_overall.update(
                    dict(model=name, score=best_score, std=best_std, params=best_params)
                )

        logger.info(f"🏆 Best model: {best_overall}")
        final_cls = self.models[best_overall["model"]]["class"]
        final_model = final_cls(**best_overall["params"])
        final_model.fit(X, y)

        model_path = Path(self.config.root_dir) / self.config.model_name
        save_bin(final_model, model_path)
        save_json(
            Path(self.config.root_dir) / "best_model_info.json", best_overall
        )

        with mlflow.start_run(run_name=f"{best_overall['model']}_final"):
            mlflow.log_params(best_overall["params"])
            mlflow.log_metric("best_cv_score", best_overall["score"])
            mlflow.log_metric("best_cv_std", best_overall["std"])
            mlflow.set_tag("stage", "final")
            mlflow.sklearn.log_model(
                final_model,
                artifact_path="model",
                registered_model_name=f"{best_overall['model']}_Model",
            )

        return best_overall

In [10]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_training_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
    
except Exception as e:
    raise CustomException(str(e), sys)

[2025-06-13 17:17:13,554: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-06-13 17:17:13,559: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-06-13 17:17:13,561: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-06-13 17:17:13,561: INFO: helpers: created directory at: artifacts]
[2025-06-13 17:17:13,569: INFO: helpers: created directory at: artifacts/model_trainer]


[2025-06-13 17:17:14,747: INFO: helpers: Repository initialized!]
[2025-06-13 17:17:15,372: INFO: 769789220: 🔍  Starting HPO for XGBoost]


[I 2025-06-13 17:17:15,372] A new study created in memory with name: no-name-dc09c8b0-f713-43dc-9cbc-2ad24238d378
[I 2025-06-13 17:17:29,408] Trial 0 finished with value: 0.9315277627271495 and parameters: {'n_estimators': 298, 'max_depth': 8, 'learning_rate': 0.07215030252167107}. Best is trial 0 with value: 0.9315277627271495.
[W 2025-06-13 17:17:36,821] Trial 1 failed with parameters: {'n_estimators': 158, 'max_depth': 6, 'learning_rate': 0.24409830344144856} because of the following error: RestException("INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}").
Traceback (most recent call last):
  File "c:\Users\Javith Naseem\.conda\envs\FraudGraud\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Javith Naseem\AppData\Local\Temp\ipykernel_19540\769789220.py", line 94, in objective
    mlflow.log_params(params)
  File "c:\Users\Javith Naseem\.conda\envs\FraudGraud\lib\site-packages\mlflow\t

🏃 View run XGBoost_HPO at: https://dagshub.com/JavithNaseem-J/FraudGuard.mlflow/#/experiments/0/runs/b6c4612b082240658d1be3a67e8c645c
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard.mlflow/#/experiments/0


CustomException: Exception in C:\Users\Javith Naseem\AppData\Local\Temp\ipykernel_19540\645348306.py, line 5: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}