In [1]:
import os

In [2]:
# Show the current working directory (pure-Python equivalent of the %pwd magic,
# so the cell also runs outside IPython).
os.getcwd()

'f:\\FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection\\Exp'

In [3]:
# Move one level up from the notebook folder to its parent directory.
# NOTE(review): not idempotent - re-running this cell moves up one more level each time.
os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above (pure-Python equivalent
# of the %pwd magic, so the cell also runs outside IPython).
os.getcwd()

'f:\\FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle consumed by the model-training stage.

    Field order is part of the positional constructor signature and must
    not be changed.
    """

    # --- filesystem locations ---
    root_dir: Path          # output directory of the training stage
    train_preprocess: Path  # preprocessed training array file
    test_preprocess: Path   # preprocessed test array file
    model_name: str         # file name used when persisting the final model

    # --- data description ---
    target_column: str      # name of the label column from the schema

    # --- hyper-parameter search / cross-validation ---
    n_iter: int             # number of search trials per candidate model
    cv_folds: int           # number of cross-validation splits
    scoring: str            # scoring name passed to cross-validation
    n_jobs: int             # parallelism passed to cross-validation

In [None]:
from FraudGuard.constants import *
from FraudGuard.utils.helpers import *
from FraudGuard.utils.exceptions import *

In [None]:
import os
import joblib   
import numpy as np
from FraudGuard import logger

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_PATH,
                 schema_filepath=SCHEMA_PATH,
                 params_filepath=PARAMS_PATH):
        """Load config, schema and params YAML and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_filepath: Path of the schema YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A frozen ``ModelTrainerConfig`` built from the ``model_trainer``
            section of the config, the schema's target column name and the
            ``cross_validation`` section of the params.
        """
        config = self.config.model_trainer
        cv_params = self.params.cross_validation

        create_directories([config.root_dir])

        # The dataclass declares these three fields as Path, so coerce the raw
        # YAML values instead of passing them through unchanged.
        return ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            train_preprocess=Path(config.train_preprocess),
            test_preprocess=Path(config.test_preprocess),
            model_name=config.model_name,
            target_column=self.schema.target_column.name,
            cv_folds=cv_params.cv_folds,
            scoring=cv_params.scoring,
            n_jobs=cv_params.n_jobs,
            n_iter=cv_params.n_iter,
        )

In [None]:
import os
import json
import joblib
import mlflow
import pandas as pd
from pathlib import Path
import numpy as np
import dagshub
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
import optuna
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


class ModelTrainer:
    """Tunes several gradient-boosting classifiers with Optuna and keeps the best.

    Every candidate in ``self.models`` gets its own Optuna study, scored by
    stratified k-fold cross-validation on the training array. The candidate
    with the highest best-trial score is refit on the full training data,
    logged to MLflow and saved under ``config.root_dir``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the config, set up DagsHub/MLflow tracking and the model registry.

        Args:
            config: Training-stage settings (paths, CV and search parameters).
        """
        self.config = config
        dagshub.init(repo_owner="JavithNaseem-J", repo_name="FraudGuard")
        mlflow.set_tracking_uri("https://dagshub.com/JavithNaseem-J/FraudGuard.mlflow")
        mlflow.set_experiment("Fraud-Detection")

        # Candidate registry: estimator class, Optuna search space builder and
        # the matching MLflow flavour module.
        self.models = {
            "XGBoost": {
                "class": XGBClassifier,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                },
                "mlflow_module": mlflow.xgboost,
            },
            "CatBoost": {
                "class": CatBoostClassifier,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 5, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                },
                "mlflow_module": mlflow.catboost,
            },
            "LightGBM": {
                "class": LGBMClassifier,
                "search_space": lambda trial: {
                    "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                },
                "mlflow_module": mlflow.lightgbm,
            },
        }

    def _tune_model(self, model_name, model_info, train_x, train_y):
        """Run one Optuna study for a single candidate model.

        Must be called inside an active MLflow run: every trial is logged as a
        nested run beneath it.

        Args:
            model_name: Key of the candidate in ``self.models`` (used for tags).
            model_info: Registry entry with ``"class"`` and ``"search_space"``.
            train_x: Training features.
            train_y: Training labels.

        Returns:
            Tuple ``(best_params, best_score, best_std)`` of the best trial.
        """
        def objective(trial):
            params = model_info["search_space"](trial)
            model = model_info["class"](**params)
            cv = StratifiedKFold(n_splits=self.config.cv_folds, shuffle=True, random_state=42)
            scores = cross_val_score(
                model, train_x, train_y,
                scoring=self.config.scoring,
                cv=cv,
                n_jobs=self.config.n_jobs
            )
            mean_score = float(scores.mean())
            std_score = float(scores.std())
            # Keep the spread with the trial so the best trial's std can be
            # read back later without re-running cross-validation.
            trial.set_user_attr("cv_std", std_score)

            with mlflow.start_run(run_name="Trial", nested=True):
                mlflow.log_params(params)
                mlflow.log_metric("cv_score", mean_score)
                mlflow.log_metric("cv_std", std_score)
                mlflow.set_tags({
                    "model_name": model_name,
                    "trial_number": trial.number,
                    "stage": "HPO"
                })
            return mean_score

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=self.config.n_iter)

        return study.best_params, study.best_value, study.best_trial.user_attrs["cv_std"]

    def train(self):
        """Tune every candidate, refit the winner and persist it.

        Side effects: creates MLflow runs (one per candidate, one nested run
        per trial, one final run), registers the final model in MLflow, writes
        the fitted model and ``best_model_info.json`` under ``config.root_dir``.

        Returns:
            Dict with keys ``model_name``, ``score``, ``std`` and ``params``
            describing the selected model.

        Raises:
            ValueError: If no candidate produced a score that could be selected.
        """
        # NOTE(security): allow_pickle=True executes pickled payloads on load;
        # only point this at arrays produced by this project's own pipeline.
        train_data = np.load(self.config.train_preprocess, allow_pickle=True)

        # Last column is the target, everything before it is a feature.
        train_x = train_data[:, :-1]
        train_y = train_data[:, -1]

        # Start from -inf so scorers that are always negative (e.g. neg_log_loss)
        # can still produce a winner.
        best_overall = {"model_name": None, "score": float("-inf"), "std": 0, "params": None}

        for model_name, model_info in self.models.items():
            logger.info(f"Starting HPO for: {model_name}")

            with mlflow.start_run(run_name=f"{model_name}_HPO", nested=False):
                best_params, best_score, best_std = self._tune_model(
                    model_name, model_info, train_x, train_y
                )

                mlflow.log_params(best_params)
                mlflow.log_metric("best_cv_score", best_score)
                mlflow.log_metric("best_cv_std", best_std)
                mlflow.set_tag("best_model_candidate", "true")

                if best_score > best_overall["score"]:
                    best_overall.update({
                        "model_name": model_name,
                        "score": best_score,
                        "std": best_std,
                        "params": best_params
                    })

        if best_overall["model_name"] is None:
            raise ValueError("No model produced a valid cross-validation score; nothing to train.")

        # Train final best model and log it separately
        best_model_class = self.models[best_overall["model_name"]]["class"]
        best_model = best_model_class(**best_overall["params"])
        best_model.fit(train_x, train_y)

        with mlflow.start_run(run_name=f"{best_overall['model_name']}_final"):
            mlflow.log_params(best_overall["params"])
            mlflow.log_metric("best_cv_score", best_overall["score"])
            mlflow.log_metric("best_cv_std", best_overall["std"])
            mlflow.set_tags({"model_name": best_overall["model_name"], "stage": "final"})

            mlflow.sklearn.log_model(
                best_model,
                artifact_path="model",
                registered_model_name=f"{best_overall['model_name']}_Model"
            )

            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            save_bin(data=best_model, path=model_path)

            best_model_info_path = os.path.join(self.config.root_dir, "best_model_info.json")
            # The Path wrapper belongs on the file path; the summary dict is the payload.
            save_json(path=Path(best_model_info_path), data=best_overall)
            mlflow.log_artifact(best_model_info_path)

        logger.info(f"Best model overall: {best_overall}")
        return best_overall


In [None]:
# Run the model-training stage end to end: load config, build the trainer, train.
# `sys` is passed to CustomException below; import it here explicitly rather
# than relying on it leaking in through one of the star imports.
import sys

try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_training_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except Exception as e:
    # Chain the original exception so its traceback is not lost.
    raise CustomException(str(e), sys) from e

[2025-05-12 15:39:56,916: INFO: common: yaml file: config_file\config.yaml loaded successfully]
[2025-05-12 15:39:56,982: INFO: common: yaml file: config_file\schema.yaml loaded successfully]
[2025-05-12 15:39:57,017: INFO: common: yaml file: config_file\params.yaml loaded successfully]
[2025-05-12 15:39:57,032: INFO: common: created directory at: artifacts]
[2025-05-12 15:39:57,032: INFO: common: created directory at: artifacts/model_trainer]


[2025-05-12 15:39:57,944: INFO: helpers: Repository initialized!]
[2025-05-12 15:39:58,434: INFO: 1395279945: Starting HPO for: XGBoost]


[I 2025-05-12 15:39:58,811] A new study created in memory with name: no-name-0de300c0-2368-46fb-a149-408b2a2f56f9


🏃 View run Trial at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/dd742af7400240b3aa203f2fd07df2b1
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


[I 2025-05-12 15:40:18,956] Trial 0 finished with value: 0.891695637886899 and parameters: {'n_estimators': 220, 'max_depth': 5, 'learning_rate': 0.048227869579621156}. Best is trial 0 with value: 0.891695637886899.


🏃 View run Trial at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/e07f919d1c474b82a03afe5b390f0438
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


[I 2025-05-12 15:40:37,513] Trial 1 finished with value: 0.8478326713134429 and parameters: {'n_estimators': 190, 'max_depth': 10, 'learning_rate': 0.016996154121226105}. Best is trial 0 with value: 0.891695637886899.


🏃 View run Trial at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/f7f6e09350484261ace986ac6e71d5d2
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


[I 2025-05-12 15:40:48,246] Trial 2 finished with value: 0.9480459520175216 and parameters: {'n_estimators': 296, 'max_depth': 9, 'learning_rate': 0.21584120729946096}. Best is trial 2 with value: 0.9480459520175216.


🏃 View run Trial at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/47c5c07f851e438d8171617ae4906aae
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


[I 2025-05-12 15:40:54,298] Trial 3 finished with value: 0.9434061482928587 and parameters: {'n_estimators': 142, 'max_depth': 9, 'learning_rate': 0.26969484681190603}. Best is trial 2 with value: 0.9480459520175216.


🏃 View run Trial at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/ae10c80589ab408eb7945de3f42469ae
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


[I 2025-05-12 15:40:59,470] Trial 4 finished with value: 0.8807693009240364 and parameters: {'n_estimators': 206, 'max_depth': 5, 'learning_rate': 0.046245484576534795}. Best is trial 2 with value: 0.9480459520175216.


🏃 View run Trial at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/7cfcf31149f74648bcc242f6d44445a1
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


[W 2025-05-12 15:41:05,114] Trial 5 failed with parameters: {'n_estimators': 154, 'max_depth': 8, 'learning_rate': 0.29392782351547636} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Javith Naseem\AppData\Local\Temp\ipykernel_21276\1395279945.py", line 84, in objective
    mlflow.log_metric("cv_score", mean_score)
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\mlflow\tracking\fluent.py", line 923, in log_metric
    return MlflowClient().log_metric(
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\mlflow\tracking\client.py", line 1972, in log_metric
    return self._tracking_client.log_metric(
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\mlflow\tracking\_tracking_service\client.py", line 672, in l

🏃 View run XGBoost_HPO at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0/runs/08474fcf3ca54cb197b9e81255aa2da9
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/FraudGuard-End-to-End-Bank-Transaction-Fraud-Detection.mlflow/#/experiments/0


KeyboardInterrupt: 