In [1]:
import os

In [2]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection\\Exp'

In [3]:
# Step up one level from the notebook's folder so the working directory is its parent
# (the recorded cell outputs show the path changing from ...\Exp to the directory above it).
# The move is relative: running this cell a second time climbs one directory further.
os.chdir(os.pardir)

In [4]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    The field order below is the positional constructor order and must not
    be rearranged. Instances are frozen: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.
    """

    # --- filesystem locations ---
    root_dir: Path            # output directory of the training stage
    train_preprocess: Path    # preprocessed training array, loaded with np.load
    test_preprocess: Path     # preprocessed test array
    model_name: str           # file name of the saved model inside root_dir

    # --- data schema ---
    target_column: str        # name of the label column taken from the schema file

    # --- hyper-parameter search / cross-validation ---
    n_iter: int               # number of Optuna trials per candidate model
    cv_folds: int             # number of stratified CV splits
    scoring: str              # scorer name passed to cross_val_score
    n_jobs: int               # parallelism passed to cross_val_score

In [7]:
from project.constants import *
from project.utils.common import *

In [8]:
import os
import joblib   
import numpy as np
import dagshub
from project import logger
from project.entity.config_entity import ModelTrainerConfig

class ConfigurationManager:
    """Reads the project's YAML files and assembles stage-specific config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_PATH,
        schema_filepath=SCHEMA_PATH,
        params_filepath=PARAMS_PATH,
    ):
        """Load the config, schema and params files, then create the artifacts root.

        Args:
            config_filepath: YAML file exposing ``artifacts_root`` and ``model_trainer``.
            schema_filepath: YAML file exposing ``target_column.name``.
            params_filepath: YAML file exposing ``cross_validation``.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("schema", schema_filepath),
            ("params", params_filepath),
        )
        # Files are read in this fixed order; each result lands on self.<attribute>.
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths from the ``model_trainer`` section, the
            target column name from the schema, and the cross-validation
            settings from the params file.
        """
        trainer_section = self.config.model_trainer
        schema_section = self.schema
        cv_section = self.params.cross_validation

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            # paths and file names
            root_dir=trainer_section.root_dir,
            train_preprocess=trainer_section.train_preprocess,
            test_preprocess=trainer_section.test_preprocess,
            model_name=trainer_section.model_name,
            # label column
            target_column=schema_section.target_column.name,
            # search / cross-validation knobs
            cv_folds=cv_section.cv_folds,
            scoring=cv_section.scoring,
            n_jobs=cv_section.n_jobs,
            n_iter=cv_section.n_iter,
        )

In [9]:
import os
import json
import joblib
import mlflow
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import  accuracy_score, precision_score, recall_score,f1_score, roc_curve, auc, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
import optuna
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from project import logger
from project.utils.common import save_json, save_bin
from project.entity.config_entity import ModelTrainerConfig

class ModelTrainer:
    """Tunes several classifiers with Optuna, tracks every run in MLflow and saves the winner.

    For each candidate in ``self.models`` an Optuna study maximises the
    cross-validated ``config.scoring`` metric. The best candidate overall is
    refitted on the full training array, logged/registered in MLflow and
    written to ``config.root_dir``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the config, point MLflow at the DagsHub tracking server and
        declare the candidate models with their Optuna search spaces.

        Args:
            config: training-stage settings (paths, CV and search parameters).
        """
        self.config = config
        # NOTE(review): the tracking destination is hard-coded; constructing this
        # class contacts DagsHub/MLflow.
        dagshub.init(repo_owner="JavithNaseem-J", repo_name="E2E-Bank-Fraud-Detection")
        mlflow.set_tracking_uri("https://dagshub.com/JavithNaseem-J/E2E-Bank-Fraud-Detection.mlflow")
        mlflow.set_experiment("Fraud-Detection")

        # Each entry: estimator class, a callable mapping an Optuna trial to
        # constructor kwargs, and the matching MLflow flavor module.
        self.models = {
            "XGBoost": {
                "class": XGBClassifier,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 3, 10),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                },
                "mlflow_module": mlflow.xgboost,
            },
            "RandomForest": {
                "class": RandomForestClassifier,
                "search_space": lambda trial: {
                    "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                    "max_depth": trial.suggest_int("max_depth", 5, 20),
                },
                "mlflow_module": mlflow.sklearn,
            },
            "LogisticRegression": {
                "class": LogisticRegression,
                "search_space": lambda trial: {
                    "C": trial.suggest_float("C", 0.01, 10.0),
                    "max_iter": trial.suggest_int("max_iter", 100, 500),
                },
                "mlflow_module": mlflow.sklearn,
            },
        }

    def _tune_model(self, model_name, model_info, train_x, train_y):
        """Run one Optuna study for a single candidate model.

        Must be called inside an active MLflow run: every trial is logged as a
        nested child run.

        Args:
            model_name: key of the candidate in ``self.models`` (used for run names/tags).
            model_info: the candidate's entry from ``self.models``.
            train_x: feature matrix.
            train_y: label vector.

        Returns:
            tuple: ``(best_params, best_score, best_std)`` where ``best_std`` is
            the fold-score standard deviation of the best trial itself.
        """
        def objective(trial):
            params = model_info["search_space"](trial)
            model = model_info["class"](**params)
            cv = StratifiedKFold(n_splits=self.config.cv_folds, shuffle=True, random_state=42)
            scores = cross_val_score(
                model, train_x, train_y,
                scoring=self.config.scoring,
                cv=cv,
                n_jobs=self.config.n_jobs
            )
            mean_score = float(scores.mean())
            std_score = float(scores.std())

            # Keep the spread on the trial so the best trial's std can be read
            # back later without re-running the whole cross-validation.
            trial.set_user_attr("cv_std", std_score)

            with mlflow.start_run(run_name=f"{model_name}_trial_{trial.number}", nested=True):
                mlflow.log_params(params)
                mlflow.log_metric("cv_score", mean_score)
                mlflow.log_metric("cv_std", std_score)
                mlflow.set_tags({"model_name": model_name, "stage": "trial"})
            return mean_score

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=self.config.n_iter)

        return study.best_params, study.best_value, study.best_trial.user_attrs["cv_std"]

    def train(self):
        """Tune every candidate, refit the best one and persist/log it.

        Side effects: creates MLflow runs (one per candidate, one nested run per
        trial, one final run), registers the final model, writes the model file
        and ``best_model_info.json`` under ``config.root_dir``.

        Returns:
            dict: ``{"model_name", "score", "std", "params"}`` of the winning candidate.
        """
        # NOTE(review): allow_pickle=True can execute arbitrary code if the .npy file
        # is untrusted; acceptable only for artifacts produced by this project.
        train_data = np.load(self.config.train_preprocess, allow_pickle=True)

        # Last column is the label, everything before it is a feature.
        train_x = train_data[:, :-1]
        train_y = train_data[:, -1]

        # Start from -inf (not 0) so a winner is still selected when the scorer
        # is non-positive, e.g. "neg_log_loss".
        best_overall = {"model_name": None, "score": float("-inf"), "std": 0, "params": None}

        for model_name, model_info in self.models.items():
            logger.info(f"Starting HPO for: {model_name}")

            with mlflow.start_run(run_name=f"{model_name}_HPO", nested=False):
                best_params, best_score, best_std = self._tune_model(
                    model_name, model_info, train_x, train_y
                )

                mlflow.log_params(best_params)
                mlflow.log_metric("best_cv_score", best_score)
                mlflow.log_metric("best_cv_std", best_std)
                mlflow.set_tag("best_model_candidate", "true")

                if best_score > best_overall["score"]:
                    best_overall.update({
                        "model_name": model_name,
                        "score": best_score,
                        "std": best_std,
                        "params": best_params
                    })

        # Train final best model and log it separately
        best_model_class = self.models[best_overall["model_name"]]["class"]
        best_model = best_model_class(**best_overall["params"])
        best_model.fit(train_x, train_y)

        with mlflow.start_run(run_name=f"{best_overall['model_name']}_final"):
            mlflow.log_params(best_overall["params"])
            mlflow.log_metric("best_cv_score", best_overall["score"])
            mlflow.log_metric("best_cv_std", best_overall["std"])
            mlflow.set_tags({"model_name": best_overall["model_name"], "stage": "final"})

            mlflow.sklearn.log_model(
                best_model,
                artifact_path="model",
                registered_model_name=f"{best_overall['model_name']}_Model"
            )

            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            save_bin(data=best_model, path=Path(model_path))

            # Write the summary before uploading it; the file does not exist otherwise
            # and log_artifact would fail on a missing path.
            best_model_info_path = os.path.join(self.config.root_dir, "best_model_info.json")
            with open(best_model_info_path, "w") as info_file:
                json.dump(best_overall, info_file, indent=4)
            mlflow.log_artifact(best_model_info_path)

        logger.info(f"Best model overall: {best_overall}")
        return best_overall


In [10]:
# Run the training stage end to end: load configuration, build the trainer, train.
# Each handler logs the failure together with its traceback and then re-raises,
# so a failed run is visible as a failed cell instead of completing quietly.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_training_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except FileNotFoundError as e:
    logger.exception(f'File Not Found: {e}')
    raise
except KeyError as e:
    logger.exception(f'Key Error: {e}')
    raise
except AttributeError as e:
    logger.exception(f'Attribute Error: {e}')
    raise
except Exception as e:
    logger.exception(f'Unexpected error: {e}')
    raise

[2025-05-11 22:38:13,431: INFO: common: yaml file: config_file\config.yaml loaded successfully]


[2025-05-11 22:38:13,448: INFO: common: yaml file: config_file\schema.yaml loaded successfully]
[2025-05-11 22:38:13,453: INFO: common: yaml file: config_file\params.yaml loaded successfully]
[2025-05-11 22:38:13,456: INFO: common: created directory at: artifacts]
[2025-05-11 22:38:13,458: INFO: common: created directory at: artifacts/model_trainer]


[2025-05-11 22:38:14,228: INFO: helpers: Repository initialized!]


2025/05/11 22:38:15 INFO mlflow.tracking.fluent: Experiment with name 'Fraud-Detection' does not exist. Creating a new experiment.


[2025-05-11 22:38:15,712: INFO: 449458687: Starting HPO for: XGBoost]


[I 2025-05-11 22:38:16,147] A new study created in memory with name: no-name-9468a6bf-bcbf-4417-8886-df76d4d2780f
[W 2025-05-11 22:38:21,845] Trial 0 failed with parameters: {'n_estimators': 129, 'max_depth': 3, 'learning_rate': 0.09674916620029193} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Javith Naseem\AppData\Local\Temp\ipykernel_16300\449458687.py", line 72, in objective
    scores = cross_val_score(
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\sklearn\utils\_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "f:\ProgramFiles\anaconda3\envs\fraud-detection\lib\site-packages\sklearn\model_selection\_validation.py", line 684, in cross_val_score
    cv_results = cross_validate(
  File "f:\ProgramFiles\anacon

🏃 View run XGBoost_HPO at: https://dagshub.com/JavithNaseem-J/E2E-Bank-Fraud-Detection.mlflow/#/experiments/0/runs/bea61006e3f84500bdd28d4946e47f13
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Bank-Fraud-Detection.mlflow/#/experiments/0


KeyboardInterrupt: 