In [1]:
import os

In [2]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection\\Exp'

In [3]:
# Step up from the experiments folder ("Exp", per the working directory shown
# above) to the project root. The guard makes the cell idempotent: running it
# again, or starting the kernel already at the project root, no longer climbs
# one level further each time.
if os.path.basename(os.getcwd()) == "Exp":
    os.chdir("../")

In [4]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Field order is part of the interface (positional construction), so new
    fields must only ever be appended.
    """

    # --- artifact locations -------------------------------------------------
    root_dir: Path              # stage output directory; the model file is written here
    train_preprocess: Path      # preprocessed training array read with np.load
    test_preprocess: Path       # preprocessed test array read with np.load
    model_name: str             # file name joined onto root_dir for the saved model

    # --- data / search description -------------------------------------------
    target_column: str          # taken from schema.target_column.name
    random_search_params: dict  # taken from params.XGBClassifier.random_search

    # --- tuning and cross-validation controls --------------------------------
    n_iter: int                 # number of tuning trials to run
    cv_folds: int               # number of cross-validation splits
    scoring: str                # scorer name handed to cross-validation
    n_jobs: int                 # parallelism handed to cross-validation

In [7]:
from project.constants import *
from project.utils.common import *

In [8]:
import os
import joblib   
import numpy as np
import mlflow
import mlflow.xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import dagshub
from project import logger
from project.entity.config_entity import ModelTrainerConfig

class ConfigurationManager:
    """Reads the project's YAML files and turns them into stage configs.

    The loaded documents are accessed by attribute (``self.config.model_trainer``
    etc.), so ``read_yaml`` is presumed to return an attribute-accessible
    mapping -- confirm against ``project.utils.common``.
    """

    def __init__(self,
                 config_filepath=CONFIG_PATH,
                 schema_filepath=SCHEMA_PATH,
                 params_filepath=PARAMS_PATH):
        """Load config, schema and params (in that order) and create the artifacts root."""
        yaml_sources = (config_filepath, schema_filepath, params_filepath)
        self.config, self.schema, self.params = (read_yaml(src) for src in yaml_sources)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths from ``config.model_trainer``, the target
            column name from the schema, and the search / cross-validation
            settings from ``params.XGBClassifier``.
        """
        trainer_section = self.config.model_trainer
        xgb_section = self.params.XGBClassifier
        schema_doc = self.schema
        search_section = xgb_section.random_search
        cv_section = xgb_section.cross_validation

        create_directories([trainer_section.root_dir])

        # Values are looked up in the same order as the keyword arguments they feed.
        fields = {
            "root_dir": trainer_section.root_dir,
            "train_preprocess": trainer_section.train_preprocess,
            "test_preprocess": trainer_section.test_preprocess,
            "model_name": trainer_section.model_name,
            "target_column": schema_doc.target_column.name,
            "random_search_params": search_section,
            "cv_folds": cv_section.cv_folds,
            "scoring": cv_section.scoring,
            "n_jobs": cv_section.n_jobs,
            "n_iter": cv_section.n_iter,
        }
        return ModelTrainerConfig(**fields)

In [None]:
from sklearn.model_selection import StratifiedKFold
import optuna

class ModelTrainer:
    """Tune an XGBoost classifier with Optuna, then retrain and persist the winner.

    Each Optuna trial is cross-validated on the training array and recorded as
    a nested MLflow run under a single parent run. The best parameter set is
    refit on the full training array, registered in MLflow and dumped to
    ``<config.root_dir>/<config.model_name>`` with joblib.
    """

    # Shared by the CV splitter and the Optuna sampler so a re-run proposes the
    # same trials and scores them on the same folds.
    _SEED = 42

    def __init__(self, config: ModelTrainerConfig):
        """Store ``config`` and point MLflow at the DagsHub tracking server.

        Args:
            config: Training-stage settings (paths, CV and tuning controls).
        """
        self.config = config
        dagshub.init(repo_owner="JavithNaseem-J", repo_name="E2E-Credit-Fraud-Detection")
        mlflow.set_tracking_uri("https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow")
        mlflow.set_experiment("E2E-Credit-Fraud-Detection")

    @staticmethod
    def _require_file(path, label):
        """Log and raise ``FileNotFoundError`` if ``path`` does not exist."""
        if not os.path.exists(path):
            logger.error(f"{label} preprocessed file not found at: {path}")
            raise FileNotFoundError(f"{label} preprocessed file not found")

    @staticmethod
    def _search_space(trial):
        """Sample one XGBoost hyper-parameter set from the Optuna ``trial``."""
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
            "gamma": trial.suggest_float("gamma", 0.0, 0.3),
            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        }

    @staticmethod
    def _build_model(params):
        """Create the classifier used both for trials and for the final refit."""
        return XGBClassifier(
            objective='binary:logistic',
            verbosity=0,
            eval_metric='logloss',
            use_label_encoder=False,
            **params
        )

    def train(self):
        """Run the Optuna study, refit the best model and save it.

        Side effects: creates MLflow runs (one parent, one nested run per
        trial), registers the refit model in the MLflow model registry and
        writes it to ``<root_dir>/<model_name>``.

        Raises:
            FileNotFoundError: if the train or test preprocessed file is missing.
        """
        # Imported locally so this class does not rely on the name leaking in
        # through one of the notebook's wildcard imports.
        from sklearn.model_selection import cross_val_score

        # Validate file paths (train first, then test)
        self._require_file(self.config.train_preprocess, "Train")
        self._require_file(self.config.test_preprocess, "Test")

        # Load preprocessed data.
        # SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects,
        # which can execute code. Only load arrays produced by this project's
        # own preprocessing stage.
        train_data = np.load(self.config.train_preprocess, allow_pickle=True)
        test_data = np.load(self.config.test_preprocess, allow_pickle=True)

        logger.info('Loaded train and test data')
        logger.info(f'Train data shape: {train_data.shape}')
        logger.info(f'Test data shape: {test_data.shape}')

        # The last column is used as the label, everything before it as features.
        # The test array is only loaded for the existence/shape check above; it
        # is not used during tuning or the refit.
        train_x = train_data[:, :-1]
        train_y = train_data[:, -1]

        # Optuna Optimization
        mlflow.xgboost.autolog()
        with mlflow.start_run(run_name="Optuna_HPO") as parent_run:
            parent_run_id = parent_run.info.run_id
            mlflow.set_tag("run_type", "hyperparameter_tuning")
            mlflow.set_tag("model", "XGBClassifier")
            # The metric keys below are named "*_accuracy", but the value is
            # whatever scorer the config selects; record it so runs stay
            # interpretable when the scorer is not accuracy.
            mlflow.set_tag("cv_scoring", self.config.scoring)

            def objective(trial):
                """Cross-validate one sampled parameter set; return its mean score."""
                with mlflow.start_run(run_name=f"Trial_{trial.number}", nested=True):
                    mlflow.set_tag("mlflow.parentRunId", parent_run_id)
                    mlflow.set_tag("trial_number", trial.number)

                    params = self._search_space(trial)
                    model = self._build_model(params)

                    cv = StratifiedKFold(
                        n_splits=self.config.cv_folds,
                        shuffle=True,
                        random_state=self._SEED,
                    )
                    cv_scores = cross_val_score(
                        model,
                        train_x,
                        train_y,
                        scoring=self.config.scoring,
                        cv=cv,
                        n_jobs=self.config.n_jobs,
                    )

                    mean_score = cv_scores.mean()
                    std_score = cv_scores.std()

                    mlflow.log_params(params)
                    mlflow.log_metric("cv_mean_accuracy", mean_score)
                    mlflow.log_metric("cv_std_accuracy", std_score)

                    logger.info(f"Trial {trial.number}: cv_mean_accuracy={mean_score:.4f} (+/- {std_score:.4f}), params={params}")

                    return mean_score

            logger.info('>>>>>>>>>> Starting Optuna Study <<<<<<<<<')

            # TPE is Optuna's default sampler; it is created explicitly only so
            # that it can be seeded.
            study = optuna.create_study(
                direction="maximize",
                sampler=optuna.samplers.TPESampler(seed=self._SEED),
            )
            study.optimize(objective, n_trials=self.config.n_iter)

            logger.info(f"Best trial found: {study.best_trial.params} with accuracy {study.best_trial.value:.4f}")

            # Retrain best model on full training data
            best_model = self._build_model(study.best_trial.params)
            best_model.fit(train_x, train_y)

            mlflow.xgboost.log_model(
                xgb_model=best_model,
                artifact_path="xgboost_model",
                registered_model_name="XGBClassifier_CreditFraud_Optuna"
            )
            logger.info("Best model logged to MLflow")

            # Save the model locally
            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            joblib.dump(best_model, model_path)
            logger.info(f'Best model saved locally at {model_path}')


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Run the model-training stage end to end.
# Failures are logged with their traceback and then re-raised: swallowing them
# would let a failed training run look like a successful cell, and any later
# cell would silently work with a missing or stale model artifact.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_training_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except FileNotFoundError as e:
    logger.exception(f'File Not Found: {e}')
    raise
except KeyError as e:
    logger.exception(f'Key Error: {e}')
    raise
except AttributeError as e:
    logger.exception(f'Attribute Error: {e}')
    raise
except Exception as e:
    logger.exception(f'Unexpected error: {e}')
    raise

[2025-04-25 15:59:29,088: INFO: common: yaml file: yaml file\config.yaml loaded successfully]
[2025-04-25 15:59:29,099: INFO: common: yaml file: yaml file\schema.yaml loaded successfully]
[2025-04-25 15:59:29,111: INFO: common: yaml file: yaml file\params.yaml loaded successfully]
[2025-04-25 15:59:29,111: INFO: common: created directory at: artifacts]
[2025-04-25 15:59:29,111: INFO: common: created directory at: artifacts/model_trainer]
[2025-04-25 15:59:29,510: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-04-25 15:59:29,527: INFO: helpers: Accessing as JavithNaseem-J]
[2025-04-25 15:59:30,023: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/JavithNaseem-J/E2E-Credit-Fraud-Detection "HTTP/1.1 200 OK"]
[2025-04-25 15:59:30,434: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-04-25 15:59:30,444: INFO: helpers: Initialized MLflow to track repo "JavithNaseem-J/E2E-Credit-Fraud-Detection"]


[2025-04-25 15:59:30,449: INFO: helpers: Repository JavithNaseem-J/E2E-Credit-Fraud-Detection initialized!]
[2025-04-25 15:59:31,272: INFO: 3253257748: Loaded train and test data]
[2025-04-25 15:59:31,272: INFO: 3253257748: Train data shape: (69840, 10)]
[2025-04-25 15:59:31,280: INFO: 3253257748: Test data shape: (17460, 10)]
[2025-04-25 15:59:32,690: INFO: 3253257748: >>>>>>>>>> Starting Optuna Study <<<<<<<<<]


[I 2025-04-25 15:59:32,690] A new study created in memory with name: no-name-7ee6c9a9-6d24-4b88-9af1-3497690184d8


[2025-04-25 15:59:50,410: INFO: 3253257748: Trial 0: cv_mean_accuracy=0.9445 (+/- 0.0021), params={'n_estimators': 248, 'learning_rate': 0.2539801749589485, 'max_depth': 10, 'min_child_weight': 4, 'gamma': 0.07846900381764957, 'subsample': 0.8293916132116326, 'colsample_bytree': 0.7578800050308805, 'reg_alpha': 0.8378420744910252, 'reg_lambda': 0.8332734755888621}]
🏃 View run Trial_0 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/be77dcad715d435abaa53352290bba9b
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 15:59:51,539] Trial 0 finished with value: 0.9444730813287514 and parameters: {'n_estimators': 248, 'learning_rate': 0.2539801749589485, 'max_depth': 10, 'min_child_weight': 4, 'gamma': 0.07846900381764957, 'subsample': 0.8293916132116326, 'colsample_bytree': 0.7578800050308805, 'reg_alpha': 0.8378420744910252, 'reg_lambda': 0.8332734755888621}. Best is trial 0 with value: 0.9444730813287514.


[2025-04-25 16:00:18,358: INFO: 3253257748: Trial 1: cv_mean_accuracy=0.9319 (+/- 0.0026), params={'n_estimators': 397, 'learning_rate': 0.23837909749084357, 'max_depth': 5, 'min_child_weight': 1, 'gamma': 0.16062737226357746, 'subsample': 0.7710225981587043, 'colsample_bytree': 0.9887389190092412, 'reg_alpha': 0.024932945946752327, 'reg_lambda': 0.45944698550594143}]
🏃 View run Trial_1 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/ecf317a727c24d98b8e0d7a25244c1ef
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:00:19,392] Trial 1 finished with value: 0.9318871706758305 and parameters: {'n_estimators': 397, 'learning_rate': 0.23837909749084357, 'max_depth': 5, 'min_child_weight': 1, 'gamma': 0.16062737226357746, 'subsample': 0.7710225981587043, 'colsample_bytree': 0.9887389190092412, 'reg_alpha': 0.024932945946752327, 'reg_lambda': 0.45944698550594143}. Best is trial 0 with value: 0.9444730813287514.


[2025-04-25 16:00:25,274: INFO: 3253257748: Trial 2: cv_mean_accuracy=0.9349 (+/- 0.0019), params={'n_estimators': 139, 'learning_rate': 0.10601173287165241, 'max_depth': 10, 'min_child_weight': 1, 'gamma': 0.2640750125112365, 'subsample': 0.7210161443027889, 'colsample_bytree': 0.7947776141376546, 'reg_alpha': 0.9683102177637969, 'reg_lambda': 0.5450082795324046}]
🏃 View run Trial_2 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/60322f944d494d468c638baa51db59c9
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:00:26,135] Trial 2 finished with value: 0.9348654066437572 and parameters: {'n_estimators': 139, 'learning_rate': 0.10601173287165241, 'max_depth': 10, 'min_child_weight': 1, 'gamma': 0.2640750125112365, 'subsample': 0.7210161443027889, 'colsample_bytree': 0.7947776141376546, 'reg_alpha': 0.9683102177637969, 'reg_lambda': 0.5450082795324046}. Best is trial 0 with value: 0.9444730813287514.


[2025-04-25 16:00:33,409: INFO: 3253257748: Trial 3: cv_mean_accuracy=0.9452 (+/- 0.0016), params={'n_estimators': 378, 'learning_rate': 0.25229404420160845, 'max_depth': 10, 'min_child_weight': 2, 'gamma': 0.07690507353035689, 'subsample': 0.998873914960003, 'colsample_bytree': 0.9471917148943463, 'reg_alpha': 0.36705792151218763, 'reg_lambda': 0.47540186707119947}]
🏃 View run Trial_3 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/5f9a76d5d03a4c35a232c6e2261c531b
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:00:34,226] Trial 3 finished with value: 0.945189003436426 and parameters: {'n_estimators': 378, 'learning_rate': 0.25229404420160845, 'max_depth': 10, 'min_child_weight': 2, 'gamma': 0.07690507353035689, 'subsample': 0.998873914960003, 'colsample_bytree': 0.9471917148943463, 'reg_alpha': 0.36705792151218763, 'reg_lambda': 0.47540186707119947}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:00:40,578: INFO: 3253257748: Trial 4: cv_mean_accuracy=0.9199 (+/- 0.0019), params={'n_estimators': 216, 'learning_rate': 0.036720848045280266, 'max_depth': 9, 'min_child_weight': 2, 'gamma': 0.18965562012923315, 'subsample': 0.7030797223157366, 'colsample_bytree': 0.9299490091757023, 'reg_alpha': 0.022861105903662304, 'reg_lambda': 0.7667977166921243}]
🏃 View run Trial_4 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/0d5fefcf5f1a4dd88991d6b858e0708d
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:00:41,393] Trial 4 finished with value: 0.9198883161512027 and parameters: {'n_estimators': 216, 'learning_rate': 0.036720848045280266, 'max_depth': 9, 'min_child_weight': 2, 'gamma': 0.18965562012923315, 'subsample': 0.7030797223157366, 'colsample_bytree': 0.9299490091757023, 'reg_alpha': 0.022861105903662304, 'reg_lambda': 0.7667977166921243}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:00:47,026: INFO: 3253257748: Trial 5: cv_mean_accuracy=0.9239 (+/- 0.0034), params={'n_estimators': 249, 'learning_rate': 0.08311372726655901, 'max_depth': 7, 'min_child_weight': 3, 'gamma': 0.2923783829013882, 'subsample': 0.9007838288122968, 'colsample_bytree': 0.9161267183958047, 'reg_alpha': 0.2845225727958941, 'reg_lambda': 0.42482126485769867}]
🏃 View run Trial_5 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/9776d9e79cc34c3791770befb67bbe98
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:00:47,776] Trial 5 finished with value: 0.9238545246277206 and parameters: {'n_estimators': 249, 'learning_rate': 0.08311372726655901, 'max_depth': 7, 'min_child_weight': 3, 'gamma': 0.2923783829013882, 'subsample': 0.9007838288122968, 'colsample_bytree': 0.9161267183958047, 'reg_alpha': 0.2845225727958941, 'reg_lambda': 0.42482126485769867}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:00:56,860: INFO: 3253257748: Trial 6: cv_mean_accuracy=0.9203 (+/- 0.0028), params={'n_estimators': 486, 'learning_rate': 0.030085634502071995, 'max_depth': 8, 'min_child_weight': 7, 'gamma': 0.003014816516820318, 'subsample': 0.875698327374931, 'colsample_bytree': 0.9138334303133131, 'reg_alpha': 0.245145357991975, 'reg_lambda': 0.16975533675788834}]
🏃 View run Trial_6 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/299814319556483fb158c4664e426692
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:00:57,984] Trial 6 finished with value: 0.9203321878579611 and parameters: {'n_estimators': 486, 'learning_rate': 0.030085634502071995, 'max_depth': 8, 'min_child_weight': 7, 'gamma': 0.003014816516820318, 'subsample': 0.875698327374931, 'colsample_bytree': 0.9138334303133131, 'reg_alpha': 0.245145357991975, 'reg_lambda': 0.16975533675788834}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:01:05,047: INFO: 3253257748: Trial 7: cv_mean_accuracy=0.9422 (+/- 0.0018), params={'n_estimators': 281, 'learning_rate': 0.19018490120071282, 'max_depth': 9, 'min_child_weight': 6, 'gamma': 0.19026878637255162, 'subsample': 0.8088655106622599, 'colsample_bytree': 0.7784582461574728, 'reg_alpha': 0.543430548980086, 'reg_lambda': 0.8017865205536976}]
🏃 View run Trial_7 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/329f8ba36e1b43648ef910befed87171
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:01:05,861] Trial 7 finished with value: 0.942225085910653 and parameters: {'n_estimators': 281, 'learning_rate': 0.19018490120071282, 'max_depth': 9, 'min_child_weight': 6, 'gamma': 0.19026878637255162, 'subsample': 0.8088655106622599, 'colsample_bytree': 0.7784582461574728, 'reg_alpha': 0.543430548980086, 'reg_lambda': 0.8017865205536976}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:01:13,654: INFO: 3253257748: Trial 8: cv_mean_accuracy=0.9419 (+/- 0.0011), params={'n_estimators': 336, 'learning_rate': 0.2527827873409818, 'max_depth': 10, 'min_child_weight': 7, 'gamma': 0.2560010331982991, 'subsample': 0.9125237486643982, 'colsample_bytree': 0.8741753416226687, 'reg_alpha': 0.03873694411835471, 'reg_lambda': 0.7614800882309996}]
🏃 View run Trial_8 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/9985dd592855477db0ddebd39aa37825
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:01:14,464] Trial 8 finished with value: 0.9418671248568156 and parameters: {'n_estimators': 336, 'learning_rate': 0.2527827873409818, 'max_depth': 10, 'min_child_weight': 7, 'gamma': 0.2560010331982991, 'subsample': 0.9125237486643982, 'colsample_bytree': 0.8741753416226687, 'reg_alpha': 0.03873694411835471, 'reg_lambda': 0.7614800882309996}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:01:21,844: INFO: 3253257748: Trial 9: cv_mean_accuracy=0.9263 (+/- 0.0031), params={'n_estimators': 426, 'learning_rate': 0.08331645298612107, 'max_depth': 6, 'min_child_weight': 2, 'gamma': 0.297655040234989, 'subsample': 0.9040966446784174, 'colsample_bytree': 0.8626722396587188, 'reg_alpha': 0.7312143447392926, 'reg_lambda': 0.5720630543979316}]
🏃 View run Trial_9 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/6d7c004425664daa85e367fa4f7dfba5
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0


[I 2025-04-25 16:01:22,663] Trial 9 finished with value: 0.9263459335624284 and parameters: {'n_estimators': 426, 'learning_rate': 0.08331645298612107, 'max_depth': 6, 'min_child_weight': 2, 'gamma': 0.297655040234989, 'subsample': 0.9040966446784174, 'colsample_bytree': 0.8626722396587188, 'reg_alpha': 0.7312143447392926, 'reg_lambda': 0.5720630543979316}. Best is trial 3 with value: 0.945189003436426.


[2025-04-25 16:01:22,663: INFO: 3253257748: Best trial found: {'n_estimators': 378, 'learning_rate': 0.25229404420160845, 'max_depth': 10, 'min_child_weight': 2, 'gamma': 0.07690507353035689, 'subsample': 0.998873914960003, 'colsample_bytree': 0.9471917148943463, 'reg_alpha': 0.36705792151218763, 'reg_lambda': 0.47540186707119947} with accuracy 0.9452]


Registered model 'XGBClassifier_CreditFraud_Optuna' already exists. Creating a new version of this model...
2025/04/25 16:02:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBClassifier_CreditFraud_Optuna, version 2


[2025-04-25 16:02:03,113: INFO: 3253257748: Best model logged to MLflow]
[2025-04-25 16:02:03,139: INFO: 3253257748: Best model saved locally at artifacts/model_trainer\model.joblib]


Created version '2' of model 'XGBClassifier_CreditFraud_Optuna'.


🏃 View run Optuna_HPO at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/47d601f413a0451ba4932d0fd348ff7e
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0
