In [1]:
import os

In [2]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection\\Exp'

In [3]:
# Move the working directory one level up (to the parent of the notebook folder).
# NOTE(review): this is a relative move, so it is not idempotent — re-running
# this cell without restarting the kernel climbs one more directory each time.
os.chdir("../")

In [4]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. Field order is part of the
    constructor signature and must not be rearranged.
    """

    # --- locations -------------------------------------------------------
    root_dir: Path              # directory the trained model is written into
    train_preprocess: Path      # preprocessed training array file
    test_preprocess: Path       # preprocessed test array file
    model_name: str             # file name of the saved model inside root_dir

    # --- data ------------------------------------------------------------
    target_column: str          # name of the label column taken from the schema

    # --- hyperparameter search --------------------------------------------
    random_search_params: dict  # parameter distributions to sample from
    n_iter: int                 # number of sampled parameter settings
    cv_folds: int               # number of cross-validation folds
    scoring: str                # scoring metric name for the search
    n_jobs: int                 # parallel jobs for the search

In [7]:
from project.constants import *
from project.utils.common import *

In [8]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds stage-specific config objects."""

    def __init__(self,
                 config_filepath=CONFIG_PATH,
                 schema_filepath=SCHEMA_PATH,
                 params_filepath=PARAMS_PATH):
        """Load the config, schema and params files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            schema_filepath: Location of the schema YAML.
            params_filepath: Location of the hyperparameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root must exist before any stage writes below it.
        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` from the loaded YAML sections.

        Creates the trainer's output directory as a side effect.

        Returns:
            ModelTrainerConfig: paths from ``config.model_trainer``, the target
            column name from the schema, and the search / cross-validation
            settings from ``params.XGBClassifier``.
        """
        trainer_section = self.config.model_trainer
        xgb_section = self.params.XGBClassifier
        schema_section = self.schema
        # Resolve both parameter sub-sections up front so a missing one fails
        # before any directory is created.
        search_space = xgb_section.random_search
        cv_section = xgb_section.cross_validation

        create_directories([trainer_section.root_dir])

        # Keys are looked up in the same order as the keyword arguments below.
        settings = {
            "root_dir": trainer_section.root_dir,
            "train_preprocess": trainer_section.train_preprocess,
            "test_preprocess": trainer_section.test_preprocess,
            "model_name": trainer_section.model_name,
            "target_column": schema_section.target_column.name,
            "random_search_params": search_space,
            "cv_folds": cv_section.cv_folds,
            "scoring": cv_section.scoring,
            "n_jobs": cv_section.n_jobs,
            "n_iter": cv_section.n_iter,
        }
        return ModelTrainerConfig(**settings)

In [9]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [10]:
import os
import numpy as np
import joblib
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from project import logger
import mlflow
import mlflow.xgboost
from project.entity.config_entity import ModelTrainerConfig


class ModelTrainer:
    """Tunes an XGBClassifier with RandomizedSearchCV and records the search in MLflow."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep a reference to the training configuration.

        Args:
            config: Input/output paths plus the search space and
                cross-validation settings used by :meth:`train`.
        """
        self.config = config

    def train(self):
        """Run the randomized hyperparameter search and persist the result.

        Steps:
            1. Point MLflow at the local tracking server.
            2. Load the preprocessed train/test arrays; the last column is
               used as the label, all preceding columns as features.
            3. Fit ``RandomizedSearchCV`` on the training split using the
               scoring metric from the configuration.
            4. Log every trial as a nested MLflow run, then the best
               parameters/score and the best model on the parent run.
            5. Dump the fitted search object to
               ``<root_dir>/<model_name>`` with joblib.

        Raises:
            FileNotFoundError: If either preprocessed file does not exist.
        """
        # Set MLflow tracking URI
        mlflow.set_tracking_uri("http://127.0.0.1:5000/")
        print("Tracking URI:", mlflow.get_tracking_uri())

        # Validate file paths
        if not os.path.exists(self.config.train_preprocess):
            logger.error(f"Train preprocessed file not found at: {self.config.train_preprocess}")
            raise FileNotFoundError("Train preprocessed file not found")
        if not os.path.exists(self.config.test_preprocess):
            logger.error(f"Test preprocessed file not found at: {self.config.test_preprocess}")
            raise FileNotFoundError("Test preprocessed file not found")

        # Load preprocessed data.
        # SECURITY NOTE: allow_pickle=True can execute arbitrary code when the
        # file is untrusted; only load artifacts produced by this pipeline.
        train_data = np.load(self.config.train_preprocess, allow_pickle=True)
        test_data = np.load(self.config.test_preprocess, allow_pickle=True)

        logger.info('Loaded train and test data')
        logger.info(f'Train data shape: {train_data.shape}')
        logger.info(f'Test data shape: {test_data.shape}')

        # Last column is the target, everything before it is a feature.
        train_x = train_data[:, :-1]
        train_y = train_data[:, -1]
        test_x = test_data[:, :-1]
        test_y = test_data[:, -1]

        logger.info(f"Training data shape: X={train_x.shape}, y={train_y.shape}")
        logger.info(f"Testing data shape: X={test_x.shape}, y={test_y.shape}")

        # Use the metric requested in the configuration instead of a
        # hard-coded one; fall back to accuracy when none is configured.
        scoring = self.config.scoring or 'accuracy'

        # Initialize MLflow experiment
        mlflow.set_experiment("Credit-Fraud-Detection")
        mlflow.xgboost.autolog()
        with mlflow.start_run(run_name="RandomizedSearchCV_Tuning"):
            mlflow.set_tag("run_type", "hyperparameter_tuning")
            mlflow.set_tag("model", "XGBClassifier")

            logger.info('Initializing Randomized Search')

            xgb_model = XGBClassifier(
                objective='binary:logistic',
                verbosity=0,
                eval_metric='logloss'
            )

            param_dist = self.config.random_search_params

            logger.info('>>>>>>>>>> ......Performing Randomized Search - this may take some time...... <<<<<<<<<')

            random_search = RandomizedSearchCV(
                estimator=xgb_model,
                param_distributions=param_dist,
                n_iter=self.config.n_iter,
                cv=self.config.cv_folds,
                scoring=scoring,
                verbose=1,
                n_jobs=self.config.n_jobs,
                return_train_score=True
            )
            random_search.fit(train_x, train_y)

            # One nested run per sampled parameter setting. A candidate whose
            # fit failed in any fold carries a NaN mean/std score.
            cv_results = random_search.cv_results_
            trials = zip(
                cv_results["params"],
                cv_results["mean_test_score"],
                cv_results["std_test_score"],
            )
            for trial_no, (params, mean_score, std_score) in enumerate(trials, start=1):
                with mlflow.start_run(nested=True, run_name=f"Trial_{trial_no}"):
                    mlflow.set_tag("trial_number", trial_no)
                    mlflow.log_params(params)
                    # Metric names follow the scoring actually used.
                    mlflow.log_metric(f"mean_{scoring}", mean_score)
                    mlflow.log_metric(f"std_{scoring}", std_score)
                    logger.info(
                        f"Trial {trial_no}: params={params}, "
                        f"mean_{scoring}={mean_score:.4f}, std_{scoring}={std_score:.4f}"
                    )

            best_params = random_search.best_params_
            best_score = random_search.best_score_
            mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
            mlflow.log_metric(f"best_{scoring}", best_score)
            logger.info(f"Best parameters: {best_params}")
            logger.info(f"Best {scoring}: {best_score:.4f}")

            best_model = random_search.best_estimator_
            mlflow.xgboost.log_model(
                xgb_model=best_model,
                artifact_path="xgboost_model",
                registered_model_name="XGBClassifier_CreditFraud"
            )
            logger.info("Best model logged to MLflow")

            # NOTE: the whole fitted search object (not only best_estimator_)
            # is persisted locally.
            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            joblib.dump(random_search, model_path)
            logger.info(f'Model saved locally at {model_path}')

In [11]:
# Build the configuration, construct the trainer and run the search.
# Failures are logged rather than propagated so the notebook keeps running;
# logger.exception records the message at ERROR level together with the full
# traceback, which logger.error alone would discard.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_training_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except FileNotFoundError as e:
    # Missing preprocessed arrays or YAML files.
    logger.exception(f'File Not Found: {e}')
except KeyError as e:
    logger.exception(f'Key Error: {e}')
except AttributeError as e:
    logger.exception(f'Attribute Error: {e}')
except Exception as e:
    # Anything else (tracking server unreachable, fit errors, ...).
    logger.exception(f'Unexpected error: {e}')

[2025-04-13 18:19:48,546: INFO: common: yaml file: yaml file\config.yaml loaded successfully]
[2025-04-13 18:19:48,554: INFO: common: yaml file: yaml file\schema.yaml loaded successfully]
[2025-04-13 18:19:48,585: INFO: common: yaml file: yaml file\params.yaml loaded successfully]
[2025-04-13 18:19:48,591: INFO: common: created directory at: artifacts]
[2025-04-13 18:19:48,598: INFO: common: created directory at: artifacts/model_trainer]
Tracking URI: http://127.0.0.1:5000/
[2025-04-13 18:19:48,615: INFO: 188903384: Loaded train and test data]
[2025-04-13 18:19:48,620: INFO: 188903384: Train data shape: (69834, 10)]
[2025-04-13 18:19:48,625: INFO: 188903384: Test data shape: (17459, 10)]
[2025-04-13 18:19:48,630: INFO: 188903384: Training data shape: X=(69834, 9), y=(69834,)]
[2025-04-13 18:19:48,631: INFO: 188903384: Testing data shape: X=(17459, 9), y=(17459,)]


2025/04/13 18:19:48 INFO mlflow.tracking.fluent: Experiment with name 'Credit-Fraud-Detection' does not exist. Creating a new experiment.


[2025-04-13 18:19:49,416: INFO: 188903384: Initializing Randomized Search]
[2025-04-13 18:19:49,418: INFO: 188903384: >>>>>>>>>> ......Performing Randomized Search - this may take some time...... <<<<<<<<<]
Fitting 5 folds for each of 100 candidates, totalling 500 fits


4 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\xgboost\sklearn.py", line 1580, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\xgboost\sklearn.py", line 603,

[2025-04-13 18:24:45,772: INFO: 188903384: Trial 1: params={'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 7, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 1.0}, mean_accuracy=nan, std_accuracy=nan]
🏃 View run Trial_1 at: http://127.0.0.1:5000/#/experiments/169766163993984911/runs/edd93ba72b71437fb731e2ef7e4d75d6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169766163993984911
[2025-04-13 18:24:45,956: INFO: 188903384: Trial 2: params={'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 0.9}, mean_accuracy=nan, std_accuracy=nan]
🏃 View run Trial_2 at: http://127.0.0.1:5000/#/experiments/169766163993984911/runs/21fd06c8f01442aa9098ead6e8344bef
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169766163993984911
[2025-04-13 18:24:46,123: INFO: 188903384: Trial 3: params={'subsam

Successfully registered model 'XGBClassifier_CreditFraud'.
2025/04/13 18:25:09 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBClassifier_CreditFraud, version 1


[2025-04-13 18:25:09,866: INFO: 188903384: Best model logged to MLflow]
[2025-04-13 18:25:09,914: INFO: 188903384: Model saved locally at artifacts/model_trainer\model.joblib]


Created version '1' of model 'XGBClassifier_CreditFraud'.


🏃 View run RandomizedSearchCV_Tuning at: http://127.0.0.1:5000/#/experiments/169766163993984911/runs/6e766d28e1af4feb8a2fdc4a95127db1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/169766163993984911
