In [1]:
import os

In [2]:
%pwd

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection\\Exp'

In [3]:
# Move the working directory up one level. The recorded cell outputs show this
# taking the kernel from the `Exp` folder to the project folder.
# NOTE(review): not idempotent - each re-run of this cell climbs one more level,
# so run it exactly once per kernel session.
os.chdir("../")

In [4]:
%pwd

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are frozen: assigning to a field after construction raises.
    The field order below is also the positional-argument order of the
    generated constructor, so it must not be rearranged.
    """

    # --- filesystem locations ---
    root_dir: Path              # output directory of the training stage
    train_preprocess: Path      # preprocessed training array file
    test_preprocess: Path       # preprocessed test array file
    model_name: str             # file name of the saved artifact inside root_dir

    # --- data schema ---
    target_column: str          # name of the target column

    # --- hyperparameter search ---
    random_search_params: dict  # parameter distributions to sample from
    n_iter: int                 # number of sampled candidates
    cv_folds: int               # number of cross-validation folds
    scoring: str                # scoring metric name for the search
    n_jobs: int                 # parallelism setting for the search

In [7]:
from project.constants import *
from project.utils.common import *

In [8]:
import os
import joblib   
import numpy as np
import mlflow
import mlflow.xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import dagshub
from project import logger
from project.entity.config_entity import ModelTrainerConfig

class ConfigurationManager:
    """Reads the project's YAML files and builds stage config objects from them."""

    def __init__(
        self,
        config_filepath=CONFIG_PATH,
        schema_filepath=SCHEMA_PATH,
        params_filepath=PARAMS_PATH,
    ):
        """Load the config, schema and params YAML files, then create the artifacts root.

        Args:
            config_filepath: Path of the main config YAML.
            schema_filepath: Path of the schema YAML.
            params_filepath: Path of the hyperparameter YAML.
        """
        yaml_sources = (config_filepath, schema_filepath, params_filepath)
        # Read in config -> schema -> params order, one read_yaml call each.
        self.config, self.schema, self.params = [
            read_yaml(source) for source in yaml_sources
        ]

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_training_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Paths and the model name come from the ``model_trainer`` section of the
        config file, the target column name from the schema, and the search
        space plus cross-validation settings from the ``XGBClassifier`` section
        of the params file. The stage's ``root_dir`` is created as a side effect.

        Returns:
            ModelTrainerConfig: the populated configuration object.
        """
        trainer_cfg = self.config.model_trainer
        xgb_params = self.params.XGBClassifier
        search_space = xgb_params.random_search
        cv_settings = xgb_params.cross_validation

        # Ensure the stage's output directory exists before the config is handed out.
        create_directories([trainer_cfg.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_cfg.root_dir,
            train_preprocess=trainer_cfg.train_preprocess,
            test_preprocess=trainer_cfg.test_preprocess,
            model_name=trainer_cfg.model_name,
            target_column=self.schema.target_column.name,
            random_search_params=search_space,
            cv_folds=cv_settings.cv_folds,
            scoring=cv_settings.scoring,
            n_jobs=cv_settings.n_jobs,
            n_iter=cv_settings.n_iter,
        )

In [9]:
class ModelTrainer:
    """Tunes an XGBoost classifier with randomized search and tracks it in MLflow."""

    def __init__(self, config: "ModelTrainerConfig"):
        """Store the config and point MLflow at the DagsHub-hosted tracking server.

        Args:
            config: Training-stage settings (data paths, search space, CV options).
        """
        self.config = config
        dagshub.init(repo_owner="JavithNaseem-J", repo_name="E2E-Credit-Fraud-Detection")
        mlflow.set_tracking_uri("https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow")
        mlflow.set_experiment("E2E-Credit-Fraud-Detection")

    @staticmethod
    def _require_file(path, label):
        """Raise FileNotFoundError (after logging) when ``path`` does not exist."""
        if not os.path.exists(path):
            logger.error(f"{label} preprocessed file not found at: {path}")
            raise FileNotFoundError(f"{label} preprocessed file not found")

    @staticmethod
    def _trial_metrics(scoring, mean_score, std_score):
        """Build the metric dict for one trial, leaving out NaN values.

        scikit-learn reports NaN for a candidate when any of its fits failed;
        such values carry no ranking information, so they are dropped here.
        An empty dict means the trial produced no usable score.
        """
        candidates = {
            f"mean_{scoring}": float(mean_score),
            f"std_{scoring}": float(std_score),
        }
        return {name: value for name, value in candidates.items() if not np.isnan(value)}

    def _log_trials(self, cv_results, scoring):
        """Log every sampled candidate as a nested MLflow run."""
        trials = zip(
            cv_results["params"],
            cv_results["mean_test_score"],
            cv_results["std_test_score"],
        )
        for number, (params, mean_score, std_score) in enumerate(trials, start=1):
            with mlflow.start_run(nested=True, run_name=f"Trial_{number}"):
                mlflow.set_tag("trial_number", number)
                mlflow.log_params(params)

                metrics = self._trial_metrics(scoring, mean_score, std_score)
                for name, value in metrics.items():
                    mlflow.log_metric(name, value)
                if not metrics:
                    # Mark candidates whose fits failed instead of logging NaN metrics.
                    mlflow.set_tag("trial_status", "failed")

                logger.info(
                    f"Trial {number}: params={params}, "
                    f"mean_{scoring}={mean_score:.4f}, std_{scoring}={std_score:.4f}"
                )

    def train(self):
        """Run the randomized hyperparameter search and persist the result.

        Loads the preprocessed train/test arrays, treats the last column of the
        training array as the target, runs ``RandomizedSearchCV`` over the
        configured search space using the configured scoring metric, logs each
        candidate as a nested MLflow run, registers the best estimator in
        MLflow, and dumps the fitted search object to
        ``<root_dir>/<model_name>`` with joblib.

        Raises:
            FileNotFoundError: If either preprocessed file is missing.
        """
        # Validate both paths before loading anything.
        self._require_file(self.config.train_preprocess, "Train")
        self._require_file(self.config.test_preprocess, "Test")

        # NOTE(security): allow_pickle=True can execute arbitrary code while
        # loading; only use it on files produced by this pipeline.
        train_data = np.load(self.config.train_preprocess, allow_pickle=True)
        # The test array is not used for fitting; it is loaded to confirm it is
        # readable and to report its shape.
        test_data = np.load(self.config.test_preprocess, allow_pickle=True)

        logger.info('Loaded train and test data')
        logger.info(f'Train data shape: {train_data.shape}')
        logger.info(f'Test data shape: {test_data.shape}')

        # Last column is the target, everything before it is the feature matrix.
        train_x = train_data[:, :-1]
        train_y = train_data[:, -1]

        # Honour the configured metric instead of a hard-coded 'accuracy'.
        scoring = self.config.scoring

        mlflow.xgboost.autolog()
        with mlflow.start_run(run_name="RandomizedSearchCV_Tuning"):
            mlflow.set_tag("run_type", "hyperparameter_tuning")
            mlflow.set_tag("model", "XGBClassifier")

            logger.info('Initializing Randomized Search')

            xgb_model = XGBClassifier(
                objective='binary:logistic',
                verbosity=0,
                eval_metric='logloss'
            )

            logger.info('>>>>>>>>>> ......Performing Randomized Search - this may take some time...... <<<<<<<<<')

            random_search = RandomizedSearchCV(
                estimator=xgb_model,
                param_distributions=self.config.random_search_params,
                n_iter=self.config.n_iter,
                cv=self.config.cv_folds,
                scoring=scoring,
                verbose=1,
                n_jobs=self.config.n_jobs,
                return_train_score=True
            )
            random_search.fit(train_x, train_y)

            self._log_trials(random_search.cv_results_, scoring)

            best_model = random_search.best_estimator_
            mlflow.xgboost.log_model(
                xgb_model=best_model,
                artifact_path="xgboost_model",
                registered_model_name="XGBClassifier_CreditFraud"
            )
            logger.info("Best model logged to MLflow")

            # The whole fitted search object (not only the best estimator) is
            # persisted; kept as-is so existing consumers of the artifact still work.
            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            joblib.dump(random_search, model_path)
            logger.info(f'Model saved locally at {model_path}')

In [10]:
# Driver cell: build the configuration, construct the trainer and run training.
# Failures are logged with their traceback and then re-raised, so a failed
# training run stops the notebook instead of looking like a success.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_training_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except FileNotFoundError as e:
    # Raised by the trainer when a preprocessed input file is missing.
    logger.exception(f'File Not Found: {e}')
    raise
except KeyError as e:
    logger.exception(f'Key Error: {e}')
    raise
except AttributeError as e:
    logger.exception(f'Attribute Error: {e}')
    raise
except Exception as e:
    logger.exception(f'Unexpected error: {e}')
    raise

[2025-04-21 17:44:34,890: INFO: common: yaml file: yaml file\config.yaml loaded successfully]
[2025-04-21 17:44:34,893: INFO: common: yaml file: yaml file\schema.yaml loaded successfully]
[2025-04-21 17:44:34,902: INFO: common: yaml file: yaml file\params.yaml loaded successfully]
[2025-04-21 17:44:34,905: INFO: common: created directory at: artifacts]
[2025-04-21 17:44:34,906: INFO: common: created directory at: artifacts/model_trainer]
[2025-04-21 17:44:35,313: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-04-21 17:44:35,328: INFO: helpers: Accessing as JavithNaseem-J]
[2025-04-21 17:44:35,681: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/JavithNaseem-J/E2E-Credit-Fraud-Detection "HTTP/1.1 200 OK"]
[2025-04-21 17:44:36,005: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-04-21 17:44:36,012: INFO: helpers: Initialized MLflow to track repo "JavithNaseem-J/E2E-Credit-Fraud-Detection"]


[2025-04-21 17:44:36,017: INFO: helpers: Repository JavithNaseem-J/E2E-Credit-Fraud-Detection initialized!]
[2025-04-21 17:44:36,480: INFO: 744370784: Loaded train and test data]
[2025-04-21 17:44:36,481: INFO: 744370784: Train data shape: (69840, 10)]
[2025-04-21 17:44:36,482: INFO: 744370784: Test data shape: (17460, 10)]
[2025-04-21 17:44:37,801: INFO: 744370784: Initializing Randomized Search]
[2025-04-21 17:44:37,801: INFO: 744370784: >>>>>>>>>> ......Performing Randomized Search - this may take some time...... <<<<<<<<<]
Fitting 5 folds for each of 10 candidates, totalling 50 fits


1 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\xgboost\sklearn.py", line 1580, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "f:\ProgramFiles\anaconda3\envs\credit-fraud\lib\site-packages\xgboost\sklearn.py", line 603, 

[2025-04-21 17:45:51,903: INFO: 744370784: Trial 1: params={'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 0.5, 'n_estimators': 50, 'min_child_weight': 7, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 0.9}, mean_accuracy=nan, std_accuracy=nan]
🏃 View run Trial_1 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/f04beb8d857d489f9b0566e4af96ea59
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0
[2025-04-21 17:45:54,313: INFO: 744370784: Trial 2: params={'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.7}, mean_accuracy=0.9343, std_accuracy=0.0039]
🏃 View run Trial_2 at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/d3b14ba8b4c640969a72cdc03a005ec9
🧪 View experiment at: https://dagshub.com/JavithNase

Registered model 'XGBClassifier_CreditFraud' already exists. Creating a new version of this model...
2025/04/21 17:47:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBClassifier_CreditFraud, version 4


[2025-04-21 17:47:02,457: INFO: 744370784: Best model logged to MLflow]
[2025-04-21 17:47:02,488: INFO: 744370784: Model saved locally at artifacts/model_trainer\model.joblib]


Created version '4' of model 'XGBClassifier_CreditFraud'.


🏃 View run RandomizedSearchCV_Tuning at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0/runs/10826f683c964054b9c232ab7a461e02
🧪 View experiment at: https://dagshub.com/JavithNaseem-J/E2E-Credit-Fraud-Detection.mlflow/#/experiments/0
