In [1]:
import os

In [2]:
# Show the current working directory. Plain os.getcwd() is used rather than the
# IPython `pwd` automagic: written as a call it only worked on a single-line
# cell, and its "../" argument was silently ignored.
os.getcwd()

'f:\\Files\\DSML\\Condition2Cure\\notebook'

In [3]:
# Step up one directory from the kernel's starting directory so that relative
# paths used later (e.g. the config YAML files) resolve from the parent folder.
# NOTE: this is relative and NOT idempotent - re-running the cell moves up one
# more level each time; restart the kernel before re-running.
os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above. os.getcwd() replaces the
# IPython `pwd` automagic call, whose "../" argument was ignored anyway.
os.getcwd()

'f:\\Files\\DSML\\Condition2Cure'

In [5]:
from pathlib import Path
from dataclasses import dataclass

In [6]:
from Condition2Cure.utils.helpers import *
from Condition2Cure.constants import *
from Condition2Cure.utils.execptions import *

In [7]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    and read by ``ModelTrainer``.

    NOTE(review): the path fields are annotated as ``Path`` but are filled
    directly from the loaded YAML config without conversion, so they may be
    plain strings at runtime - confirm.
    """
    # Output directory of the training stage; created before training and used
    # as the location of best_model_info.json.
    root_dir: Path
    # File loaded with np.load as the feature matrix.
    features_path: Path
    # File loaded with np.load as the label array.
    labels_path: Path
    # Destination of the joblib-serialized trained model.
    model_path: Path
    # Taken from params.vectorizer.max_iter; not read by ModelTrainer in this notebook.
    max_iter: int
    # Taken from params.train_test_split.test_size; not read by ModelTrainer in this notebook.
    test_size: float
    # Seed passed to XGBClassifier and to the StratifiedKFold splitter.
    random_state: int

In [None]:

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Same read order as the three file arguments: config, params, schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths from ``config.model_training``,
            ``max_iter`` from ``params.vectorizer`` and the split settings
            from ``params.train_test_split``.
        """
        trainer_section = self.config.model_training
        # NOTE(review): max_iter is sourced from the *vectorizer* parameters - confirm intended.
        vectorizer_params = self.params.vectorizer
        split_params = self.params.train_test_split

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            features_path=trainer_section.features_path,
            labels_path=trainer_section.labels_path,
            model_path=trainer_section.model_path,
            max_iter=vectorizer_params.max_iter,
            test_size=split_params.test_size,
            random_state=split_params.random_state,
        )

In [None]:
import os
import joblib
import mlflow
import optuna
import numpy as np
import dagshub
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from Condition2Cure.utils.helpers import save_json
from Condition2Cure import logger

class ModelTrainer:
    """Tunes, fits, saves and tracks an XGBoost classifier.

    Hyperparameters are searched with Optuna using 5-fold stratified
    cross-validated accuracy. The best configuration is refit on the full
    dataset, serialized with joblib, and logged to a local MLflow file store.
    """

    # XGBoost settings that are not tuned. They are applied to every trial AND
    # to the final model, so the model that gets saved uses the same
    # configuration that was cross-validated.
    _FIXED_XGB_PARAMS = {
        "use_label_encoder": False,
        "verbosity": 0,
        "eval_metric": "mlogloss",
    }

    def __init__(self, config: "ModelTrainerConfig"):
        """Store the configuration and point MLflow at the local tracking store.

        Args:
            config: Training-stage settings (paths and random seed).
        """
        self.config = config

        mlflow.set_tracking_uri("file:./mlruns")
        mlflow.set_experiment("Condition2Cure")
        logger.info("MLflow tracking setup complete.")

    @classmethod
    def _build_xgb_params(cls, tuned_params, random_state):
        """Return the XGBClassifier kwargs: tuned values + fixed settings + seed.

        A new dict is returned; neither ``tuned_params`` nor the class-level
        fixed settings are modified.
        """
        return {**tuned_params, **cls._FIXED_XGB_PARAMS, "random_state": random_state}

    def train(self, n_trials: int = 1):
        """Run the hyperparameter search, fit the final model and persist everything.

        Args:
            n_trials: Number of Optuna trials to run. Defaults to 1, matching
                the previous hard-coded value.

        Returns:
            dict: ``best_score`` (float), ``best_params`` (tuned values only),
            ``feature_shape`` (list) and ``n_classes`` (int).

        Side effects:
            Writes the model to ``config.model_path``, writes
            ``best_model_info.json`` into ``config.root_dir``, and records
            parameters, metrics, artifacts and a registered model in MLflow.
        """
        logger.info("Loading training data...")

        # SECURITY NOTE: allow_pickle=True lets np.load unpickle objects, which
        # can execute arbitrary code from a crafted file. Kept for compatibility
        # with the existing artifacts; only load files produced by this pipeline.
        X = np.load(self.config.features_path, allow_pickle=True)
        y = np.load(self.config.labels_path, allow_pickle=True)

        logger.info(f"Data loaded. Features shape: {X.shape}, Labels shape: {y.shape}")

        seed = self.config.random_state

        def objective(trial):
            # Only these five values are searched; everything else is fixed.
            tuned = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            }

            model = XGBClassifier(**self._build_xgb_params(tuned, seed))
            scores = cross_val_score(
                model, X, y,
                scoring="accuracy",
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=seed),
                n_jobs=-1
            )
            return scores.mean()

        logger.info("Running Optuna hyperparameter tuning for XGBoost...")
        # Seed the sampler so the search itself is repeatable, not just the folds.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=seed),
        )
        study.optimize(objective, n_trials=n_trials)

        best_params = study.best_params
        best_score = study.best_value

        logger.info(f"Best hyperparameters: {best_params}")
        logger.info(f"Best CV Score: {best_score}")

        # Train final model with the best parameters. study.best_params holds
        # only the tuned values, so the fixed settings and the seed are merged
        # back in; otherwise the saved model would be unseeded and would differ
        # from the configuration that was scored.
        best_model = XGBClassifier(**self._build_xgb_params(best_params, seed))
        best_model.fit(X, y)

        # Save model. model_path is configured separately from root_dir, so its
        # own parent directory is created as well.
        os.makedirs(self.config.root_dir, exist_ok=True)
        Path(self.config.model_path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(best_model, self.config.model_path)
        logger.info(f"Model saved to: {self.config.model_path}")

        n_classes = len(np.unique(y))

        # Save best model info
        best_model_info = {
            "best_score": float(best_score),
            "best_params": best_params,
            "feature_shape": list(X.shape),
            "n_classes": n_classes
        }
        best_info_path = os.path.join(self.config.root_dir, "best_model_info.json")
        save_json(Path(best_info_path), best_model_info)
        logger.info(f"Best model info saved to: {best_info_path}")

        # Log to MLflow
        with mlflow.start_run(run_name="XGBoost_Hyperparameter_Tuning"):
            # Log best parameters and dataset/score summary
            mlflow.log_params(best_params)
            mlflow.log_metric("best_cv_score", best_score)
            mlflow.log_metric("n_features", X.shape[1])
            mlflow.log_metric("n_samples", X.shape[0])
            mlflow.log_metric("n_classes", n_classes)

            # Log model
            mlflow.sklearn.log_model(
                best_model,
                artifact_path="model",
                registered_model_name="Condition2CureModel"
            )

            # Log artifacts
            mlflow.log_artifact(best_info_path)
            mlflow.log_artifact(self.config.model_path)

            # Add tags
            mlflow.set_tag("model_type", "XGBoost")
            mlflow.set_tag("optimization", "Optuna")
            mlflow.set_tag("stage", "training")

            logger.info("Model and metrics logged to MLflow.")

        logger.info(f"Model training complete. Best CV score: {best_score:.4f}")
        return best_model_info

In [10]:
# `sys` is handed to CustomException below, so import it explicitly instead of
# relying on one of the wildcard imports to leak it into the namespace.
import sys

# Build the configuration, construct the trainer and run the training stage.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception as e:
    # Chain explicitly so the original traceback stays attached as the cause.
    raise CustomException(str(e), sys) from e

[2025-06-21 17:00:39,440: INFO: helpers: yaml file: config\config.yaml loaded successfully]
[2025-06-21 17:00:39,448: INFO: helpers: yaml file: config\params.yaml loaded successfully]
[2025-06-21 17:00:39,456: INFO: helpers: yaml file: config\schema.yaml loaded successfully]
[2025-06-21 17:00:39,461: INFO: helpers: created directory at: artifacts]
[2025-06-21 17:00:39,463: INFO: helpers: created directory at: artifacts/model_training]
[2025-06-21 17:00:39,486: INFO: 2300520557: MLflow tracking setup complete.]
[2025-06-21 17:00:39,486: INFO: 2300520557: Loading training data...]
Type of loaded data: <class 'numpy.ndarray'>
Content of loaded data: [[ 2.32058617e-01 -5.00949067e-02 -3.14517185e-02 ... -3.91988235e-05
   5.22123972e-02 -9.53827493e-03]
 [ 2.27359324e-01 -1.48550511e-01  3.91642716e-02 ... -1.71931963e-02
  -6.71735297e-03 -3.43215973e-02]
 [ 2.16979821e-01 -3.62951248e-02 -4.16611902e-02 ... -3.58333153e-02
   1.23667168e-02  2.56615089e-02]
 ...
 [ 2.13472029e-01  1.3356

[I 2025-06-21 17:00:39,953] A new study created in memory with name: no-name-b58e6d14-e2b5-4cd9-b2fd-3cd9f660fa96
[W 2025-06-21 17:01:17,574] Trial 0 failed with parameters: {'n_estimators': 133, 'max_depth': 9, 'learning_rate': 0.10241297450742867, 'subsample': 0.9112977896140222, 'colsample_bytree': 0.8809129887894374} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Javith Naseem\.conda\envs\Condition2Cure\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Javith Naseem\AppData\Local\Temp\ipykernel_16640\2300520557.py", line 53, in objective
    scores = cross_val_score(
  File "c:\Users\Javith Naseem\.conda\envs\Condition2Cure\lib\site-packages\sklearn\utils\_param_validation.py", line 218, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\Javith Naseem\.conda\envs\Condition2Cure\lib\site-packages\sklearn\model_selection\_validation.py", line 677,

KeyboardInterrupt: 