In [1]:
import os
%pwd

# Step up one directory from wherever the kernel started; presumably this is so
# the relative config/artifact paths and the `src.` imports in later cells
# resolve from the project root — confirm.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart climbs one more level each time.
os.chdir("../")
%pwd

'/workspaces/MLOps_data_science_pipeline'

In [12]:
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict


@dataclass
class ModelTrainerConfig:
    """Settings for the model-training stage.

    Instances are assembled by ``ConfigurationManager.get_model_trainer_config``
    and consumed by ``ModelTrainer``.

    NOTE(review): the builder forwards the raw config values without converting
    them, so the ``Path`` annotations describe intent rather than a guaranteed
    runtime type — confirm what ``read_yaml`` yields.
    """

    # Output directory for this stage; the builder creates it and trained models are written into it.
    root_dir: Path
    # CSV file read as the training set.
    train_data_path: Path
    # Stored for completeness; the current ModelTrainer does not read it.
    test_data_path: Path
    # Base file name for saved models; any extension is stripped and the model's name is prefixed.
    model_name: str
    # Legacy ElasticNet hyper-parameters, kept for backward compatibility.
    # The builder passes None for each of these when no ElasticNet block is configured.
    alpha: float
    l1_ratio: float
    random_state: int
    # Name of the column that is dropped from the features and used as the training target.
    target_col: str
    # Mapping of model name -> constructor keyword arguments for that model.
    models: Dict[str, Dict[str, Any]] = field(default_factory=dict)

In [14]:
# --- helper to safely convert Box/ConfigBox/dataclass/dict to plain dict ---
def _to_plain_dict(obj) -> Dict[str, Any]:
    """
    Convert many common config-like objects to plain dict.
    Works for dict, Box/ConfigBox (to_dict), dataclass (vars), or objects with attributes.
    """
    if obj is None:
        return {}
    # already a dict
    if isinstance(obj, dict):
        return obj
    # Box/ConfigBox typically has to_dict()
    if hasattr(obj, "to_dict"):
        try:
            return obj.to_dict()
        except Exception:
            pass
    # mapping-like
    if hasattr(obj, "items"):
        try:
            return dict(obj)
        except Exception:
            pass
    # dataclass or plain object
    try:
        return dict(vars(obj))
    except Exception:
        pass
    # fallback: introspect public attributes
    out = {}
    for k in dir(obj):
        if k.startswith("_"):
            continue
        try:
            v = getattr(obj, k)
        except Exception:
            continue
        if callable(v):
            continue
        out[k] = v
    return out

In [17]:
from src.data_science.constants.constants import *
from src.data_science.utils.common import read_yaml, create_directories
from src.data_science.utils.logger import logger

In [18]:
class ConfigurationManager:
    """Loads the config, params and schema YAML files and builds stage configs."""

    # Top-level params keys treated as model blocks when params has no usable
    # ``models`` section.
    _KNOWN_MODELS = (
        "ElasticNet",
        "LinearRegression",
        "Ridge",
        "Lasso",
        "RandomForest",
        "GradientBoosting",
    )

    # Column name used when the schema does not yield a target column name.
    _DEFAULT_TARGET_COL = "target"

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """
        Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main config YAML.
            params_filepath: path of the hyper-parameter YAML.
            schema_filepath: path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def _collect_model_params(self) -> Dict[str, Dict[str, Any]]:
        """Return ``{model name: plain dict of constructor params}`` taken from params."""
        params_dict = _to_plain_dict(self.params)

        # Prefer a ``models`` section (key matched case-insensitively). Keys are
        # passed through str() so a non-string YAML key cannot break .lower().
        models_conf = {}
        for key, value in params_dict.items():
            if str(key).lower() == "models":
                models_conf = _to_plain_dict(value)
                break

        if not models_conf:
            # Fallback: look for known top-level model blocks (ElasticNet, RandomForest, ...).
            # Start from a fresh dict so the loaded params are never mutated.
            models_conf = {}
            lower_to_key = {str(k).lower(): k for k in params_dict}
            for model in self._KNOWN_MODELS:
                original_key = lower_to_key.get(model.lower())
                if original_key is not None:
                    models_conf[model] = _to_plain_dict(params_dict[original_key])

        # Normalise: string keys, plain-dict values (a None block becomes {}).
        return {str(name): _to_plain_dict(block) for name, block in models_conf.items()}

    @staticmethod
    def _target_name_from(block):
        """Extract a column name from a schema entry, or return None if there is none."""
        if not block:
            return None
        if isinstance(block, str):
            return block
        # Mapping entries (plain dicts as well as dict subclasses) carry the
        # column name under the "name" key; other objects expose it as an attribute.
        if isinstance(block, dict):
            name = block.get("name")
        else:
            name = getattr(block, "name", None)
        return name or None

    def _resolve_target_column(self):
        """
        Find the target column name in the schema.

        Looks for ``TARGET_COLUMN`` / ``target_column`` first as attributes of the
        schema object and then as keys of its plain-dict form. Falls back to
        ``"target"`` (with a warning) when neither lookup yields a name.
        """
        schema_plain = _to_plain_dict(self.schema)
        candidates = (
            getattr(self.schema, "TARGET_COLUMN", None) or getattr(self.schema, "target_column", None),
            schema_plain.get("TARGET_COLUMN") or schema_plain.get("target_column"),
        )
        for block in candidates:
            name = self._target_name_from(block)
            if name:
                return name

        logger.warning(
            f"Could not resolve a target column name from the schema; defaulting to '{self._DEFAULT_TARGET_COL}'."
        )
        return self._DEFAULT_TARGET_COL

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """
        Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage output directory as a side effect.

        Returns:
            ModelTrainerConfig with paths from the ``model_trainer`` config block,
            per-model params from params, and the target column from the schema.
            ``alpha`` / ``l1_ratio`` / ``random_state`` are the legacy ElasticNet
            values and are ``None`` when no ElasticNet block is configured.

        Raises:
            KeyError: if the config file has no ``model_trainer`` block.
        """
        trainer_block = getattr(self.config, "model_trainer", None)
        if trainer_block is None:
            raise KeyError("model_trainer block not found in config file")

        # ensure output dir exists
        create_directories([trainer_block.root_dir])

        normalized_models = self._collect_model_params()

        # Fill legacy ElasticNet parameters if available (for backward compatibility)
        elastic_params = normalized_models.get("ElasticNet", {})
        alpha = elastic_params.get("alpha")
        l1_ratio = elastic_params.get("l1_ratio")
        random_state = elastic_params.get("random_state")

        model_trainer_config = ModelTrainerConfig(
            root_dir=trainer_block.root_dir,
            train_data_path=trainer_block.train_data_path,
            test_data_path=trainer_block.test_data_path,
            model_name=trainer_block.model_name,
            target_col=self._resolve_target_column(),
            alpha=alpha,
            l1_ratio=l1_ratio,
            random_state=random_state,
            models=normalized_models,
        )

        # Quick sanity log to verify what was parsed
        logger.info(f"ModelTrainerConfig created. ElasticNet params: alpha={alpha}, l1_ratio={l1_ratio}, random_state={random_state}")
        logger.info(f"Models keys found: {list(normalized_models.keys())}")

        return model_trainer_config


In [5]:
import pandas as pd
import os
from src.data_science.utils.logger import logger
import joblib

from sklearn.linear_model import ElasticNet, LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor



In [21]:
# class ModelTrainer:
#     def __init__(self, config: ModelTrainerConfig):
#         self.config = config

#     def train_model(self, X_train, y_train):
#         model = ElasticNet(
#             alpha=self.config.alpha,
#             l1_ratio=self.config.l1_ratio,
#             random_state=self.config.random_state
#         )
#         model.fit(X_train, y_train)
#         return model

#     def save_model(self, model):
#         model_path = os.path.join(self.config.root_dir, self.config.model_name)
#         joblib.dump(model, model_path)
#         logger.info(f"Model saved at {model_path}")

#     def initiate_model_trainer(self):
#         try:
#             train_df = pd.read_csv(self.config.train_data_path)
#             test_df = pd.read_csv(self.config.test_data_path)

#             X_train = train_df.drop(columns=[self.config.target_col])
#             y_train = train_df[self.config.target_col]

#             X_test = test_df.drop(columns=[self.config.target_col])
#             y_test = test_df[self.config.target_col]

#             model = self.train_model(X_train, y_train)
#             self.save_model(model)

#             logger.info("Model training completed successfully.")
#         except Exception as e:
#             logger.error(f"Error in model training: {e}")
#             raise e


# Map model names to sklearn classes.
# The keys are the names ModelTrainer looks up when it instantiates models from
# the config; a configured name that is not listed here is skipped with a warning.
MODEL_MAPPING = {
    "ElasticNet": ElasticNet,
    "LinearRegression": LinearRegression,
    "Ridge": Ridge,
    "Lasso": Lasso,
    "RandomForest": RandomForestRegressor,
    "GradientBoosting": GradientBoostingRegressor
}

class ModelTrainer:
    """Instantiates, fits and saves every model described by the trainer config."""

    def __init__(self, config):
        """
        Build one estimator per configured model.

        config is your ModelTrainerConfig, or any object exposing the same
        settings as attributes (attribute access is required, so a plain dict
        will not work):
          - train_data_path, target_col, root_dir, model_name (optional)
        Model hyperparams are read either:
          a) from config.models (recommended), a mapping {ModelName: {params...}}
          b) OR, when config has no `models` attribute, from attributes named
             after the MODEL_MAPPING keys (ElasticNet, RandomForest, etc.)

        Names missing from MODEL_MAPPING and models whose params are rejected
        by the constructor (TypeError) are logged and skipped. Nothing is raised
        when no model could be initialised; an error is logged instead.
        """
        self.config = config
        self.models = {}

        models_dict = self._normalize_models_conf(self._find_models_conf())

        # Instantiate models from MODEL_MAPPING using params
        for model_name, params in models_dict.items():
            if model_name not in MODEL_MAPPING:
                logger.warning(f"{model_name} not recognized in MODEL_MAPPING — skipping.")
                continue
            try:
                logger.info(f"Initializing {model_name} with params: {params}")
                self.models[model_name] = MODEL_MAPPING[model_name](**params)
            except TypeError as te:
                logger.error(f"Failed to instantiate {model_name} with params {params}: {te}")

        if not self.models:
            logger.error("No models initialized. Check your config 'models' section or top-level model keys.")
            # Do not raise here; you might prefer to raise depending on workflow:
            # raise RuntimeError("No models initialized from config")

    def _find_models_conf(self):
        """Return the raw model configuration from the config, or None if there is none."""
        # 1) Recommended structure: config.models
        if hasattr(self.config, "models"):
            return getattr(self.config, "models")
        # 2) Fall back: collect attributes named after the MODEL_MAPPING keys
        possible = {
            model_name: getattr(self.config, model_name)
            for model_name in MODEL_MAPPING
            if hasattr(self.config, model_name)
        }
        return possible or None

    @staticmethod
    def _normalize_models_conf(models_conf):
        """Convert the raw model configuration into a plain ``{str: dict}`` mapping."""
        if models_conf is None:
            logger.warning("No model configuration found in config (no 'models' key and no top-level model keys).")
            return {}

        models_dict = {}
        try:
            # dict() covers Box, ConfigBox, dict, etc.
            for name, raw_params in dict(models_conf).items():
                # Mapping-like params become a plain dict; None or anything else becomes {}.
                if raw_params is None:
                    params = {}
                else:
                    try:
                        params = dict(raw_params) if hasattr(raw_params, "items") else {}
                    except Exception:
                        params = {}
                models_dict[str(name)] = params
        except Exception:
            logger.warning("Could not convert config.models to dict; no models will be loaded.")
        return models_dict

    def train_models(self, X_train, y_train):
        """
        Fit every initialised model on the given training data.

        Returns:
            dict mapping model name -> the fitted estimator (the same objects
            held in ``self.models``).
        """
        trained_models = {}
        for name, model in self.models.items():
            logger.info(f"Training {name}...")
            model.fit(X_train, y_train)
            trained_models[name] = model
        return trained_models

    def save_models(self, models):
        """
        Dump each model to ``<root_dir>/<ModelName>_<base_name>.joblib``.

        ``root_dir`` falls back to the current directory when the config has no
        usable value; ``base_name`` is the configured ``model_name`` without its
        extension ("model" when it is missing or empty).
        """
        # Resolve the directory once so the folder that is created is also the
        # folder that is written to.
        root_dir = getattr(self.config, "root_dir", None) or "."
        os.makedirs(root_dir, exist_ok=True)

        # get configured model_name and remove any extension if present
        raw_model_name = getattr(self.config, "model_name", "model") or "model"
        base_name, _ = os.path.splitext(raw_model_name)
        if not base_name:
            base_name = "model"

        for name, model in models.items():
            filename = f"{name}_{base_name}.joblib"
            model_path = os.path.join(root_dir, filename)
            joblib.dump(model, model_path)
            logger.info(f"{name} saved at {model_path}")

    def initiate_model_trainer(self):
        """
        Run the training stage: read the training CSV, fit all models, save them.

        Any exception is logged and re-raised unchanged.
        """
        try:
            train_df = pd.read_csv(self.config.train_data_path)
            # The test split is not needed for fitting; config.test_data_path is
            # left unread here and kept for future work.

            X_train = train_df.drop(columns=[self.config.target_col])
            y_train = train_df[self.config.target_col]

            models = self.train_models(X_train, y_train)
            self.save_models(models)

            # Only report success when something was actually trained.
            if models:
                logger.info("All models trained and saved successfully.")
            else:
                logger.warning("No models were trained, so nothing was saved. Check the 'models' configuration.")
        except Exception as e:
            logger.error(f"Error in model training: {e}")
            # Bare raise re-raises the active exception with its traceback intact.
            raise


In [22]:
# Run the model-training stage end to end: load the configuration, build the
# trainer config, then fit and save every configured model.
# No try/except wrapper: one that only re-raises handles nothing, and any
# failure already propagates with its full traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.initiate_model_trainer()

[2025-09-25 08:59:16,294 - INFO - common - yaml file: config/config.yaml loaded successfully!]
[2025-09-25 08:59:16,298 - INFO - common - yaml file: params.yaml loaded successfully!]
[2025-09-25 08:59:16,301 - INFO - common - yaml file: schema.yaml loaded successfully!]
[2025-09-25 08:59:16,303 - INFO - common - created directory at: artifacts]
[2025-09-25 08:59:16,307 - INFO - common - created directory at: artifacts/model_trainer]
[2025-09-25 08:59:16,308 - INFO - 2548596474 - ModelTrainerConfig created. ElasticNet params: alpha=0.2, l1_ratio=0.1, random_state=42]
[2025-09-25 08:59:16,309 - INFO - 2548596474 - Models keys found: ['ElasticNet', 'LinearRegression', 'Ridge', 'Lasso', 'RandomForest', 'GradientBoosting']]
[2025-09-25 08:59:16,310 - INFO - 1866421887 - Initializing ElasticNet with params: {'alpha': 0.2, 'l1_ratio': 0.1, 'random_state': 42}]
[2025-09-25 08:59:16,311 - INFO - 1866421887 - Initializing LinearRegression with params: {}]
[2025-09-25 08:59:16,311 - INFO - 186642