In [1]:
import os

# Display the kernel's current working directory as this cell's output.
os.getcwd()

'd:\\pythonProjects\\SurgeSense\\research'

In [2]:
# Step up from the `research` folder to the project root so that relative
# paths (e.g. the YAML files loaded further down) resolve from there.
# The guard keeps this cell idempotent: re-running it, or starting the kernel
# at the project root, no longer climbs an extra directory level.
if os.path.basename(os.getcwd())=='research':
    os.chdir('../')
os.getcwd()

'd:\\pythonProjects\\SurgeSense'

In [3]:
# entity 
import os 
from pathlib import Path 
from dataclasses import dataclass

@dataclass
class ModelTrainConfig:
    """Settings for the model-training step.

    Groups the artifact locations, the estimator choice with its
    hyper-parameters, and the column lists taken from the schema file.
    Dataclasses do not enforce annotations at runtime; the types below
    document what each field is expected to hold.
    """
    root_dir: Path  # directory the trained model file is written into
    train_data_path: Path  # CSV file read as the training set
    test_data_path: Path  # CSV file holding the held-out set
    model_name: str  # file name of the saved model, joined onto root_dir
    n_estimators: int
    max_depth: int
    min_samples_split: int
    # Passed as `learning_rate` to the boosting regressors, which take a
    # fractional value (e.g. 0.1), so this is a float rather than an int.
    learning_rate: float
    select_model: str  # key choosing which regressor to build
    target_column: str  # name of the column to predict
    categorical_columns:list  # column names routed to the categorical preprocessor
    numerical_columns:list  # column names routed to the numerical preprocessor
    drop_columns:list  # column names removed from the features before fitting



In [4]:
# config 
from SurgeSense.constants import * 
from SurgeSense.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML settings and turns them into config objects.

    Construction loads the config, params and schema files with ``read_yaml``
    and creates the directory named by ``artifacts_root`` in the config file.
    """

    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            param_filepath=PARAMS_FILE_PATH,
            schema_filepath=SCHEMA_FILE_PATH):
        # Load the three documents in order: config, params, schema.
        self.config, self.param, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, param_filepath, schema_filepath)
        )

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_model_train_config(self)->ModelTrainConfig:
        """Build the ``ModelTrainConfig`` for the training step.

        Combines the ``model_trainer`` section of the config file, the
        ``select_model`` section of the params file and the schema file,
        and creates the trainer's ``root_dir`` before returning.
        """
        trainer_section = self.config.model_trainer
        model_params = self.param.select_model
        data_schema = self.schema
        create_directories([trainer_section.root_dir])

        return ModelTrainConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            n_estimators=model_params.n_estimators,
            max_depth=model_params.max_depth,
            min_samples_split=model_params.min_samples_split,
            learning_rate=model_params.learning_rate,
            select_model=model_params.algo,
            target_column=data_schema.TARGET_COLUMN.name,
            categorical_columns=data_schema.TRANSFORM.CATEGORICAL_DATA,
            numerical_columns=data_schema.TRANSFORM.NUMERICAL_DATA,
            drop_columns=data_schema.DROP_COLUMNS,
        )
        


In [5]:
# component 
import pandas as pd 
import os 
from SurgeSense import logger
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import joblib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd 
import numpy as np 



class ModelTrainer:
    """Builds a preprocessing + regressor pipeline and fits it on the training CSV.

    The estimator and its hyper-parameters are taken from ``config``; the
    fitted pipeline is saved with ``joblib`` under ``config.root_dir``.
    """

    def __init__(self, config: ModelTrainConfig):
        self.config=config

    def _build_preprocessor(self):
        """Return the column-wise preprocessing shared by every supported model."""
        # Numerical columns: replace NaN with the column median, then standardise.
        numerical_preprocessor=Pipeline(
            steps=[
                ('imputation_menu',SimpleImputer(missing_values=np.nan,strategy='median')),
                ('scalar',StandardScaler())
            ]
        )

        # Categorical columns: fill gaps with the most frequent value, then
        # one-hot encode, ignoring categories that were not seen during fit.
        categorical_preprocessor=Pipeline(
            steps=[
                ('imputation_constant',SimpleImputer(strategy='most_frequent')),
                ('encode',OneHotEncoder(handle_unknown='ignore'))
            ]
        )

        return ColumnTransformer(
            transformers=[
                ('categorical_columns',categorical_preprocessor,self.config.categorical_columns),
                ('numerical_columns',numerical_preprocessor,self.config.numerical_columns)
            ]
        )

    def _build_estimator(self):
        """Instantiate the regressor selected by ``config.select_model``.

        Raises:
            ValueError: if ``select_model`` is not one of the supported keys.
        """
        cfg=self.config

        if cfg.select_model=='XGBoostRegressor':
            return XGBRegressor(
                n_estimators=cfg.n_estimators,
                learning_rate=cfg.learning_rate,
                max_depth=cfg.max_depth
            )
        if cfg.select_model=='GRADIENT_BOOSTING':
            return GradientBoostingRegressor(
                n_estimators=cfg.n_estimators,
                learning_rate=cfg.learning_rate,
                max_depth=cfg.max_depth
            )
        if cfg.select_model=='RANDOM_FOREST':
            # RandomForestRegressor has no `learning_rate` parameter (passing
            # one raises TypeError); it takes the configured
            # `min_samples_split` instead.
            return RandomForestRegressor(
                n_estimators=cfg.n_estimators,
                max_depth=cfg.max_depth,
                min_samples_split=cfg.min_samples_split
            )

        # Fail with a clear message rather than an UnboundLocalError later on.
        raise ValueError(
            f"Unsupported select_model {cfg.select_model!r}; expected one of "
            "'XGBoostRegressor', 'GRADIENT_BOOSTING', 'RANDOM_FOREST'"
        )

    def create_pipeline(self):
        """Return an unfitted ``Pipeline`` of the preprocessor and the chosen regressor.

        Raises:
            ValueError: if ``config.select_model`` names an unsupported model.
        """
        return Pipeline(
            steps=[
                ('preprocessor',self._build_preprocessor()),
                ('model',self._build_estimator())
            ]
        )

    def train(self, pipe: Pipeline):
        """Fit ``pipe`` on the training CSV and save it to disk.

        The target column and every column in ``config.drop_columns`` are
        removed from the features. The fitted pipeline is written to
        ``config.root_dir`` / ``config.model_name``. Returns ``None``.
        """
        train_data=pd.read_csv(self.config.train_data_path)

        # `drop_columns` can be empty in the schema (loaded as None); treat
        # that as "nothing extra to drop".
        excluded_columns=[self.config.target_column]+list(self.config.drop_columns or [])
        train_x=train_data.drop(columns=excluded_columns)
        # A 1-D Series target avoids sklearn's column-vector DataConversionWarning.
        train_y=train_data[self.config.target_column]

        pipe.fit(train_x,train_y)
        joblib.dump(pipe,os.path.join(self.config.root_dir,self.config.model_name))


In [6]:
# pipeline 
# Run the training stage end to end. There is deliberately no try/except:
# any failure should surface with its original traceback.
config=ConfigurationManager()
model_trainer_config=config.get_model_train_config()
model_trainer=ModelTrainer(config=model_trainer_config)
pipeline=model_trainer.create_pipeline()
model_trainer.train(pipeline)

[2025-03-30 18:22:04,301: INFO :common : yaml file: config\config.yaml loaded successfully]
[2025-03-30 18:22:04,306: INFO :common : yaml file: params.yaml loaded successfully]
[2025-03-30 18:22:04,311: INFO :common : yaml file: schema.yaml loaded successfully]
[2025-03-30 18:22:04,313: INFO :common : created directory at: artifacts]
[2025-03-30 18:22:04,314: INFO :common : created directory at: artifacts/model_trainer]


In [7]:
# Rebuild the configuration objects without training. This repeats the first
# two steps of the training cell above -- presumably kept for interactive
# inspection of `model_trainer_config`; TODO confirm it is still needed.
config=ConfigurationManager()
model_trainer_config=config.get_model_train_config()

[2025-03-30 18:22:28,153: INFO :common : yaml file: config\config.yaml loaded successfully]
[2025-03-30 18:22:28,159: INFO :common : yaml file: params.yaml loaded successfully]
[2025-03-30 18:22:28,164: INFO :common : yaml file: schema.yaml loaded successfully]
[2025-03-30 18:22:28,165: INFO :common : created directory at: artifacts]
[2025-03-30 18:22:28,166: INFO :common : created directory at: artifacts/model_trainer]


In [8]:
# data=pd.read_csv(params.model_trainer.train_data_path)
# data['price']