In [1]:
import os

In [2]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Regression-Project\\research'

In [3]:
# Move the working directory one level up so the relative paths used further
# down (e.g. 'artifacts/...') resolve from the parent directory.
# NOTE: this is relative to the current working directory, so the cell is not
# idempotent: every additional run climbs one more level. Run it exactly once
# per kernel session.
os.chdir("../")

In [4]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Regression-Project'

In [5]:
from box import ConfigBox
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only settings bundle for the model-training stage.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. Instances are built by
    ``ConfigurationManager.get_model_trainer_config``.

    NOTE(review): the values are passed through as returned by ``read_yaml``,
    so the path fields may be plain strings rather than ``Path`` objects at
    runtime; the annotations are not enforced. TODO confirm.
    """
    # Output directory for this stage; created via create_directories before
    # the config object is built.
    root_dir: Path
    # Taken from the `model_trainer.model_name` config entry; not read by the
    # ModelTrainer code in this notebook.
    model_name: str
    # Destination handed to save_object_pkl for the best model.
    trained_model_file_path: Path
    # File that ModelTrainer.evaluate_models writes its per-model results to
    # as JSON, and that load_best_model_from_json later reads.
    grid_search_evaluation_result: Path
    # Whole content of the params file; ModelTrainer reads one section per
    # model (e.g. `params.Decision_Tree`) and calls `.to_dict()` on it.
    params: ConfigBox



In [6]:
from RegressionProject.utils.common import read_yaml, create_directories, load_best_model_from_json, save_object_pkl
from RegressionProject.constants import *


In [7]:
from RegressionProject.constants import *


class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH,
            schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyper-parameter YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Files are read in config -> params -> schema order.
        self.config, self.params, self.schema = map(
            read_yaml, (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the training stage's output directory and return its settings.

        Returns:
            A ``ModelTrainerConfig`` filled from the ``model_trainer`` section
            of the configuration plus the complete params content.
        """
        trainer_section = self.config.model_trainer

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            model_name=trainer_section.model_name,
            trained_model_file_path=trainer_section.trained_model_file_path,
            grid_search_evaluation_result=trainer_section.grid_search_evaluation_result,
            params=self.params,
        )

In [13]:
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from RegressionProject.logging import logger

from sklearn.linear_model import LinearRegression

from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from RegressionProject.constants import *
from RegressionProject.utils.common import read_yaml, create_directories
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import json
import pandas as pd


class ModelTrainer:
    """Grid-searches a fixed set of regressors and persists the best one.

    The workflow is: build (estimator, parameter grid) pairs from the params
    configuration, tune each estimator with ``GridSearchCV``, score the tuned
    estimators on the train and test data, export the results as JSON, and
    pickle the model selected by ``load_best_model_from_json``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Settings providing the params, the JSON results path and
                the destination path for the trained model.
        """
        self.config = config

    def prepare_params_models_gs(self):
        """Pair every candidate estimator with its parameter grid.

        Returns:
            dict mapping a display name to ``(estimator, param_grid)``. Grids
            come from the matching section of the params configuration;
            Linear Regression uses an empty grid.
        """
        params = self.config.params

        model_params = {
            'Decision Tree': (DecisionTreeRegressor(), params.Decision_Tree.to_dict()),
            'Random Forest': (RandomForestRegressor(), params.Random_Forest.to_dict()),
            'Gradient Boosting': (GradientBoostingRegressor(), params.Gradient_Boosting.to_dict()),
            'Linear Regression': (LinearRegression(), {}),
            'XGBRegressor': (XGBRegressor(), params.XGBRegressor.to_dict()),
            'CatBoosting Regressor': (CatBoostRegressor(verbose=False), params.CatBoosting_Regressor.to_dict()),
            'AdaBoost Regressor': (AdaBoostRegressor(), params.AdaBoost_Regressor.to_dict()),
        }

        return model_params

    def perform_grid_search(self, model, param_grid, X, y):
        """Run a 5-fold grid search for one estimator and log the outcome.

        Args:
            model: Estimator to tune.
            param_grid: Parameter grid handed to ``GridSearchCV``.
            X: Training features.
            y: Training targets.

        Returns:
            The fitted ``GridSearchCV`` object.
        """
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X, y)
        logger.info(f"Best params for {model.__class__.__name__}: {grid_search.best_params_}")
        logger.info(f"Best score for {model.__class__.__name__}: {grid_search.best_score_}")
        return grid_search

    def evaluate_models(self, x_train, y_train, x_test, y_test, model_params):
        """Tune and score every candidate model, exporting results as JSON.

        Args:
            x_train: Training features.
            y_train: Training targets.
            x_test: Test features.
            y_test: Test targets.
            model_params: Mapping of display name to ``(estimator, param_grid)``
                as produced by ``prepare_params_models_gs``.

        Returns:
            dict mapping each display name to a dict with the keys
            ``best_params``, ``model`` (the fitted best estimator),
            ``train_model_score`` and ``test_model_score`` (both R^2).
        """
        results = {}
        for model_name, (model, param_grid) in model_params.items():
            logger.info(f"Performing grid search for {model_name}")
            grid_search = self.perform_grid_search(model, param_grid, x_train, y_train)

            best_params = grid_search.best_params_
            # GridSearchCV refits by default (refit=True): best_estimator_ is
            # already trained on the full training set with the best
            # parameters, so fitting the estimator a second time is wasted work.
            best_model = grid_search.best_estimator_

            y_train_pred = best_model.predict(x_train)
            y_test_pred = best_model.predict(x_test)

            # Calculate training and test metrics
            train_model_score = r2_score(y_train, y_train_pred)
            test_model_score = r2_score(y_test, y_test_pred)

            results[model_name] = {
                'best_params': best_params,
                'model': best_model,
                'train_model_score': train_model_score,
                'test_model_score': test_model_score
            }
            # Export results to JSON. The file is rewritten after every model,
            # so it always holds the results of all models finished so far.
            # default=str serialises values json cannot encode (such as the
            # estimator object) through str().
            with open(self.config.grid_search_evaluation_result, 'w') as f:
                json.dump(results, f, default=str, indent=4)
            logger.info(f"Training results exported to {self.config.grid_search_evaluation_result} ")
        return results

    def models_trainer(self, train_x, train_y, test_x, test_y):
        """Run the whole training pipeline and pickle the best model.

        Args:
            train_x: Training features.
            train_y: Training targets.
            test_x: Test features.
            test_y: Test targets.

        Returns:
            The fitted estimator that was saved to
            ``config.trained_model_file_path``.
        """
        model_params = self.prepare_params_models_gs()
        results = self.evaluate_models(train_x, train_y, test_x, test_y, model_params)
        # The winner is chosen by load_best_model_from_json from the exported
        # JSON file; its selection rule is defined outside this notebook.
        best_model_name = load_best_model_from_json(self.config.grid_search_evaluation_result)
        best_model = results[best_model_name]['model']
        logger.info("Best Model found is : {}".format(best_model))
        save_object_pkl(self.config.trained_model_file_path, best_model)
        return best_model




In [14]:

# Main execution: read the train/test CSV files and separate the feature
# columns from the target column.
target_column = '19'
train_data = pd.read_csv('artifacts/data_transformation/transformed_train_data.csv')
test_data = pd.read_csv('artifacts/data_transformation/transformed_test_data.csv')

# Features keep every column except the target; targets become plain arrays.
train_x, test_x = (
    train_data.drop(columns=[target_column]),
    test_data.drop(columns=[target_column]),
)
train_y, test_y = train_data[target_column].values, test_data[target_column].values

In [17]:
# Build the trainer from the YAML-backed configuration and run the training
# pipeline. Exceptions propagate unchanged: the former
# `except Exception as e: raise e` wrapper handled nothing and only added an
# extra frame to the traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)

# NOTE: despite its name, `model_params` holds whatever models_trainer returns,
# not a parameter grid.
model_params = model_trainer.models_trainer(train_x, train_y, test_x, test_y)

[2024-06-26 07:24:33,047: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-26 07:24:33,049: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-26 07:24:33,051: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-26 07:24:33,051: INFO: common: created directory at: artifacts]
[2024-06-26 07:24:33,052: INFO: common: created directory at: artifacts/model_trainer]
[2024-06-26 07:24:33,053: INFO: 63491469: Performing grid search for Decision Tree]
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[2024-06-26 07:24:33,102: INFO: 63491469: Best params for DecisionTreeRegressor: {'criterion': 'friedman_mse'}]
[2024-06-26 07:24:33,102: INFO: 63491469: Best score for DecisionTreeRegressor: 0.717921328680716]
[2024-06-26 07:24:33,109: INFO: 63491469: Training results exported to artifacts/model_trainer/train_results.json ]
[2024-06-26 07:24:33,109: INFO: 63491469: Performing grid search for Random Forest]
Fitting 5 folds for each o