In [1]:
import os
%pwd

'd:\\Data Science\\END to END Proj\\Fertilizer_Pred_MLOPS\\research'

In [2]:
# Step up from the notebook folder ("research") to the project root so the
# relative paths used below (config/, artifacts/, src/) resolve.
# Guarded on the current folder name so that re-running this cell does not
# climb one directory further on every execution.
# NOTE(review): if the notebook folder is ever renamed, update the name here.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
from pathlib import Path
from dataclasses import dataclass
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import os
import joblib
from typing import Tuple, List
import warnings
warnings.filterwarnings('ignore')

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Declared with ``frozen=True``, so fields cannot be reassigned after the
    instance is created. The fields fall into four groups: filesystem
    locations, the label column name, XGBoost hyperparameters, and
    training / cross-validation controls.
    """
    # --- Filesystem locations ---
    # Working directory of the training stage.
    root_dir: Path
    # CSV files with the feature rows of the two datasets.
    train_data_path: Path
    original_data_path: Path
    # CSV files with the labels that belong to the two datasets above.
    train_label_path: Path
    original_label_path: Path
    # Directory that receives the saved per-fold models.
    model_dir: Path
    # Configured model name (read from the config file).
    model_name: str

    # --- Labels ---
    # Name of the column to read from the label CSV files.
    target_column: str

    # --- XGBoost hyperparameters ---
    # Forwarded to XGBoost as the parameters of the same name.
    max_depth: int
    learning_rate: float
    reg_alpha: float
    reg_lambda: float
    gamma: float
    subsample: float
    colsample_bytree: float
    min_child_weight: int

    # --- Training / cross-validation controls ---
    # Upper bound on boosting rounds and the early-stopping patience.
    num_boost_round: int
    early_stopping_rounds: int
    # Number of stratified cross-validation folds.
    n_folds: int
    # Seed used for the fold split and passed to XGBoost as `random_state`.
    random_seed: int
    # Passed to XGBoost as `n_jobs`.
    n_jobs: int



In [4]:
from src.Fertilizer_Pred.utils.common import read_yaml, create_directories
from src.Fertilizer_Pred.constant import *
from pathlib import Path

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Load config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the hyperparameter YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's root and model directories, checks that the four
        input CSV files exist, and copies the XGBoost and training settings
        out of the params file.

        Returns:
            A populated ``ModelTrainerConfig``.

        Raises:
            ValueError: For any failure while building the configuration,
                including a missing input file; the original exception is
                attached as the cause.
        """
        booster_fields = (
            "max_depth",
            "learning_rate",
            "reg_alpha",
            "reg_lambda",
            "gamma",
            "subsample",
            "colsample_bytree",
            "min_child_weight",
        )
        training_fields = (
            "num_boost_round",
            "early_stopping_rounds",
            "n_folds",
            "random_seed",
            "n_jobs",
        )

        try:
            trainer_section = self.config.model_trainer
            booster_section = self.params.XGBoost
            training_section = self.params.training
            target_section = self.schema.TARGET_COLUMN

            # Output locations must exist before anything is written to them.
            root_dir = Path(trainer_section.root_dir)
            model_dir = Path(trainer_section.model_dir)
            create_directories([root_dir, model_dir])

            # Fail fast on the first input file that is not on disk.
            input_files = (
                trainer_section.train_data_path,
                trainer_section.original_data_path,
                trainer_section.train_label_path,
                trainer_section.original_label_path,
            )
            first_missing = next(
                (candidate for candidate in input_files
                 if not Path(candidate).exists()),
                None,
            )
            if first_missing is not None:
                raise FileNotFoundError(f"Required file not found: {first_missing}")

            return ModelTrainerConfig(
                root_dir=root_dir,
                train_data_path=Path(trainer_section.train_data_path),
                original_data_path=Path(trainer_section.original_data_path),
                train_label_path=Path(trainer_section.train_label_path),
                original_label_path=Path(trainer_section.original_label_path),
                model_dir=model_dir,
                model_name=trainer_section.model_name,
                target_column=target_section.name,
                # XGBoost parameters
                **{field: getattr(booster_section, field) for field in booster_fields},
                # Training parameters
                **{field: getattr(training_section, field) for field in training_fields},
            )
        except Exception as err:
            raise ValueError(f"Configuration error: {str(err)}") from err

In [5]:
import os
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from typing import Tuple, List
import warnings
warnings.filterwarnings('ignore')

class ModelTrainer:
    """Trains one XGBoost multi-class model per stratified CV fold.

    The trainer reads two feature/label CSV pairs (the "train" and the
    "original" dataset), stacks them into a single training set, and fits one
    booster per fold, saving each booster into ``config.model_dir``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths, target column name, hyperparameters and CV settings.
        """
        self.config = config

    def _load_data(self) -> Tuple[pd.DataFrame, np.ndarray, List[str]]:
        """Load, validate and combine the two training datasets.

        Returns:
            A tuple ``(X, y, classes)``: ``X`` is the row-wise concatenation
            of both feature files with a fresh 0..n-1 index, ``y`` holds the
            integer-encoded labels in the same row order, and ``classes`` is
            the encoder's ``classes_`` array (position = encoded value).

        Raises:
            RuntimeError: If a file is missing, a feature file and its label
                file have different row counts, or reading/encoding fails.
                The original exception is attached as the cause.
        """
        try:
            # Verify files exist
            for path in (
                self.config.train_data_path,
                self.config.original_data_path,
                self.config.train_label_path,
                self.config.original_label_path,
            ):
                if not path.exists():
                    raise FileNotFoundError(f"Data file not found: {path}")

            # Load data
            df_train = pd.read_csv(self.config.train_data_path)
            df_original = pd.read_csv(self.config.original_data_path)

            target = self.config.target_column
            train_labels = pd.read_csv(self.config.train_label_path)[target]
            original_labels = pd.read_csv(self.config.original_label_path)[target]

            # Features and labels live in separate files; a row-count mismatch
            # would silently misalign X and y, so reject it up front.
            for name, features, labels in (
                ("train", df_train, train_labels),
                ("original", df_original, original_labels),
            ):
                if len(features) != len(labels):
                    raise ValueError(
                        f"Row count mismatch in {name} data: "
                        f"{len(features)} feature rows vs {len(labels)} labels"
                    )

            # Fit the encoder on the union of both label sets so a class that
            # appears only in the original dataset does not make transform()
            # fail. When the original labels are a subset of the train labels
            # the resulting encoding is unchanged.
            le = LabelEncoder()
            le.fit(pd.concat([train_labels, original_labels], ignore_index=True))
            y_train = le.transform(train_labels)
            y_original = le.transform(original_labels)

            # Combine data; ignore_index avoids duplicate index labels in X.
            X = pd.concat([df_train, df_original], axis=0, ignore_index=True)
            y = np.concatenate([y_train, y_original])

            return X, y, le.classes_

        except Exception as e:
            raise RuntimeError(f"Data loading failed: {str(e)}") from e

    def _train_model(self, X: pd.DataFrame, y: np.ndarray, classes: List[str]):
        """Train and save one XGBoost booster per cross-validation fold.

        Args:
            X: Feature frame; object/category columns are integer-encoded on
                an internal copy, the caller's frame is left untouched.
            y: Integer-encoded labels aligned with the rows of ``X``.
            classes: Class labels; only their count is used (``num_class``).

        Raises:
            RuntimeError: If encoding, training or saving fails. The original
                exception is attached as the cause.
        """
        try:
            # Work on a copy so the caller's DataFrame is not mutated.
            X = X.copy()

            # Convert categorical columns to codes.
            # NOTE(review): the category -> code mapping is not persisted;
            # inference code must reproduce the same encoding.
            cat_cols = X.select_dtypes(include=['object', 'category']).columns
            for col in cat_cols:
                X[col] = X[col].astype('category').cat.codes

            params = {
                'objective': 'multi:softprob',
                'num_class': len(classes),
                'max_depth': self.config.max_depth,
                'learning_rate': self.config.learning_rate,
                'reg_alpha': self.config.reg_alpha,
                'reg_lambda': self.config.reg_lambda,
                'gamma': self.config.gamma,
                'subsample': self.config.subsample,
                'colsample_bytree': self.config.colsample_bytree,
                'min_child_weight': self.config.min_child_weight,
                'random_state': self.config.random_seed,
                'n_jobs': self.config.n_jobs,
                'tree_method': 'hist',
                'eval_metric': 'mlogloss',
                'enable_categorical': False  # We've already converted categories
            }

            skf = StratifiedKFold(n_splits=self.config.n_folds,
                                  shuffle=True,
                                  random_state=self.config.random_seed)

            for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
                print(f"\nTraining Fold {fold+1}")

                # The split yields positional indices, hence .iloc.
                X_train, y_train = X.iloc[train_idx], y[train_idx]
                X_val, y_val = X.iloc[val_idx], y[val_idx]

                # Create DMatrix (no need for enable_categorical=True since we converted)
                dtrain = xgb.DMatrix(X_train, label=y_train)
                dval = xgb.DMatrix(X_val, label=y_val)

                # "val" is listed last in evals; presumably this makes it the
                # set monitored for early stopping — TODO confirm.
                model = xgb.train(
                    params,
                    dtrain,
                    num_boost_round=self.config.num_boost_round,
                    early_stopping_rounds=self.config.early_stopping_rounds,
                    evals=[(dtrain, "train"), (dval, "val")],
                    verbose_eval=200
                )

                # File names use the 0-based fold index; the log line is 1-based.
                model_path = os.path.join(self.config.model_dir, f"xgb_fold{fold}.bin")
                model.save_model(model_path)
                print(f"Saved model for fold {fold+1} to {model_path}")

        except Exception as e:
            raise RuntimeError(f"Error during model training: {str(e)}") from e

    def train(self):
        """Execute the full training pipeline: load data, then train all folds.

        Raises:
            RuntimeError: If any step fails. The original exception is
                attached as the cause.
        """
        try:
            os.makedirs(self.config.model_dir, exist_ok=True)
            X, y, classes = self._load_data()
            print(f"Starting training with {len(classes)} fertilizer classes")
            self._train_model(X, y, classes)
            print("Training completed successfully!")
        except Exception as e:
            raise RuntimeError(f"Training pipeline failed: {str(e)}") from e

In [6]:
# Run the training stage end to end: load the YAML configuration, build the
# trainer settings, then train and save one model per fold.
# Errors propagate unchanged; the former `try/except ...: raise e` wrapper
# re-raised the same exception and only added a traceback frame.
config = ConfigurationManager()
trainer_config = config.get_model_trainer_config()
trainer = ModelTrainer(config=trainer_config)
trainer.train()


[2025-07-10 00:29:30,048: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-10 00:29:30,053: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-10 00:29:30,064: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-07-10 00:29:30,067: INFO: common: created directory at: artifacts]
[2025-07-10 00:29:30,069: INFO: common: created directory at: artifacts\model_trainer]
[2025-07-10 00:29:30,071: INFO: common: created directory at: artifacts\model_trainer\models]
Starting training with 7 fertilizer classes

Training Fold 1
[0]	train-mlogloss:1.94567	val-mlogloss:1.94571
[200]	train-mlogloss:1.92462	val-mlogloss:1.93156
[400]	train-mlogloss:1.91306	val-mlogloss:1.92611


In [8]:
import pandas as pd

# Print the column names found in each label CSV under
# artifacts/data_transformation, one heading per file.
label_files = {
    "Train labels columns:": "artifacts/data_transformation/train_labels.csv",
    "Original labels columns:": "artifacts/data_transformation/original_labels.csv",
}
for heading, csv_path in label_files.items():
    print(heading)
    label_columns = pd.read_csv(csv_path).columns
    print(label_columns.tolist())


Train labels columns:
['target']
Original labels columns:
['target']
