In [64]:
import os

In [65]:
%pwd

'c:\\Users\\Lenovo\\Desktop'

In [46]:
# Step the working directory up one level. The later cells load YAML files by
# relative path, so presumably this is meant to land on the project root.
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level each time — confirm the resulting directory with %pwd before continuing.
os.chdir("../")


In [75]:
%pwd

'c:\\Users\\Lenovo\\Desktop'

In [76]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [77]:
from dataclasses import dataclass
from pathlib import Path


from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the GradientBoostingClassifier training step.

    The path and name fields say where the train/test CSV files are read from
    and where the fitted model is written. The numeric fields are forwarded
    unchanged to ``GradientBoostingClassifier`` by ``ModelTrainer.train``.

    Dataclass annotations are not enforced at runtime, so correcting them does
    not change what values the constructor accepts.
    """

    root_dir: Path            # directory the fitted model is saved into
    train_data_path: Path     # CSV file read as the training set
    test_data_path: Path      # CSV file read as the test set
    model_name: str           # file name of the saved model inside root_dir
    n_estimators: int         # number of boosting stages
    learning_rate: float      # step-size shrinkage (a fraction, was annotated int)
    max_depth: int            # maximum depth of each tree (was annotated str)
    min_samples_split: int    # minimum samples required to split an internal node
    min_samples_leaf: int     # minimum samples required at a leaf node
    subsample: float          # fraction of samples per base learner (was annotated int)
    random_state: int         # seed forwarded to the classifier
    target_column: str        # name of the label column


In [78]:
from stb_pfe_mlflow.constants import *
from stb_pfe_mlflow.utils.common import read_yaml, create_directories

In [79]:
class ConfigurationManager:
    """Load the project's YAML settings and build typed config objects from them.

    The three files (config, params, schema) are read once in ``__init__`` with
    ``read_yaml`` and kept on the instance. The loaded objects are accessed by
    attribute throughout this class.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the model hyper-parameter YAML.
            schema_filepath: Location of the data schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the gradient boosting model.

        Paths and the model name come from the ``model_trainer`` section of the
        config file, hyper-parameters from the ``GradientBoostingClassifier``
        section of the params file, and the target column name from the
        ``TARGET_COLUMN`` entry of the schema file. The trainer's ``root_dir``
        is created before the config object is built.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        hyperparams = self.params.GradientBoostingClassifier
        target_entry = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        # Hyper-parameter names are identical on the params section and on the
        # dataclass, so they are copied over by name.
        hyperparam_names = (
            "n_estimators",
            "learning_rate",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "subsample",
            "random_state",
        )

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            **{name: getattr(hyperparams, name) for name in hyperparam_names},
            target_column=target_entry.name,
        )

In [80]:
import pandas as pd
import os
from stb_pfe_mlflow import logger
from sklearn.linear_model import ElasticNet
import joblib

In [83]:
import pandas as pd
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

class ModelTrainer:
    """Fit a GradientBoostingClassifier on the configured training CSV and save it.

    The target column is taken from ``config.target_column`` rather than a
    hard-coded name, so the schema file stays the single source of truth.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths, model file name, hyper-parameters and target column.
        """
        self.config = config

    def train(self):
        """Preprocess the data, fit the classifier and write it to disk.

        Steps:
            1. Read the train and test CSV files.
            2. Drop the ``tiers_key`` column and the target column from the features.
            3. Label-encode every object-typed feature column, fitting each
               encoder on train and test rows together so both share one coding.
            4. Label-encode the target and standardize the training features.
            5. Fit the classifier and dump it with joblib to
               ``<root_dir>/<model_name>``.

        Returns:
            None. The fitted model is written to disk and a message is printed.
        """
        target = self.config.target_column

        # Load the data
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # 'tiers_key' and the target are not features.
        non_feature_columns = ["tiers_key", target]
        train_x = train_data.drop(columns=non_feature_columns)
        test_x = test_data.drop(columns=non_feature_columns)
        train_y = train_data[target]

        # The test rows are only used here, to give the categorical encoders
        # the same set of categories the original combined encoding saw.
        combined = pd.concat([train_x, test_x], axis=0)
        for col in combined.select_dtypes(include=['object']).columns:
            combined[col] = LabelEncoder().fit_transform(combined[col].astype(str))

        # The training rows come first in the concatenation.
        train_x_encoded = combined.iloc[:len(train_x)]

        # NOTE(review): the encoders and the scaler are not persisted, so any
        # consumer of the saved model has to rebuild the same preprocessing.
        train_y_encoded = LabelEncoder().fit_transform(train_y)
        train_x_scaled = StandardScaler().fit_transform(train_x_encoded)

        gbs = GradientBoostingClassifier(
            n_estimators=self.config.n_estimators,            # number of boosting stages
            learning_rate=self.config.learning_rate,          # step size shrinkage
            max_depth=self.config.max_depth,                  # maximum depth of each tree
            min_samples_split=self.config.min_samples_split,  # min samples to split a node
            min_samples_leaf=self.config.min_samples_leaf,    # min samples at a leaf
            subsample=self.config.subsample,                  # fraction of samples per learner
            random_state=self.config.random_state,
        )
        gbs.fit(train_x_scaled, train_y_encoded)

        # Save the model
        joblib.dump(gbs, os.path.join(self.config.root_dir, self.config.model_name))

        print("Model training complete and saved.")

# `ModelTrainerConfig` must already be defined (run its dataclass cell first) before this class is defined or used.


In [84]:
# Build the trainer configuration from the YAML files, then fit and save the model.
# No try/except wrapper: re-raising the same exception added nothing, and letting
# it propagate shows the same error in the cell output.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Keep the trainer under its own name instead of rebinding the config variable.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

FileNotFoundError: [Errno 2] No such file or directory: 'config\\config.yaml'

In [11]:
from dataclasses import dataclass
from pathlib import Path


from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only bundle of settings for the Gaussian Naive Bayes training step.

    Field order is part of the constructor signature and is kept as is.
    """

    # Where data is read from and where the fitted model is written.
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str

    # Values forwarded to GaussianNB(priors=..., var_smoothing=...).
    priors: List[float]
    var_smoothing: float

    # Name of the label column in the CSV files.
    target_column: str


In [12]:
from stb_pfe_mlflow.constants import *
from stb_pfe_mlflow.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    """Load the project's YAML settings and build typed config objects from them.

    The three files (config, params, schema) are read once in ``__init__`` with
    ``read_yaml`` and kept on the instance. The loaded objects are accessed by
    attribute throughout this class.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the model hyper-parameter YAML.
            schema_filepath: Location of the data schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the Gaussian Naive Bayes model.

        Paths and the model name come from the ``model_trainer`` section of the
        config file, ``priors`` and ``var_smoothing`` from the ``GaussianNB``
        section of the params file, and the target column name from the
        ``TARGET_COLUMN`` entry of the schema file. The trainer's ``root_dir``
        is created before the config object is built.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        nb_params = self.params.GaussianNB
        target_entry = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            priors=nb_params.priors,
            var_smoothing=nb_params.var_smoothing,
            target_column=target_entry.name,
        )

In [14]:
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [15]:
import pandas as pd
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier


class ModelTrainer:
    """Fit a Gaussian Naive Bayes classifier on the configured training CSV and save it.

    Only the training file is needed to fit the model. The earlier version
    also read, encoded and scaled the test file but never used the results,
    so that dead work has been removed.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths, model file name, GaussianNB parameters and target column.
        """
        self.config = config

    def train(self):
        """Preprocess the training data, fit the classifier and write it to disk.

        Steps:
            1. Read the training CSV and split it into features and target
               using ``config.target_column``.
            2. Label-encode the target and standardize the features.
            3. Fit ``GaussianNB`` with the configured ``priors`` and
               ``var_smoothing``.
            4. Dump the fitted model with joblib to ``<root_dir>/<model_name>``.

        Returns:
            None. The fitted model is written to disk and a message is printed.
        """
        target = self.config.target_column

        # Load the data and separate features from the target
        train_data = pd.read_csv(self.config.train_data_path)
        train_x = train_data.drop([target], axis=1)
        train_y = train_data[target]

        # NOTE(review): the label encoder and the scaler are not persisted, so
        # any consumer of the saved model has to rebuild the same preprocessing.
        train_y_encoded = LabelEncoder().fit_transform(train_y)
        train_x_scaled = StandardScaler().fit_transform(train_x)

        # Initialize and train the Gaussian Naive Bayes model with the configured parameters
        gnb = GaussianNB(
            priors=self.config.priors,
            var_smoothing=self.config.var_smoothing,
        )
        gnb.fit(train_x_scaled, train_y_encoded)

        # Save the model
        joblib.dump(gnb, os.path.join(self.config.root_dir, self.config.model_name))

        print("Model training complete and saved.")

# `ModelTrainerConfig` must already be defined (run its dataclass cell first) before this class is defined or used.


In [16]:
# Build the trainer configuration from the YAML files, then fit and save the model.
# No try/except wrapper: re-raising the same exception added nothing, and letting
# it propagate shows the same error in the cell output.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Keep the trainer under its own name instead of rebinding the config variable.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-09-24 11:21:51,078: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-24 11:21:51,078: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-24 11:21:51,078: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-24 11:21:51,078: INFO: common: created directory at: artifacts]
[2024-09-24 11:21:51,087: INFO: common: created directory at: artifacts/model_trainer]
Model training complete and saved.
