In [2]:
import os

In [3]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret\\research'

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    n_estimators: int
    target_column: str

In [7]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.XGBClassifier
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            n_estimators = params.n_estimators,
            target_column = schema.name
            
        )

        return model_trainer_config

In [21]:
import pandas as pd
import os
from mlProject import logger
from xgboost import XGBClassifier
import joblib
from mlProject.utils.common import read_yaml, create_directories, evaluate_clf

#from mlProject.utils.common import evaluate_clf

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from imblearn.pipeline import  make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold


from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier





In [39]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from datetime import datetime


def gridsearch(models, X_train, y_train, X_test, y_test, preprocessor, balancer, param_grids,feature_names, kf, scoring, save_models=False, output_directory=None):
    results = {}

    for model_name, model in models.items():
        pipe = make_pipeline(preprocessor, balancer, model)

        # Perform grid search
        grid_search = GridSearchCV(pipe, param_grid =param_grids[model_name], cv=kf, scoring=scoring, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Get the best model and its hyperparameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # Make predictions
        y_train_pred = best_model.predict(X_train)
        classifier_step_key = model_name.lower()
        print(best_model.named_steps.keys())
        # Apply ColumnTransformer to the test data
        X_test_transformed = best_model.named_steps['columntransformer'].transform(X_test)
        # Use the transformed test data to make predictions using the classifier step
        y_test_pred = best_model.named_steps[classifier_step_key].predict(X_test_transformed)
        #y_test_pred = best_model.named_steps['columntransformer',classifier_step_key].predict(X_test) # avoid data leakage and sampling on test set by only using classifier step 

        # Evaluate Train and Test dataset
        train_acc, train_f1 , train_precision, train_recall, train_roc_au = evaluate_clf(y_train, y_train_pred)
        test_acc, test_f1 , test_precision, test_recall, test_roc_au = evaluate_clf(y_test, y_test_pred)

        #store results in dictionary
        results[model_name] = {
            'Best Hyperparameters': best_params,
            'Best Score': best_score,
            'Train Accuracy': train_acc,
            'Train F1 Score': train_f1,
            'Train Precision': train_precision,
            'Train Recall': train_recall,
            'Train Roc Auc': train_roc_au,
            'Test Accuracy': test_acc,
            'Test F1 Score': test_f1,
            'Test Precision': test_precision,
            'Test Recall': test_recall,
            'Test Roc Auc': test_roc_au
        }

        # Save the best model to the specified directory if specified
        if save_models and output_directory:
            current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = f"{model_name.replace(' ', '_')}_{current_datetime}.joblib"
            model_filepath = os.path.join(output_directory, model_filename)
            joblib.dump(best_model, model_filepath)
            results[model_name]['Saved Model Filepath'] = model_filepath

                # For models that support feature importance or coefficients
        if hasattr(best_model.named_steps[classifier_step_key], 'feature_importances_'):
            results[model_name]['Feature Importances'] = dict(zip(feature_names, best_model.named_steps[classifier_step_key].feature_importances_))
        elif hasattr(best_model.named_steps[classifier_step_key], 'coef_'):
            results[model_name]['Coefficients'] = dict(zip(feature_names, best_model.named_steps[classifier_step_key].coef_))

    return results

In [40]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        self.preprocessor_pipe =self.preprocessor_pipe()

    
    def preprocessor_pipe(self):
        '''
        Get data transformation object for preprocessing.
        '''
        # Define numerical and categorical features
        train_data = pd.read_csv(self.config.train_data_path)
        neededColumns = train_data.drop('Incomplete Transaction', axis=1)
        num_features = neededColumns.select_dtypes(exclude="object").columns
        cat_features = neededColumns.select_dtypes(include="object").columns

        # Define a pipeline for processing numeric features
        numeric_processor = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy='mean')),
                ("scaler", StandardScaler())
            ]
        )

        # Define a pipeline for processing categorical features
        categorical_processor = Pipeline(
            steps=[
                ("Imputer", SimpleImputer(strategy='most_frequent')),
                ("onehot", OneHotEncoder(handle_unknown="ignore"))
            ]
        )

        logger.info(f"Categorical columns: {cat_features}")
        logger.info(f"Numerical columns: {num_features}")

        # Combine numeric and categorical processors
        preprocessor = ColumnTransformer(
            transformers=[
                ("numerical", numeric_processor, num_features),
                ("categorical", categorical_processor, cat_features)
            ]
        )

        return preprocessor


    def initiate_model_trainer(self):
        logger.info("Initiating model training")
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        X_train = train_data.drop([self.config.target_column], axis=1)
        X_test = test_data.drop([self.config.target_column], axis=1)
        y_train = train_data[self.config.target_column]
        y_test = test_data[self.config.target_column]


        models = {
            "RandomForestClassifier": RandomForestClassifier(),
            "DecisionTreeClassifier": DecisionTreeClassifier(),
            "GradientBoostingClassifier": GradientBoostingClassifier(),
            "LogisticRegression": LogisticRegression(),
            "KNeighborsClassifier": KNeighborsClassifier(),
            "XGBClassifier": XGBClassifier(), 
            "CatBoostClassifier": CatBoostClassifier(verbose=False),
            "AdaBoostClassifier": AdaBoostClassifier(),
            "SVC": SVC()
        } 

        param_grids = {
            "RandomForestClassifier": {
                "randomforestclassifier__n_estimators": [100],
                "randomforestclassifier__max_depth": [None],
                "randomforestclassifier__min_samples_split": [2],
                "randomforestclassifier__min_samples_leaf": [1],
                "randomforestclassifier__bootstrap": [True]
            },
            "DecisionTreeClassifier": {
                "decisiontreeclassifier__max_depth": [None],
                "decisiontreeclassifier__min_samples_split": [2],
                "decisiontreeclassifier__min_samples_leaf": [1]
            },
            "GradientBoostingClassifier": {
                "gradientboostingclassifier__n_estimators": [100],
                "gradientboostingclassifier__learning_rate": [0.1],
                "gradientboostingclassifier__max_depth": [3],
                "gradientboostingclassifier__min_samples_split": [2],
                "gradientboostingclassifier__min_samples_leaf": [1]
            },
            "LogisticRegression": {
                "logisticregression__C": [1],
                "logisticregression__penalty": ['l2']
            },
            "KNeighborsClassifier": {
                "kneighborsclassifier__n_neighbors": [5],
                "kneighborsclassifier__weights": ['uniform'],
                "kneighborsclassifier__metric": ['euclidean']
            },
            "XGBClassifier": {
                "xgbclassifier__n_estimators": [100],
                "xgbclassifier__max_depth": [3],
                "xgbclassifier__learning_rate": [0.1],
                "xgbclassifier__subsample": [0.8],
                "xgbclassifier__colsample_bytree": [0.8],
                "xgbclassifier__reg_alpha": [0.001],
                "xgbclassifier__reg_lambda": [0.001]
            },
            "CatBoostClassifier": {
                "catboostclassifier__iterations": [100],
                "catboostclassifier__learning_rate": [0.1],
                "catboostclassifier__depth": [6],
                "catboostclassifier__l2_leaf_reg": [3]
            },
            "AdaBoostClassifier": {
                "adaboostclassifier__n_estimators": [100],
                "adaboostclassifier__learning_rate": [0.1]
            },
            "SVC": {
                "svc__C": [1],
                "svc__kernel": ['rbf'],
                "svc__gamma": ['scale']
            }
        }
        
        preprocessor = self.preprocessor_pipe
        balancer = RandomUnderSampler(random_state=42)
        feature_names= X_train.columns
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        scoring ='f1'
        save_models=True
        output_directory =self.config.root_dir

        logger.info("Initiating grid search for models")
        results =gridsearch(models=models,param_grids= param_grids,X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,preprocessor = preprocessor,
                balancer = balancer,
                feature_names= feature_names,
                kf=kf,
                scoring =scoring, 
                save_models=save_models, 
                output_directory= output_directory)



        return results


In [42]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    results = model_trainer_config.initiate_model_trainer()
except Exception as e:
    raise e

[2024-02-23 10:21:32,574: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-23 10:21:32,576: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-23 10:21:32,579: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-23 10:21:32,580: INFO: common: created directory at: artifacts]
[2024-02-23 10:21:32,581: INFO: common: created directory at: artifacts/model_trainer]
[2024-02-23 10:21:32,595: INFO: 2643503123: Categorical columns: Index(['Gender', 'Region', 'Marital Status', 'Education'], dtype='object')]
[2024-02-23 10:21:32,596: INFO: 2643503123: Numerical columns: Index(['Total Items', 'Unique Items', 'Total Sales', 'Discounted Sales',
       'Browsing Duration (minutes)', 'Number of Clicks', 'Age',
       'Household Income', 'Loyalty Card', 'Loyalty Points',
       'Discount Percentage', 'Unique Items per Total Item', 'Month'],
      dtype='object')]
[2024-02-23 10:21:32,597: INFO: 2643503123: Initiating model training]
dict_keys(['c

In [None]:
#

# XGboost

In [1]:
import os
os.chdir("../")

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    n_estimators: int
    max_depth: int
    learning_rate: int
    random_state: int
    scale_pos_weight: int
    min_child_weight: int
    subsample: int
    target_column: str



In [3]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.XGBClassifier
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,
            n_estimators = params.n_estimators,
            max_depth= params.max_depth,
            learning_rate= params.learning_rate,
            random_state= params.random_state,
            scale_pos_weight= params.scale_pos_weight,
            min_child_weight= params.min_child_weight,
            subsample= params.subsample,
            target_column = schema.name
            
        )

        return model_trainer_config
    

In [5]:
import pandas as pd
import os
from mlProject import logger
from xgboost import XGBClassifier
import joblib
from mlProject.utils.common import read_yaml, create_directories, evaluate_clf

#from mlProject.utils.common import evaluate_clf

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from imblearn.pipeline import  make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold


# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from catboost import CatBoostClassifier





In [6]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config




    def initiate_model_trainer(self):
        logger.info("Initiating model training")
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        X_train = train_data.iloc[:, :-1]
        X_test = test_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]
        y_test = test_data.iloc[:, -1]



        xgb = XGBClassifier( n_estimators=self.config.n_estimators, max_depth=self.config.max_depth, 
                            learning_rate=self.config.learning_rate, random_state=self.config.random_state,
                            scale_pos_weight=self.config.scale_pos_weight, min_child_weight=self.config.min_child_weight, 
                            subsample=self.config.subsample)
        
        xgb.fit(X_train, y_train)

        joblib.dump(xgb, os.path.join(self.config.root_dir, self.config.model_name))


In [7]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.initiate_model_trainer()
except Exception as e:
    raise e

[2024-02-23 13:05:14,116: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-23 13:05:14,118: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-23 13:05:14,121: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-23 13:05:14,122: INFO: common: created directory at: artifacts]
[2024-02-23 13:05:14,123: INFO: common: created directory at: artifacts/model_trainer]
[2024-02-23 13:05:14,124: INFO: 3392366906: Initiating model training]


# Gradient boosting classifier

In [2]:
import os
os.chdir("../")

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    n_estimators: int
    max_depth: int
    learning_rate: int
    random_state: int
    min_samples_split: int
    subsample: int
    min_samples_leaf: int
    target_column: str


In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.GBMClassifier
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            model_name = config.model_name,

            n_estimators = params.n_estimators,
            max_depth= params.max_depth,
            learning_rate= params.learning_rate,
            random_state= params.random_state,
            subsample= params.subsample,
            min_samples_split= params.min_samples_split,
            min_samples_leaf= params.min_samples_leaf,

            target_column = schema.name
            
        )

        return model_trainer_config

In [13]:
import pandas as pd
import os
from mlProject import logger
from sklearn.ensemble import  GradientBoostingClassifier

import joblib
from mlProject.utils.common import read_yaml, create_directories, evaluate_clf

#from mlProject.utils.common import evaluate_clf

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from imblearn.pipeline import  make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold


In [14]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config




    def initiate_model_trainer(self):
        logger.info("Initiating model training")
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        X_train = train_data.iloc[:, :-1]
        X_test = test_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]
        y_test = test_data.iloc[:, -1]

        GBM = GradientBoostingClassifier(n_estimators=self.config.n_estimators, max_depth=self.config.max_depth, 
                            learning_rate=self.config.learning_rate, random_state=self.config.random_state,
                            subsample=self.config.subsample, min_samples_split=self.config.min_samples_split,
                            min_samples_leaf=self.config.min_samples_leaf)



        GBM.fit(X_train, y_train)

        joblib.dump(GBM, os.path.join(self.config.root_dir, self.config.model_name))

In [15]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.initiate_model_trainer()
except Exception as e:
    raise e

[2024-02-27 16:09:17,992: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-27 16:09:17,993: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-27 16:09:17,996: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-27 16:09:17,996: INFO: common: created directory at: artifacts]
[2024-02-27 16:09:17,998: INFO: common: created directory at: artifacts/model_trainer]
[2024-02-27 16:09:17,998: INFO: 3115623685: Initiating model training]
