In [1]:
import os

In [2]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret\\research'

In [3]:
# Step up to the project root only while the kernel is still inside the
# notebook folder ("research"); re-running this cell then no longer climbs one
# directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret'

# XGBoost data transformation

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Immutable bundle of the paths used by the data transformation stage.

    Attributes:
    root_dir (Path): Directory that receives the transformed train/test CSV files.
    data_path (Path): Location of the CSV file that is read and transformed.
    preprocessor_obj_file_path (Path): File the fitted preprocessor is dumped to.
    """

    root_dir: Path
    data_path: Path
    preprocessor_obj_file_path: Path


In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """
    Loads the project's YAML settings and builds stage-specific config objects.

    Attributes:
    config: Parsed content of the main configuration file.
    params: Parsed content of the parameters file.
    schema: Parsed content of the schema file.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """
        Read the three YAML files and make sure the artifacts root exists.

        Args:
        config_filepath: Path of the main configuration YAML.
        params_filepath: Path of the parameters YAML.
        schema_filepath: Path of the schema YAML.
        """
        # Files are read in the same order as before: config, params, schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration object for the data transformation stage.

        The stage's root directory is created as a side effect.

        Returns:
        DataTransformationConfig: Paths taken from the ``data_transformation``
        section of the main configuration.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
            preprocessor_obj_file_path=stage.preprocessor_obj_file_path,
        )

In [8]:

import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import joblib

from imblearn.pipeline import  make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import joblib

class DataTransformation:
    """
    Class for data transformation tasks: feature engineering, train/test
    splitting, preprocessing, class balancing and persistence of the results.

    Attributes:
    config (DataTransformationConfig): The configuration for data transformation.
    data (pandas.DataFrame): Feature-engineered data, loaded when the object is created.
    """

    # Name of the label column separated from the features.
    _TARGET_COLUMN = 'Incomplete Transaction'

    def __init__(self, config: "DataTransformationConfig"):
        """
        Initialize DataTransformation with a configuration object.

        The CSV at ``config.data_path`` is read and feature-engineered right
        away, so construction performs file I/O.

        Args:
        config (DataTransformationConfig): The configuration for data transformation.
        """
        self.config = config
        self.data = self.feature_eng_data_transform()  # Call feature_eng_data_transform upon initialization

    def feature_eng_data_transform(self):
        """
        Perform feature engineering on the data.

        Adds 'Discount Percentage', 'Unique Items per Total Item' and 'Month',
        drops the identifier and date columns, and casts 'Month' and
        'Loyalty Card' to strings so they are treated as categorical features.

        Returns:
        pandas.DataFrame: The transformed data.
        """
        data = pd.read_csv(self.config.data_path)

        data['Discount Percentage'] = ((data['Total Sales'] - data['Discounted Sales']) / data['Total Sales']) * 100
        data['Unique Items per Total Item'] = data['Unique Items'] / data['Total Items']
        data['Month'] = pd.to_datetime(data['Date']).dt.month

        # A zero denominator yields +/-inf, which the downstream imputer and
        # scaler reject. Mark those rows as missing so the mean imputer fills them.
        ratio_columns = ['Discount Percentage', 'Unique Items per Total Item']
        data[ratio_columns] = data[ratio_columns].replace([np.inf, -np.inf], np.nan)

        # Identifiers and the raw date carry no signal once 'Month' is extracted.
        data = data.drop(columns=['Customer ID', 'Transaction ID', 'Date'])

        # NOTE(review): depending on the pandas version, astype(str) may turn
        # missing values into the literal 'nan', which the categorical imputer
        # would then not treat as missing -- confirm the source has no gaps here.
        data['Month'] = data['Month'].astype(str)
        data['Loyalty Card'] = data['Loyalty Card'].astype(str)

        return data

    def train_test_splitting(self, test_size=0.15, random_state=42):
        """
        Split the data into stratified training and testing sets.

        Args:
        test_size (float): Fraction of rows held out for testing.
        random_state (int): Seed that makes the split reproducible.

        Returns:
        tuple: X_train, X_test, y_train, y_test
        """
        X = self.data.drop(self._TARGET_COLUMN, axis=1)
        y = self.data[self._TARGET_COLUMN]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    def get_data_transformer_object(self):
        """
        Get data transformation object for preprocessing.

        Numeric columns are mean-imputed and standardized; object columns are
        imputed with the most frequent value and one-hot encoded.

        Returns:
        ColumnTransformer: Unfitted preprocessor object.
        """
        X = self.data.drop(self._TARGET_COLUMN, axis=1)
        num_features = X.select_dtypes(exclude="object").columns
        cat_features = X.select_dtypes(include="object").columns

        numeric_processor = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy='mean')),
                ("scaler", StandardScaler())
            ]
        )

        categorical_processor = Pipeline(
            steps=[
                ("Imputer", SimpleImputer(strategy='most_frequent')),
                ("onehot", OneHotEncoder(handle_unknown="ignore"))
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("numerical", numeric_processor, num_features),
                ("categorical", categorical_processor, cat_features)
            ],
            # Always return a dense array: the result is concatenated with the
            # target through np.c_, which cannot handle SciPy sparse matrices.
            sparse_threshold=0,
        )

        return preprocessor

    def initiate_data_transformation(self):
        """
        Perform data transformation and save the preprocessed data.

        Writes ``train_df.csv`` and ``test_df.csv`` (target in the last column)
        into ``config.root_dir`` and dumps the fitted preprocessor to
        ``config.preprocessor_obj_file_path``.
        """
        preprocessing_obj = self.get_data_transformer_object()
        X_train, X_test, y_train, y_test = self.train_test_splitting()

        # Fit on the training split only; the test split is merely transformed.
        X_train = preprocessing_obj.fit_transform(X_train)
        X_test = preprocessing_obj.transform(X_test)

        # Only the training split is rebalanced; the test split keeps its
        # original class distribution.
        balancer = RandomOverSampler(random_state=42)
        X_train, y_train = balancer.fit_resample(X_train, y_train)

        train_df = pd.DataFrame(np.c_[X_train, y_train])
        test_df = pd.DataFrame(np.c_[X_test, y_test])

        os.makedirs(self.config.root_dir, exist_ok=True)
        train_df.to_csv(os.path.join(self.config.root_dir, "train_df.csv"), index=False)
        test_df.to_csv(os.path.join(self.config.root_dir, "test_df.csv"), index=False)

        # Make sure the target folder exists before dumping the preprocessor.
        preprocessor_path = os.fspath(self.config.preprocessor_obj_file_path)
        preprocessor_dir = os.path.dirname(preprocessor_path)
        if preprocessor_dir:
            os.makedirs(preprocessor_dir, exist_ok=True)
        joblib.dump(preprocessing_obj, preprocessor_path)


In [10]:
try:
    # Initialize ConfigurationManager and retrieve data transformation configuration
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()

    # Initialize DataTransformation with the retrieved configuration
    data_transformation = DataTransformation(config=data_transformation_config)

    # Perform data transformation
    data_transformation.initiate_data_transformation()

except Exception:
    # Record the failure with its traceback in the project log, then re-raise
    # with a bare `raise` so the original traceback is left untouched.
    logger.exception("Data transformation stage failed")
    raise


[2024-02-28 11:20:16,343: INFO: common: YAML file loaded successfully from: config\config.yaml]
[2024-02-28 11:20:16,344: INFO: common: YAML file loaded successfully from: params.yaml]
[2024-02-28 11:20:16,347: INFO: common: YAML file loaded successfully from: schema.yaml]
[2024-02-28 11:20:16,348: INFO: common: Created directory at: artifacts]
[2024-02-28 11:20:16,350: INFO: common: Created directory at: artifacts/data_transformation]
