In [1]:
import os

In [2]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret\\research'

In [3]:
# Move from the notebook folder up to the project root, but only when the
# root's config file is not already reachable from the current directory.
# An unconditional chdir climbs one more level every time the cell is re-run.
if not os.path.exists(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory the stage writes its train/test CSV files into.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path


In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Read in the same order as the attributes are exposed: config, params, schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage's output directory and return its configuration.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [8]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import joblib

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

class DataTransformation:
    """Feature-engineer the raw CSV and write stratified train/test splits.

    The CSV at ``config.data_path`` is read and transformed once, when the
    object is constructed; the engineered frame is kept on ``self.data``.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store ``config`` and eagerly build the engineered frame.

        Args:
            config: Object exposing ``data_path`` (input CSV) and ``root_dir``
                (directory the split CSV files are written to).
        """
        self.config = config
        self.data = self.feature_eng_data_transform()

    def feature_eng_data_transform(self) -> pd.DataFrame:
        """Read the input CSV and derive the model features.

        Adds 'Discount Percentage', 'Unique Items per Total Item' and 'Month',
        drops 'Customer ID', 'Transaction ID' and 'Date', and casts 'Month' and
        'Loyalty Card' to strings.

        Returns:
            The engineered DataFrame (the target column is still included).
        """
        data = pd.read_csv(self.config.data_path)

        # A zero denominator would yield +/-inf, which breaks downstream
        # scaling and modelling. Mask zeros so those rows become NaN instead.
        total_sales = data['Total Sales'].where(data['Total Sales'] != 0)
        total_items = data['Total Items'].where(data['Total Items'] != 0)

        data['Discount Percentage'] = ((total_sales - data['Discounted Sales']) / total_sales) * 100
        data['Unique Items per Total Item'] = data['Unique Items'] / total_items
        data['Month'] = pd.to_datetime(data['Date']).dt.month
        logger.info("New feature created")

        # Non-mutating drop: avoids in-place edits of a frame another name may hold.
        data = data.drop(columns=['Customer ID', 'Transaction ID', 'Date'])
        logger.info("Useless columns were dropped")

        # NOTE(review): astype(str) may turn missing values into the literal
        # string 'nan' - confirm these two columns contain no nulls.
        data['Month'] = data['Month'].astype(str)
        data['Loyalty Card'] = data['Loyalty Card'].astype(str)
        logger.info("Data types of 'Month' and 'Loyalty Card' were changed to string")
        return data

    def train_test_splitting(self, test_size=0.15, random_state=42):
        """Split ``self.data`` (stratified on the target) and save both parts.

        Writes ``train_df.csv`` and ``test_df.csv`` into ``config.root_dir``.

        Args:
            test_size: Fraction of rows held out for the test set.
            random_state: Seed passed to ``train_test_split``.
        """
        target = 'Incomplete Transaction'
        X = self.data.drop(target, axis=1)
        y = self.data[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=test_size, random_state=random_state
        )

        # Re-attach the target so each CSV is self-contained.
        train_df = pd.concat([X_train, y_train], axis=1)
        test_df = pd.concat([X_test, y_test], axis=1)

        train_df.to_csv(os.path.join(self.config.root_dir, "train_df.csv"), index=False)
        test_df.to_csv(os.path.join(self.config.root_dir, "test_df.csv"), index=False)

        logger.info("data into training and test sets")
        logger.info(X_train.shape)
        logger.info(y_train.shape)

        print(X_train.shape)
        print(y_train.shape)


In [10]:
# Run the stage end to end. Exceptions are left to propagate on their own:
# catching one only to `raise e` changes nothing except adding a traceback frame.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_splitting()

[2024-02-23 08:23:39,384: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-23 08:23:39,386: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-23 08:23:39,388: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-23 08:23:39,389: INFO: common: created directory at: artifacts]
[2024-02-23 08:23:39,391: INFO: common: created directory at: artifacts/data_transformation]
[2024-02-23 08:23:39,407: INFO: 2668210616: New feature created]
[2024-02-23 08:23:39,411: INFO: 2668210616: Useless columns were dropped]
[2024-02-23 08:23:39,415: INFO: 2668210616: Data types of 'Month' and 'Loyalty Card' were changed to string]
[2024-02-23 08:23:39,446: INFO: 2668210616: data into training and test sets (scalled and imputed)]
[2024-02-23 08:23:39,447: INFO: 2668210616: (4250, 17)]
[2024-02-23 08:23:39,448: INFO: 2668210616: (4250,)]
(4250, 17)
(4250,)


# XGBoost data transformation

In [7]:
import os
# Move up to the project root only when its config file is not already
# reachable from the current directory. An unconditional chdir climbs one more
# level every time the cell is re-run.
if not os.path.exists(os.path.join("config", "config.yaml")):
    os.chdir("../")


In [8]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory the stage writes its train/test CSV files into.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path
    # File the fitted preprocessing object is dumped to.
    preprocessor_obj_file_path: Path

In [9]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [10]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Read in the same order as the attributes are exposed: config, params, schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage's output directory and return its configuration.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main configuration,
            including the preprocessor output path.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            preprocessor_obj_file_path=section.preprocessor_obj_file_path,
        )

In [11]:

import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import joblib


from imblearn.pipeline import  make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

class DataTransformation:
    """Feature-engineer, split, preprocess and balance the data set.

    The CSV at ``config.data_path`` is read and transformed once, when the
    object is constructed; the engineered frame is kept on ``self.data``.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store ``config`` and eagerly build the engineered frame.

        Args:
            config: Object exposing ``data_path`` (input CSV), ``root_dir``
                (output directory) and ``preprocessor_obj_file_path``
                (where the fitted preprocessor is dumped).
        """
        self.config = config
        self.data = self.feature_eng_data_transform()

    def feature_eng_data_transform(self) -> pd.DataFrame:
        """Read the input CSV and derive the model features.

        Adds 'Discount Percentage', 'Unique Items per Total Item' and 'Month',
        drops 'Customer ID', 'Transaction ID' and 'Date', and casts 'Month' and
        'Loyalty Card' to strings.

        Returns:
            The engineered DataFrame (the target column is still included).
        """
        data = pd.read_csv(self.config.data_path)

        # A zero denominator would yield +/-inf, which the imputer/scaler
        # cannot process. Mask zeros so those rows become NaN and get imputed.
        total_sales = data['Total Sales'].where(data['Total Sales'] != 0)
        total_items = data['Total Items'].where(data['Total Items'] != 0)

        data['Discount Percentage'] = ((total_sales - data['Discounted Sales']) / total_sales) * 100
        data['Unique Items per Total Item'] = data['Unique Items'] / total_items
        data['Month'] = pd.to_datetime(data['Date']).dt.month
        logger.info("New feature created")

        # Non-mutating drop: avoids in-place edits of a frame another name may hold.
        data = data.drop(columns=['Customer ID', 'Transaction ID', 'Date'])
        logger.info("Useless columns were dropped")

        # NOTE(review): astype(str) may turn missing values into the literal
        # string 'nan', which the 'most_frequent' imputer would not treat as
        # missing - confirm these two columns contain no nulls.
        data['Month'] = data['Month'].astype(str)
        data['Loyalty Card'] = data['Loyalty Card'].astype(str)
        logger.info("Data types of 'Month' and 'Loyalty Card' were changed to string")
        return data

    def train_test_splitting(self, test_size=0.15, random_state=42):
        """Split ``self.data`` into train/test parts, stratified on the target.

        Args:
            test_size: Fraction of rows held out for the test set.
            random_state: Seed passed to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        target = 'Incomplete Transaction'
        X = self.data.drop(target, axis=1)
        y = self.data[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    def get_data_transformer_object(self):
        '''
        Build the (unfitted) preprocessing ColumnTransformer.

        Non-object columns are mean-imputed and standardised; object columns
        are imputed with the most frequent value and one-hot encoded.
        '''

        # Column groups are taken from the dtypes of the engineered features.
        X = self.data.drop('Incomplete Transaction', axis=1)
        num_features = X.select_dtypes(exclude="object").columns
        cat_features = X.select_dtypes(include="object").columns

        # Numeric branch: impute, then scale.
        numeric_processor = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy='mean')),
                ("scaler", StandardScaler())
            ]
        )

        # Categorical branch: impute, then encode. Categories unseen during
        # fit are ignored at transform time.
        categorical_processor = Pipeline(
            steps=[
                ("Imputer", SimpleImputer(strategy='most_frequent')),
                ("onehot", OneHotEncoder(handle_unknown="ignore"))
            ]
        )

        logger.info(f"Categorical columns: {cat_features}")
        logger.info(f"Numerical columns: {num_features}")

        preprocessor = ColumnTransformer(
            transformers=[
                ("numerical", numeric_processor, num_features),
                ("categorical", categorical_processor, cat_features)
            ]
        )

        return preprocessor

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a dense array if it is sparse, else unchanged."""
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix

    def initiate_data_transformation(self):
        """Fit the preprocessor, balance the train set and persist everything.

        Writes ``train_df.csv`` and ``test_df.csv`` (features followed by the
        target as the last column) into ``config.root_dir`` and dumps the
        fitted preprocessor to ``config.preprocessor_obj_file_path``.
        """
        preprocessing_obj = self.get_data_transformer_object()
        X_train, X_test, y_train, y_test = self.train_test_splitting()

        # Fit on the training split only; the test split is just transformed.
        X_train = preprocessing_obj.fit_transform(X_train)
        X_test = preprocessing_obj.transform(X_test)

        # Balance the train dataset only; the test set keeps its distribution.
        balancer = RandomOverSampler(random_state=42)
        X_train, y_train = balancer.fit_resample(X_train, y_train)
        logger.info("Train dataset balanced")

        # ColumnTransformer may return a scipy sparse matrix when the one-hot
        # output is sparse enough; np.c_ cannot column-stack that, so densify.
        X_train = self._to_dense(X_train)
        X_test = self._to_dense(X_test)

        # Combine input features with target feature (target is the last column).
        train_arr = np.c_[X_train, y_train]
        test_arr = np.c_[X_test, y_test]

        train_df = pd.DataFrame(train_arr)
        test_df = pd.DataFrame(test_arr)

        train_df.to_csv(os.path.join(self.config.root_dir, "train_df.csv"), index=False)
        test_df.to_csv(os.path.join(self.config.root_dir, "test_df.csv"), index=False)

        logger.info("data into training and test sets (scalled and imputed)")
        logger.info(X_train.shape)
        logger.info(y_train.shape)

        print(X_train.shape)
        print(y_train.shape)

        joblib.dump(preprocessing_obj, self.config.preprocessor_obj_file_path)

In [13]:
# Run the stage end to end. Exceptions are left to propagate on their own:
# catching one only to `raise e` changes nothing except adding a traceback frame.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.initiate_data_transformation()

[2024-02-23 13:23:50,066: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-23 13:23:50,068: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-23 13:23:50,071: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-23 13:23:50,073: INFO: common: created directory at: artifacts]
[2024-02-23 13:23:50,075: INFO: common: created directory at: artifacts/data_transformation]
[2024-02-23 13:23:50,092: INFO: 986554103: New feature created]
[2024-02-23 13:23:50,092: INFO: 986554103: Useless columns were dropped]
[2024-02-23 13:23:50,097: INFO: 986554103: Data types of 'Month' and 'Loyalty Card' were changed to string]
[2024-02-23 13:23:50,099: INFO: 986554103: Categorical columns: Index(['Gender', 'Region', 'Marital Status', 'Education', 'Loyalty Card',
       'Month'],
      dtype='object')]
[2024-02-23 13:23:50,099: INFO: 986554103: Numerical columns: Index(['Total Items', 'Unique Items', 'Total Sales', 'Discounted Sales',
       'Browsing 