In [1]:
import os

In [2]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Regression-Project\\research'

In [3]:
# Move from the notebook's research/ folder up to the project root so that the
# relative config and artifact paths used below resolve.
# Guarded on the directory name: an unconditional relative chdir climbs one more
# level every time the cell is re-run, silently breaking all later relative paths.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'C:\\Users\\iheba\\IdeaProjects\\Mlops-Regression-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the config file plus the schema.
    The path fields are annotated as ``Path``, but the values are passed through
    unchanged from the loaded YAML (presumably plain strings -- TODO confirm).
    """

    # Directory for this stage's outputs; created by ConfigurationManager.
    root_dir: Path
    # CSV holding the full dataset that gets split into train/test.
    data_path: Path
    # Where the raw train / test splits are written and later re-read.
    data_train: Path
    data_test: Path
    # Where the preprocessed train / test arrays are written as CSV.
    transformed_data_train: Path
    transformed_data_test: Path
    # Destination handed to save_object_pkl for the fitted preprocessor.
    preprocessor_obj_file_path: Path
    # Column names routed to the numerical / categorical sub-pipelines.
    numerical_columns: list
    categorical_columns: list
    # Name of the column to predict (taken from schema.TARGET_COLUMN.name).
    target_column: str

In [6]:
from RegressionProject.constants import *
from RegressionProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Constructing an instance reads the config, params and schema files with
    ``read_yaml`` and calls ``create_directories`` for the configured
    ``artifacts_root``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the settings for the data-transformation stage.

        Calls ``create_directories`` for the stage's ``root_dir`` as a side effect.

        Returns:
            DataTransformationConfig: values copied from the
            ``data_transformation`` config section, with ``target_column``
            taken from ``schema.TARGET_COLUMN.name``.
        """
        section = self.config.data_transformation
        schema = self.schema

        create_directories([section.root_dir])

        # Entries copied one-to-one from the config section, in field order.
        copied_fields = (
            "root_dir",
            "data_path",
            "data_train",
            "data_test",
            "transformed_data_train",
            "transformed_data_test",
            "preprocessor_obj_file_path",
            "numerical_columns",
            "categorical_columns",
        )
        section_values = {field: getattr(section, field) for field in copied_fields}

        return DataTransformationConfig(
            **section_values,
            target_column=schema.TARGET_COLUMN.name,
        )

In [8]:
from RegressionProject.logging import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from RegressionProject.utils.common import save_object_pkl


In [9]:
class DataTransformation:
    """Split the raw dataset, fit a preprocessing transformer and persist the results.

    Every file location, the feature column lists and the target column name
    are read from the ``DataTransformationConfig`` given to the constructor.
    """

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Paths and column names used by every method of this class.
        """
        self.config = config

    @staticmethod
    def _to_dense(features):
        """Return ``features`` as a dense ndarray, densifying scipy sparse output."""
        if hasattr(features, "toarray"):
            return features.toarray()
        return np.asarray(features)

    def get_data_transformer_object(self):
        """Build the unfitted preprocessing ``ColumnTransformer``.

        Numerical columns get median imputation followed by standard scaling.
        Categorical columns get most-frequent imputation, one-hot encoding and
        scaling without centering (``with_mean=False``).

        Returns:
            ColumnTransformer: transformer over ``config.numerical_columns``
            and ``config.categorical_columns``.
        """
        numerical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler())
            ]
        )

        # NOTE(review): OneHotEncoder() uses its default handling of unknown
        # categories, so a category present only in the test split would fail
        # at transform time -- confirm whether handle_unknown="ignore" is wanted.
        categorical_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("one_hot_encoder", OneHotEncoder()),
                ("scaler", StandardScaler(with_mean=False))
            ]
        )

        logger.info(f"Categorical columns: {self.config.categorical_columns}")
        logger.info(f"Numerical columns: {self.config.numerical_columns}")

        return ColumnTransformer(
            [
                ("Numerical_pipeline", numerical_pipeline, self.config.numerical_columns),
                ("Categorical_pipeline", categorical_pipeline, self.config.categorical_columns)
            ]
        )

    def train_test_splitting(self, test_size=0.25, random_state=42):
        """Split the dataset at ``config.data_path`` and write both parts as CSV.

        Args:
            test_size: Fraction (or count) of rows put in the test split;
                forwarded to ``train_test_split``.
            random_state: Seed forwarded to ``train_test_split`` so the split
                is reproducible.
        """
        data = pd.read_csv(self.config.data_path)
        train, test = train_test_split(data, test_size=test_size, random_state=random_state)
        train.to_csv(self.config.data_train, index=False)
        test.to_csv(self.config.data_test, index=False)
        logger.info("Split data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

    def load_data(self):
        """Read the train and test CSV files written by ``train_test_splitting``.

        Returns:
            tuple: ``(train_df, test_df)`` as pandas DataFrames.
        """
        train_df = pd.read_csv(self.config.data_train)
        test_df = pd.read_csv(self.config.data_test)
        logger.info("Read train and test data is completed")
        return train_df, test_df

    def separate_features_and_target(self, test_df, train_df, target_column_name):
        """Split each frame into its feature columns and its target column.

        Note the parameter order: the test frame comes first, then the train
        frame. Prefer keyword arguments when calling.

        Args:
            test_df: Test split including the target column.
            train_df: Train split including the target column.
            target_column_name: Name of the column to separate out.

        Returns:
            tuple: ``(train_features, train_target, test_features, test_target)``.
        """
        input_feature_train_df = train_df.drop(columns=[target_column_name])
        target_feature_train_df = train_df[target_column_name]
        logger.info("Separating X and Y for train data is completed")

        input_feature_test_df = test_df.drop(columns=[target_column_name])
        target_feature_test_df = test_df[target_column_name]
        logger.info("Separating X and Y for test data is completed")
        return input_feature_train_df, target_feature_train_df, input_feature_test_df, target_feature_test_df

    def apply_preprocessing(self, preprocessing_obj, input_feature_train_df, input_feature_test_df,target_feature_train_df,target_feature_test_df):
        """Fit the preprocessor on the train features and transform both splits.

        The preprocessor is fitted on the training features only and then
        applied to the test features. The target is appended as the last column
        of each returned array.

        Args:
            preprocessing_obj: Unfitted transformer exposing ``fit_transform``
                and ``transform``; it is fitted in place.
            input_feature_train_df: Training features.
            input_feature_test_df: Test features.
            target_feature_train_df: Training target values.
            target_feature_test_df: Test target values.

        Returns:
            tuple: ``(train_arr, test_arr)`` as 2-D NumPy arrays.
        """
        # A ColumnTransformer containing a one-hot encoder may return a scipy
        # sparse matrix; np.c_ cannot stack that with a dense target column,
        # so densify first.
        input_feature_train_arr = self._to_dense(preprocessing_obj.fit_transform(input_feature_train_df))
        input_feature_test_arr = self._to_dense(preprocessing_obj.transform(input_feature_test_df))
        logger.info("Preprocessing object is applied on training and testing data.")
        train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
        test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]
        return train_arr, test_arr

    def save_transformed_data(self, train_arr, test_arr):
        """Write the transformed arrays to the configured CSV files.

        The arrays are wrapped in DataFrames without column names, so the CSV
        headers are the default positional labels.

        Args:
            train_arr: Transformed training array (features plus target).
            test_arr: Transformed test array (features plus target).
        """
        train_transformed_df = pd.DataFrame(train_arr)
        test_transformed_df = pd.DataFrame(test_arr)
        train_transformed_df.to_csv(self.config.transformed_data_train, index=False)
        test_transformed_df.to_csv(self.config.transformed_data_test, index=False)
        logger.info("Saved transformed train and test data as CSV files.")

    def save_preprocessor(self, preprocessor):
        """Persist the fitted preprocessor to ``config.preprocessor_obj_file_path``.

        Args:
            preprocessor: Fitted preprocessing object to serialise.

        Raises:
            Exception: whatever ``save_object_pkl`` raises, after it is logged.
        """
        try:
            save_object_pkl(file_path=self.config.preprocessor_obj_file_path, obj=preprocessor)
        except Exception as e:
            # The message needs a %s placeholder for the extra argument.
            logger.error("Failed to save preprocessing object: %s", e)
            # Re-raise: a missing preprocessor artifact must not be reported
            # as a successful run.
            raise
        logger.info("Saved preprocessing object.")

    def run_data_processing_pipeline(self):
        """Run the whole stage: split, load, separate, preprocess and save.

        Raises:
            Exception: any error from a step is logged and then re-raised so
                the caller can see that the stage failed.
        """
        try:
            # Get preprocessor
            preprocessor = self.get_data_transformer_object()

            # Split data
            self.train_test_splitting()

            # Load data
            train_df, test_df = self.load_data()

            # Separate features and target (keywords guard against the
            # test-before-train parameter order)
            (
                input_feature_train_df,
                target_feature_train_df,
                input_feature_test_df,
                target_feature_test_df,
            ) = self.separate_features_and_target(
                test_df=test_df,
                train_df=train_df,
                target_column_name=self.config.target_column,
            )

            # Apply preprocessing
            train_arr, test_arr = self.apply_preprocessing(
                preprocessor,
                input_feature_train_df,
                input_feature_test_df,
                target_feature_train_df,
                target_feature_test_df,
            )

            # Save transformed data
            self.save_transformed_data(train_arr, test_arr)

            # Save preprocessor
            self.save_preprocessor(preprocessor)

            logger.info("Data processing pipeline completed successfully.")
        except Exception as e:
            logger.error("Error in data processing pipeline: %s", e)
            raise


In [10]:
# Build the configuration and run the data-transformation stage end to end.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# has no effect on what propagates out of the cell.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.run_data_processing_pipeline()

[2024-06-25 08:12:30,795: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-25 08:12:30,796: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-25 08:12:30,797: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-25 08:12:30,798: INFO: common: created directory at: artifacts]
[2024-06-25 08:12:30,799: INFO: common: created directory at: artifacts/data_transformation]
[2024-06-25 08:12:30,799: INFO: 3341653323: Categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']]
[2024-06-25 08:12:30,800: INFO: 3341653323: Numerical columns: ['writing_score', 'reading_score']]
[2024-06-25 08:12:30,807: INFO: 3341653323: Split data into training and test sets]
[2024-06-25 08:12:30,807: INFO: 3341653323: (750, 8)]
[2024-06-25 08:12:30,808: INFO: 3341653323: (250, 8)]
[2024-06-25 08:12:30,823: INFO: 3341653323: Read train and test data is completed]
[2024-06-25 08:12:30,824: INFO: 3341653