In [None]:
import os

In [None]:
%pwd


In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable pair of paths describing the data-transformation stage."""

    # Directory associated with this stage.
    root_dir: Path
    # Location of the data file for this stage.
    data_path: Path

In [None]:
from mlproject.constants import *
from mlProject.utils.common import read_yamn, create_utils


In [None]:
class ConfigurationManager:
    """Load the project configuration and build stage configuration objects.

    NOTE(review): this class needs ``read_yaml``, ``create_utils`` and
    ``create_directories`` in scope. The import line visible in this file
    spells the first one ``read_yamn`` and does not list
    ``create_directories`` -- confirm the imports before running.
    """

    def __init__(self, config_path: Path):
        """Read the configuration file and cache objects derived from it.

        Args:
            config_path: Path handed to ``read_yaml``.
        """
        self.config_path = config_path
        self.config = read_yaml(config_path)
        self.utils = create_utils(self.config['utils'])

        # Eagerly built from the 'data' section; note that
        # get_data_transformation_config() reads a different section.
        data_section = self.config['data']
        self.data_transformation_config = DataTransformationConfig(
            root_dir=Path(data_section['root_dir']),
            data_path=Path(data_section['data_path']),
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded configuration
        and passes its ``root_dir`` to ``create_directories`` (presumably
        creating it -- confirm against that helper).

        Returns:
            A ``DataTransformationConfig`` with ``root_dir`` and ``data_path``
            taken from the ``data_transformation`` section.
        """
        # Subscript access matches __init__ and works for plain mappings as
        # well as attribute-style config objects.
        section = self.config['data_transformation']

        create_directories([section['root_dir']])

        return DataTransformationConfig(
            root_dir=Path(section['root_dir']),
            data_path=Path(section['data_path']),
        )

In [None]:
from mlproject import logger
from mlprojects.components.data_cleaning import (
    FeatureEngineering,FixOutliers,DataPreprocessing)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
class DataTransformationManager:
    """Transform the input CSV and persist train/test splits."""

    def __init__(self, data_transformation_config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            data_transformation_config: Provides ``data_path`` (CSV to read)
                and ``root_dir`` (where the split files are written).
        """
        self.data_transformation_config = data_transformation_config

    def data_transformation_pipeline(self):
        """Assemble the transformation pipeline.

        Returns:
            A ``Pipeline`` whose steps are ``DataPreprocessing``,
            ``FeatureEngineering`` and ``FixOutliers``, in that order.

        Raises:
            Exception: Any error raised while building the pipeline is
                logged and then re-raised.
        """
        try:
            steps = [
                ('data_preprocessing', DataPreprocessing()),
                ('feature_engineering', FeatureEngineering()),
                ('fix_outliers', FixOutliers()),
            ]
            return Pipeline(steps)
        except Exception as e:
            logger.error(f'Error in data transformation pipeline: {e}')
            raise e

    def transform_split_data(self):
        """Read the data, transform it, split it, and write both parts.

        The CSV at ``data_path`` is run through the pipeline's
        ``fit_transform``; the result is split 80/20 with a fixed seed and
        written to ``train.csv`` and ``test.csv`` under ``root_dir``.
        Assumes the pipeline output supports ``to_csv`` and ``shape``
        (e.g. a DataFrame) -- confirm against the transformer classes.
        """
        cfg = self.data_transformation_config

        raw_frame = pd.read_csv(cfg.data_path)
        transformer = self.data_transformation_pipeline()
        transformed = transformer.fit_transform(raw_frame)

        # Fixed seed keeps the split reproducible between runs.
        train, test = train_test_split(transformed, test_size=0.2, random_state=42)

        for split, filename in ((train, 'train.csv'), (test, 'test.csv')):
            split.to_csv(cfg.root_dir / filename, index=False)

        logger.info('Data transformation and split completed successfully')
        logger.info(f'Train data shape: {train.shape}')
        logger.info(f'Test data shape: {test.shape}')

        print(train.shape, test.shape)


In [None]:
# Driver cell: build the stage config, then transform and split the data.
try:
    # NOTE(review): ConfigurationManager.__init__ as defined in this file
    # requires a config_path argument and none is passed here -- supply the
    # project's configuration file path.
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation_manager = DataTransformationManager(data_transformation_config)
    data_transformation_manager.transform_split_data()
except Exception as e:
    # Log for the run record, then re-raise so the failure stays visible.
    logger.error(f'Error in data transformation: {e}')
    raise