In [2]:
import os 
os.chdir('../')
%pwd

'd:\\pythonProjects\\SurgeSense'

# Data transformation steps

In [3]:
# entity
from dataclasses import dataclass
from pathlib import Path 
@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes ``train.csv`` / ``test.csv`` into.
        data_path: CSV file the stage reads its input rows from.
        categorical_columns: Column names sent through the categorical
            preprocessing branch.
        numerical_columns: Column names sent through the numerical
            preprocessing branch.
    """

    root_dir: Path
    data_path: Path
    categorical_columns: list
    numerical_columns: list



In [4]:
# config 
from SurgeSense.constants import * 
from SurgeSense.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the config and schema YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_filepath: Path of the schema YAML.

        The ``artifacts_root`` entry of the loaded config is handed to
        ``create_directories`` right away.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the ``DataTransformationConfig`` from the loaded YAML.

        Reads the ``data_transformation`` section of the config and the
        ``TRANSFORM`` section of the schema, and passes the stage's
        ``root_dir`` to ``create_directories`` before returning.
        """
        stage_settings = self.config.data_transformation
        column_groups = self.schema.TRANSFORM

        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            data_path=stage_settings.data_path,
            categorical_columns=column_groups.CATEGORICAL_DATA,
            numerical_columns=column_groups.NUMERICAL_DATA,
        )

In [9]:
# components
import os 
from SurgeSense import logger
from sklearn.model_selection import train_test_split
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
import os 
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split 


class DataTransformation:
    """Builds the preprocessing pipeline and writes train/test CSV files.

    The stage is driven by a ``DataTransformationConfig``: ``data_path`` is
    the input CSV, ``root_dir`` receives the outputs, and the two column
    lists decide which preprocessing branch each column goes through.
    """

    def __init__(self,config:DataTransformationConfig):
        self.config=config

    def transform_data_pipeline(self):
        """Create the (unfitted) preprocessing pipeline.

        Numerical columns are median-imputed and standardised; categorical
        columns are imputed with the most frequent value and one-hot encoded
        (categories unseen at fit time are ignored at transform time).

        Returns:
            A scikit-learn ``Pipeline`` with a single ``'preprocessor'`` step
            wrapping the ``ColumnTransformer``.
        """
        # The pipeline only needs the column names, so the dataset is not
        # loaded here; it is read once, in train_test_spliting.
        categorical_columns = self.config.categorical_columns
        numerical_columns = self.config.numerical_columns

        numerical_preprocessor = Pipeline(
            steps=[
                ('imputation_menu', SimpleImputer(missing_values=np.nan, strategy='median')),
                ('scalar', StandardScaler())
            ]
        )

        categorical_preprocessor = Pipeline(
            steps=[
                ('imputation_constant', SimpleImputer(strategy='most_frequent')),
                ('encode', OneHotEncoder(handle_unknown='ignore'))
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ('categorical_columns', categorical_preprocessor, categorical_columns),
                ('numerical_columns', numerical_preprocessor, numerical_columns)
            ]
        )

        return Pipeline(
            steps=[
                ('preprocessor', preprocessor)
            ]
        )

    @staticmethod
    def _as_frame(transformed):
        """Wrap a transformer output in a DataFrame with default integer columns."""
        # ColumnTransformer may hand back a SciPy sparse matrix when the
        # one-hot block makes the result sparse enough; densify it so the
        # DataFrame constructor receives a regular 2-D array.
        if hasattr(transformed, 'toarray'):
            transformed = transformed.toarray()
        return pd.DataFrame(transformed)

    def train_test_spliting(self, pipe: Pipeline, test_size=0.25, random_state=42):
        """Split the dataset, fit ``pipe`` on the training rows, and save both parts.

        The raw rows are split *before* fitting so that imputation values,
        scaling statistics and the one-hot vocabulary are learned from the
        training rows only; the test rows are transformed with the already
        fitted pipeline. Fitting on the full dataset first would leak
        test-set information into the preprocessing.

        Args:
            pipe: Unfitted pipeline, e.g. from ``transform_data_pipeline``.
                It is fitted in place on the training rows.
            test_size: Fraction (or absolute number) of rows held out for the
                test set; 0.25 matches scikit-learn's default split.
            random_state: Seed for the shuffle so the split is reproducible.
                Pass ``None`` for a different split on every run.

        Writes ``train.csv`` and ``test.csv`` (no index column) into
        ``config.root_dir``.
        """
        data = pd.read_csv(self.config.data_path)

        logger.info('splitting the data into train and test set')
        train_raw, test_raw = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        logger.info('Transforming the data')
        train = self._as_frame(pipe.fit_transform(train_raw))
        test = self._as_frame(pipe.transform(test_raw))

        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)

        logger.info(f'training set shape: {train.shape}')
        logger.info(f'testing set shape: {test.shape}')
        

In [10]:
# pipeline 
# Run the data-transformation stage end to end: load the configuration,
# build the preprocessing pipeline, then split/transform the data and write
# the train/test CSV files.
# No try/except wrapper: the previous `except Exception as e: raise e` only
# re-raised the same exception, so letting it propagate directly is
# equivalent and keeps the traceback shorter.
config=ConfigurationManager()
data_transformation_config=config.get_data_transformation_config()
data_transform=DataTransformation(config=data_transformation_config)
pipeline=data_transform.transform_data_pipeline()
data_transform.train_test_spliting(pipeline)

[2025-03-21 19:56:45,337: INFO :common : yaml file: config\config.yaml loaded successfully]
[2025-03-21 19:56:45,342: INFO :common : yaml file: schema.yaml loaded successfully]
[2025-03-21 19:56:45,343: INFO :common : created directory at: artifacts]
[2025-03-21 19:56:45,344: INFO :common : created directory at: artifacts/data_transformation]
[2025-03-21 19:56:53,410: INFO :3914814864 : Transforming the data]
[2025-03-21 19:57:31,032: INFO :3914814864 : splitting the data into train and test set]
[2025-03-21 19:57:31,033: INFO :3914814864 : training set shape: (873747, 49)]
[2025-03-21 19:57:31,034: INFO :3914814864 : testing set shape: (291249, 49)]
