In [1]:
import os

In [2]:
%pwd

'f:\\Machine_learning_end_to_end\\machine_learning\\notebooks'

In [3]:
# Step up one directory level; the surrounding %pwd outputs show this moves
# from the notebooks folder to its parent.
# NOTE(review): the move is relative, so re-running this cell without
# restarting the kernel climbs one more level each time.
os.chdir(os.pardir)

In [4]:
%pwd

'f:\\Machine_learning_end_to_end\\machine_learning'

In [24]:
from dataclasses import dataclass
from pathlib import Path

In [70]:
@dataclass
class DataTransformationConfig:
    """Filesystem locations consumed by the data-transformation stage."""

    # Directory that receives the train/test CSV splits.
    root_dir: Path
    # CSV file that is read as the input dataset.
    data_path: Path
    # Destination file for the saved preprocessor object.
    pkl_path: Path

In [71]:
from src.constant import *
from src.utils.common import read_yaml

In [72]:
from src.components import data_transformation


class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 schema_path=SCHEMA_FILE_PATH,
                 params_path=PARAMS_FILE_PATH):
        """Parse the three YAML files and keep the results on the instance.

        Args:
            config_path: Location of the main configuration YAML.
            schema_path: Location of the schema YAML.
            params_path: Location of the parameters YAML.
        """
        # map() is lazy, so the files are still read in config -> schema -> params order.
        self.config, self.schema, self.params = map(
            read_yaml, (config_path, schema_path, params_path)
        )

    def data_transformation_config(self) -> DataTransformationConfig:
        """Return the ``data_transformation`` section as a DataTransformationConfig.

        The values are passed through exactly as read_yaml produced them.
        """
        section = self.config.data_transformation
        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            pkl_path=section.pkl_path,
        )

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from src import logger
from pathlib import Path
from src.utils.common import save_bin

class DataTransformation:
    """Splits the dataset, fits a scaling preprocessor on the train split and saves it."""

    # Label column: excluded from scaling and removed from the feature frames.
    TARGET_COLUMN = "quality"

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (root_dir, data_path, pkl_path)."""
        self.config = config

    def data_transformation(self, data, target_column=TARGET_COLUMN):
        """Build an unfitted ColumnTransformer that standard-scales the feature columns.

        Args:
            data: DataFrame whose columns define the features; it must contain
                ``target_column``.
            target_column: Column to leave out of scaling. Defaults to
                ``TARGET_COLUMN``.

        Returns:
            An unfitted ColumnTransformer applying StandardScaler to every
            column of ``data`` except ``target_column``.

        Raises:
            KeyError: If ``target_column`` is not one of ``data``'s columns.
        """
        logger.info("data transformation object start creating \n")
        # A plain list of names keeps the saved transformer free of pandas Index objects.
        feature_columns = list(data.columns.drop(target_column))

        scaling_pipeline = Pipeline(steps=[
            ('standard_scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('std_scaler', scaling_pipeline, feature_columns)
        ])
        logger.info("data Transformation object creation completed ")
        return preprocessor

    def train_test_split(self, test_size=0.2, random_state=5):
        """Split the configured CSV, write both splits, then fit and save the preprocessor.

        Reads ``config.data_path``, writes ``train.csv`` and ``test.csv`` into
        ``config.root_dir`` (created if missing), fits the preprocessor on the
        training features only, and hands it to ``save_bin`` together with
        ``config.pkl_path``.

        Args:
            test_size: Forwarded to sklearn's ``train_test_split``.
            random_state: Seed forwarded to sklearn's ``train_test_split``.

        Returns:
            None.
        """
        target_column = self.TARGET_COLUMN
        data = pd.read_csv(self.config.data_path)

        # Inside a method body this name resolves to the imported sklearn
        # function, not to this method.
        train_data, test_data = train_test_split(
            data, test_size=test_size, random_state=random_state
        )
        logger.info("data are splitted into train and test")
        logger.info(f"train data shape {train_data.shape}")
        logger.info(f"test data shape {test_data.shape}")

        logger.info(f"train data column : {list(train_data.columns)}\n")
        logger.info(f"test data column {list(test_data.columns)}\n")

        # Make sure the output directory exists so the first run on a clean
        # checkout does not fail inside to_csv.
        output_dir = Path(self.config.root_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        train_data.to_csv(output_dir / "train.csv", index=False)
        test_data.to_csv(output_dir / "test.csv", index=False)
        logger.info("train,test data saved to path")

        logger.info("preprocessor object")
        preproc_obj = self.data_transformation(train_data, target_column)

        logger.info("splitting target column from train data")
        train_df = train_data.drop(columns=target_column)
        logger.info("Target column splitted from train data")

        logger.info("splitting target column from test data")
        test_df = test_data.drop(columns=target_column)
        logger.info("target column splitted from test data")

        logger.info("Transform data")
        # Fit on the training features only; the test split is merely transformed.
        train_trans = preproc_obj.fit_transform(train_df)
        test_trans = preproc_obj.transform(test_df)
        logger.info(f"transformed train data shape {train_trans.shape}")
        logger.info(f"transformed test data shape {test_trans.shape}")

        logger.info("save preprocessor object")
        pkl_path = Path(self.config.pkl_path)
        # The pickle may live outside root_dir, so create its parent as well.
        pkl_path.parent.mkdir(parents=True, exist_ok=True)
        save_bin(preproc_obj, pkl_path)

In [76]:
# Load the YAML settings and extract the data-transformation paths.
obj=ConfigurationManager()
data_transform=obj.data_transformation_config()

# Run the stage: split the CSV, write train/test files, then fit and save the preprocessor.
obj2=DataTransformation(data_transform)
obj2.train_test_split()

[2024-11-13 20:23:39,519 : INFO : common : yaml file path : config\config.yaml loaded successfully:]
[2024-11-13 20:23:39,543 : INFO : common : yaml file path : schema.yaml loaded successfully:]
[2024-11-13 20:23:39,551 : INFO : common : yaml file path : params.yaml loaded successfully:]
[2024-11-13 20:23:40,012 : INFO : 958572087 : data are splitted into train and test]
[2024-11-13 20:23:40,147 : INFO : 958572087 : train data shape (914, 12)]
[2024-11-13 20:23:40,150 : INFO : 958572087 : test data shape (229, 12)]
[2024-11-13 20:23:40,153 : INFO : 958572087 : train data column : ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
]
[2024-11-13 20:23:40,156 : INFO : 958572087 : test data column ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alc

[2024-11-13 20:23:40,794 : INFO : 958572087 : data Transformation object creation completed ]
[2024-11-13 20:23:40,797 : INFO : 958572087 : splitting target column from train data]
[2024-11-13 20:23:41,129 : INFO : 958572087 : Target column splitted from train data]
[2024-11-13 20:23:41,134 : INFO : 958572087 : splitting target column from test data]
[2024-11-13 20:23:41,140 : INFO : 958572087 : target column splitted from test data]
[2024-11-13 20:23:41,145 : INFO : 958572087 : Transform data]
[2024-11-13 20:23:41,753 : INFO : 958572087 : save preprocessor object]
[2024-11-13 20:23:42,005 : INFO : common : binary file save to : data\processed\prepro.pkl]
