In [1]:
import os

In [2]:
%pwd

'c:\\Users\\karth\\ML_practice\\Projects\\EndToEnd_MLOps_Project\\research'

In [3]:
# Move from the notebook folder ("research") up to the project root so that the
# relative paths used further down (e.g. "artifacts/...") and the `src.*` imports
# resolve. The guard makes this cell idempotent: re-running it no longer climbs
# one directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\karth\\ML_practice\\Projects\\EndToEnd_MLOps_Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable (frozen) settings for the data-transformation stage."""
    # Directory that receives this stage's outputs (train.csv / test.csv are written here).
    root_dir: Path
    # Location of the input CSV that is read before transforming and splitting.
    data_path: Path

In [6]:
from src.ML_Project.constants import *
from src.ML_Project.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Load the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The top-level artifacts directory is created as soon as the config is known.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            A DataTransformationConfig filled from the ``data_transformation``
            section of the loaded configuration.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
        )

In [15]:
import os
import pandas as pd
import numpy as np
from src.ML_Project import logger
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle

In [23]:
class DataTransformation:
    """Transforms the raw dataset, persists the fitted transformers and splits the data.

    ``transform`` derives an ``age`` feature from ``year``, standard-scales the
    numeric features and the ``selling_price`` target with two separate scalers,
    and one-hot encodes the categorical features. The fitted scalers and encoder
    are pickled into ``config.root_dir``, next to the train/test CSV files.

    Note: other data transformations (different scaling or encoding, PCA, ...)
    can be added to ``transform``.
    """

    # Name of the column the model is trained to predict.
    TARGET_COLUMN = "selling_price"

    # Default year used to convert "year" into "age". The saved feature scaler is
    # fitted on ages computed with this value, so code that reuses the scaler must
    # compute ages with the same reference year.
    REFERENCE_YEAR = 2025

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input CSV path and output directory)."""
        self.config = config

    def _save_artifact(self, obj, filename: str) -> None:
        """Pickle ``obj`` into ``config.root_dir`` under ``filename``."""
        # Make sure the target directory exists even when the class is used
        # without the configuration manager having created it first.
        os.makedirs(self.config.root_dir, exist_ok=True)
        with open(os.path.join(self.config.root_dir, filename), "wb") as artifact_file:
            pickle.dump(obj, artifact_file)

    def transform(self, data: pd.DataFrame, current_year: int = REFERENCE_YEAR) -> pd.DataFrame:
        """Scale and encode ``data`` and save the fitted transformers.

        The input frame is not modified; all work happens on copies.

        Args:
            data: Raw dataset containing at least the ``selling_price`` and
                ``year`` columns.
            current_year: Year from which ``year`` is subtracted to obtain ``age``.

        Returns:
            A new DataFrame with the one-hot encoded categorical columns first,
            then the scaled numeric columns, then the scaled ``selling_price``.

        Raises:
            KeyError: If ``selling_price`` or ``year`` is missing from ``data``.
        """
        target_col = self.TARGET_COLUMN

        # Separate target and features on copies so the caller's frame stays intact
        # and the later column assignments do not write into a slice of `data`.
        target = data[[target_col]].copy()
        features = (
            data.drop(columns=[target_col])
            # Transform "year" into the car's age (age = current_year - year_of_car).
            .assign(year=lambda frame: current_year - frame["year"])
            .rename(columns={"year": "age"})
        )

        num_cols = features.select_dtypes(include=["int64", "float64"]).columns.tolist()
        cat_cols = features.select_dtypes(include=["object"]).columns.tolist()

        # Standard-scale features and target separately so each can be inverted
        # on its own later.
        feature_scaler = StandardScaler()
        features[num_cols] = feature_scaler.fit_transform(features[num_cols])

        target_scaler = StandardScaler()
        target[target_col] = target_scaler.fit_transform(target).ravel()

        logger.info("Numerical columns scaled successfully!")

        # One-hot encode the categorical columns.
        encoder = OneHotEncoder(
            drop="first",
            dtype="int64",
            sparse_output=False,
            handle_unknown="ignore"
        )
        # Reuse the features' index: concat(axis=1) aligns on index labels, so a
        # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever the
        # input frame does not carry a default RangeIndex.
        encoded_cat_cols_df = pd.DataFrame(
            encoder.fit_transform(features[cat_cols]),
            columns=encoder.get_feature_names_out(),
            index=features.index,
        )

        transformed_data = pd.concat([encoded_cat_cols_df, features[num_cols], target], axis=1)

        logger.info("Categorical columns encoded successfully!")

        # Persist the fitted transformers in the configured stage directory
        # (the same directory that receives train.csv / test.csv).
        self._save_artifact(feature_scaler, "features_std_scaler.pkl")
        self._save_artifact(target_scaler, "target_std_scaler.pkl")
        logger.info("Saving both Std-scalers as pickle files at artifacts...")

        self._save_artifact(encoder, "OHE_encoder.pkl")
        logger.info("Saving OHE-encoder to pickle file at artifacts...")

        return transformed_data

    def train_test_splitting(self, test_size: float = 0.2, random_state: int = 42):
        """Read the input CSV, transform it and write train/test CSV files.

        Args:
            test_size: Fraction of rows placed in the test set (default 80%-20% split).
            random_state: Seed passed to ``train_test_split`` for a reproducible split.
        """
        data = pd.read_csv(self.config.data_path)

        # NOTE(review): the scalers and encoder are fitted on the full dataset
        # before the split, so test-set statistics influence the training features.
        # Kept as-is so existing artifacts and downstream stages are unaffected;
        # consider splitting first and fitting on the training part only.
        transformed = self.transform(data)

        train, test = train_test_split(transformed, test_size=test_size, random_state=random_state)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splitted dataset into training & test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

In [27]:
# Run the data-transformation stage end to end: load the configuration, build the
# stage object, then transform, split and persist the dataset.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure together with its traceback in the project log, then
    # re-raise the same exception; a bare `raise` keeps the original traceback.
    logger.exception("Data transformation stage failed")
    raise

[2025-03-23 23:13:58,360: INFO: common: Yaml file:config\config.yaml loaded successfully]
[2025-03-23 23:13:58,362: INFO: common: Yaml file:params.yaml loaded successfully]
[2025-03-23 23:13:58,364: INFO: common: Yaml file:schema.yaml loaded successfully]
[2025-03-23 23:13:58,366: INFO: common: created directory at: artifacts]
[2025-03-23 23:13:58,367: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-23 23:13:58,380: INFO: 4269204336: Numerical columns scaled successfully!]
[2025-03-23 23:13:58,389: INFO: 4269204336: Categorical columns encoded successfully!]
[2025-03-23 23:13:58,391: INFO: 4269204336: Saving both Std-scalers as pickle files at artifacts...]
[2025-03-23 23:13:58,393: INFO: 4269204336: Saving OHE-encoder to pickle file at artifacts...]
[2025-03-23 23:13:58,428: INFO: 4269204336: Splitted dataset into training & test sets]
[2025-03-23 23:13:58,429: INFO: 4269204336: (3472, 42)]
[2025-03-23 23:13:58,430: INFO: 4269204336: (868, 42)]
(3472, 42)
(