# data transformation 

In [1]:
# constants 
import numpy as np

""" 
Data transformation related constants; their names start with the DATA_TRANSFORMATION prefix.
"""
# Directory and file names used to lay out the data-transformation artifacts.
DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR: str = 'transformed'
DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR: str = "transformed_object"
PREPROCESSING_OBJECT_FILE_NAME: str = 'preprocessing.pkl'

# Keyword arguments unpacked into sklearn's KNNImputer to replace NaN values.
# The keys must match the KNNImputer parameter names exactly: the first key is
# `missing_values` (all lowercase); a differently-cased key is rejected as an
# unexpected keyword argument when the dict is unpacked.
DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {
    'missing_values': np.nan,
    'n_neighbors': 3,
    'weights': 'uniform',
}

In [2]:
# artifact entity
from dataclasses import dataclass 

@dataclass
class DataTransformationArtifact:
    """Record of the file paths produced by the data-transformation stage.

    Field order is part of the interface: instances may be built positionally
    as ``(object path, train path, test path)``.
    """

    # Where the fitted preprocessing object was written.
    transformed_object_file_path: str
    # Where the transformed training array was written.
    transformed_train_file_path: str
    # Where the transformed testing array was written.
    transformed_test_file_path: str


In [3]:
# config 
from NetworkSecurity.entity.config_entity import TrainingPipelineConfig
from NetworkSecurity.constants import training_pipeline
import os 


class DataTransformationConfig:
    """Output paths used by the data-transformation stage.

    Every path is built under
    ``<artifact_dir>/<DATA_TRANSFORMATION_DIR_NAME>``; nothing is created on
    disk here, the attributes are plain path strings.
    """

    def __init__(self, training_pipeline_config:TrainingPipelineConfig):
        """Build the stage's paths from the pipeline-wide configuration.

        Args:
            training_pipeline_config: Pipeline configuration whose
                ``artifact_dir`` is the root for this stage's directory.
        """
        self.data_transformation_dir: str = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_TRANSFORMATION_DIR_NAME,
        )

        # Both transformed arrays share the same sub-directory.
        transformed_data_dir = os.path.join(
            self.data_transformation_dir,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR,
        )
        self.transformed_train_file_path: str = os.path.join(
            transformed_data_dir,
            self._npy_file_name(training_pipeline.TRAIN_FILE_NAME),
        )
        self.transformed_test_file_path: str = os.path.join(
            transformed_data_dir,
            self._npy_file_name(training_pipeline.TEST_FILE_NAME),
        )
        self.transformed_object_file_path: str = os.path.join(
            self.data_transformation_dir,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR,
            training_pipeline.PREPROCESSING_OBJECT_FILE_NAME,
        )

    @staticmethod
    def _npy_file_name(file_name: str) -> str:
        """Return *file_name* with a ``.csv`` extension swapped for ``.npy``.

        Only the extension is touched, so a name such as ``csv_train.csv``
        becomes ``csv_train.npy`` rather than having every ``csv`` substring
        rewritten. Names without a ``.csv`` extension are returned unchanged.
        """
        root, ext = os.path.splitext(file_name)
        if ext.lower() == '.csv':
            return root + '.npy'
        return file_name
        

NetworkSecurity
Artifacts


In [4]:
# component 
import sys 
import os 
import numpy as np 
import pandas as pd 
from sklearn.impute import KNNImputer
from sklearn.pipeline import  Pipeline

from NetworkSecurity.constants.training_pipeline import TARGET_COLUMN
from NetworkSecurity.constants.training_pipeline import DATA_TRANSFORMATION_IMPUTER_PARAMS

from NetworkSecurity.entity.artifact_entity import (
    DataTransformationArtifact,
    DataValidationAritifact
)

# from NetworkSecurity.entity.config_entity import Data 
from NetworkSecurity.exception.exception import NetworkSecurityException
from NetworkSecurity.logging.logger import logging
from NetworkSecurity.utils.main_utils.utils import save_numpy_array_data,save_object


class DataTransformation:
    """Impute missing feature values and persist the transformed train/test arrays.

    The stage reads the validated train and test CSV files, separates the
    target column, fits a KNN-imputer pipeline on the training features, applies
    it to both splits and hands the results to the project's save helpers.
    """

    def __init__(self, data_validation_artifact: DataValidationAritifact,
                 data_transformation_config: DataTransformationConfig
                 ):
        """Store the upstream artifact and this stage's configuration.

        Args:
            data_validation_artifact: Supplies ``valid_train_file_path`` and
                ``valid_test_file_path``, the CSV files to transform.
            data_transformation_config: Supplies the output paths for the
                transformed arrays and the fitted preprocessing object.

        Raises:
            NetworkSecurityException: If storing the arguments fails.
        """
        try:
            self.data_validation_artifact:DataValidationAritifact=data_validation_artifact
            self.data_transformation_config:DataTransformationConfig=data_transformation_config

        except Exception as e:
            raise NetworkSecurityException(e,sys) from e

    @staticmethod
    def read_data(file_path)-> pd.DataFrame:
        """Load the CSV file at *file_path* into a DataFrame.

        Raises:
            NetworkSecurityException: If the file cannot be read or parsed.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e,sys) from e

    @staticmethod
    def _split_features_and_target(df: pd.DataFrame):
        """Return ``(features, target)`` for *df*, mapping -1 targets to 0.

        ``features`` is *df* without the ``TARGET_COLUMN`` column; ``target``
        is that column with every -1 replaced by 0.
        """
        features = df.drop(columns=[TARGET_COLUMN])
        target = df[TARGET_COLUMN].replace(-1, 0)
        return features, target

    @classmethod
    def get_data_transformer_object(cls)->Pipeline:
        """
        It initialises a KNN Imputer object with the parameters specified in the training_pipeline.py file
        and returns a pipeline object with the KNNImputer object as the first step

        Declared as a classmethod so it can be called on the class as well as
        on an instance; it does not read any instance state.

        Args:
            cls: DataTransformation

        Returns:
            A pipeline object

        Raises:
            NetworkSecurityException: If the imputer or pipeline cannot be built.
        """
        logging.info(
            'Entered get_data_tranformer_object method of Transformation calass'
        )
        try:
            imputer:KNNImputer=KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info(f'Initialise KNNImputer with {DATA_TRANSFORMATION_IMPUTER_PARAMS}')
            processor:Pipeline=Pipeline([('imputer',imputer)])
            return processor

        except Exception as e:
            raise NetworkSecurityException(e,sys) from e

    def initiate_data_transformation(self)-> DataTransformationArtifact:
        """Run the transformation and return the paths of what was produced.

        Returns:
            A ``DataTransformationArtifact`` holding the configured paths of
            the preprocessing object and the transformed train/test arrays.

        Raises:
            NetworkSecurityException: If any step (reading, splitting,
                fitting, transforming or saving) fails.
        """
        logging.info('Entered initiate_date_transformation method of DataTransformation class')
        try:
            logging.info('Starting data transformation')
            train_df=DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df=DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            # Separate the target column from the features for both splits.
            input_feature_train_df,target_feature_train_df=self._split_features_and_target(train_df)
            input_feature_test_df,target_feature_test_df=self._split_features_and_target(test_df)

            # Fit on the training features only, then apply the same fitted
            # preprocessor to both splits.
            preprocessor=self.get_data_transformer_object()
            preprocessor_object=preprocessor.fit(input_feature_train_df)
            transformed_input_train_feature=preprocessor_object.transform(input_feature_train_df)
            transformed_input_test_feature=preprocessor_object.transform(input_feature_test_df)

            # Append the target as the last column of each transformed array.
            train_arr=np.c_[transformed_input_train_feature,np.array(target_feature_train_df)]
            test_arr=np.c_[transformed_input_test_feature,np.array(target_feature_test_df)]

            # Hand the arrays and the fitted preprocessor to the save helpers.
            save_numpy_array_data(self.data_transformation_config.transformed_train_file_path, array=train_arr,)
            save_numpy_array_data(self.data_transformation_config.transformed_test_file_path, array=test_arr,)
            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor_object, )

            # Report where everything was written.
            data_transformation_artifact=DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path
            )
            return data_transformation_artifact

        except Exception as e:
            raise NetworkSecurityException(e,sys) from e
        
        
        
        

In [None]:
# main

if __name__=='__main__':
    # Script entry point: build the data-transformation configuration.
    # Imported here so this block is self-contained when run on its own.
    from NetworkSecurity.entity.config_entity import TrainingPipelineConfig

    try:
        logging.info('Data Transformation config')
        # The config needs a pipeline configuration object; the previous code
        # passed the undefined name `train`, which failed with a NameError.
        # NOTE(review): assumes TrainingPipelineConfig can be constructed
        # without arguments — confirm against its definition.
        training_pipeline_config=TrainingPipelineConfig()
        data_transformation_config=DataTransformationConfig(training_pipeline_config)
    except Exception as e:
        raise NetworkSecurityException(e,sys) from e