# data validation 
- datatypes 
- data drift 
- same schema
- validate column


In [6]:
# constants
#
# Data Validation related constants.
# Directory and file names used to build the data-validation artifact paths.
DATA_VALIDATION_DIR_NAME: str = 'data_validation'
DATA_VALIDATION_VALID_DIR: str = 'validated'
DATA_VALIDATION_INVALID_DIR: str = 'invalid'
DATA_VALIDATION_DRIFT_REPORT_DIR: str = 'drift_report'
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = 'report.yaml'
# Backward-compatible alias: the original (misspelled) name is still referenced
# by existing code, so it must keep resolving to the same value.
DATA_VALIDATION_DRIFT_REPRT_FILE_NAME: str = DATA_VALIDATION_DRIFT_REPORT_FILE_NAME

In [None]:
# entity 
from dataclasses import dataclass
from typing import Optional


@dataclass
class DataValidationArtifiact:
    """Result record produced by the data-validation step.

    Field names match the keyword arguments used where the validation
    artifact is constructed (``validation_status=...``,
    ``drift_report_file_path=...``).
    """

    # Overall outcome of the validation step.
    validation_status: bool
    # Locations of the train / test CSV files that passed validation.
    valid_train_file_path: str
    valid_test_file_path: str
    # Locations of rejected files; the validation component passes None here.
    invalid_train_file_path: Optional[str]
    invalid_test_file_path: Optional[str]
    # Location of the YAML drift report.
    drift_report_file_path: str


In [None]:
# config entity 
from NetworkSecurity.constants import training_pipeline
from NetworkSecurity.config.configuration import TrainingPipelineConfig
import os 

class DataValidationConfig:
    """Paths used by the data-validation stage.

    Every path is built from ``training_pipeline_config.artifact_dir`` joined
    with names taken from the ``training_pipeline`` constants module.

    Attributes:
        data_validation_dir: Root directory of the validation stage.
        valid_data_dir: Sub-directory for data that passed validation.
        invalid_data_dir: Sub-directory for data that failed validation.
        valid_train_file_path / valid_test_file_path: Train / test file
            locations inside ``valid_data_dir``.
        invalid_train_file_path / invalid_test_file_path: Train / test file
            locations inside ``invalid_data_dir``.
        drift_report_file_path: Location of the drift report file.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        # Stage root first; everything else hangs off it.
        stage_root = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_VALIDATION_DIR_NAME,
        )
        accepted_dir = os.path.join(stage_root, training_pipeline.DATA_VALIDATION_VALID_DIR)
        rejected_dir = os.path.join(stage_root, training_pipeline.DATA_VALIDATION_INVALID_DIR)

        self.data_validation_dir: str = stage_root
        self.valid_data_dir: str = accepted_dir
        self.invalid_data_dir: str = rejected_dir

        # Train / test files keep the same file names in both sub-directories.
        self.valid_train_file_path: str = os.path.join(accepted_dir, training_pipeline.TRAIN_FILE_NAME)
        self.valid_test_file_path: str = os.path.join(accepted_dir, training_pipeline.TEST_FILE_NAME)
        self.invalid_train_file_path: str = os.path.join(rejected_dir, training_pipeline.TRAIN_FILE_NAME)
        self.invalid_test_file_path: str = os.path.join(rejected_dir, training_pipeline.TEST_FILE_NAME)

        # <stage root>/<drift report dir>/<drift report file>
        report_parts = (
            stage_root,
            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
            training_pipeline.DATA_VALIDATION_DRIFT_REPRT_FILE_NAME,
        )
        self.drift_report_file_path: str = os.path.join(*report_parts)

In [8]:
# create schema.yaml file

In [None]:
# component 
from NetworkSecurity.entity.artifact_entity import DataIngestionArtifact, DataValidationAritifact
from NetworkSecurity.entity.config_entity import DataIngestionConfig
from NetworkSecurity.exception.exception import NetworkSecurityException
from NetworkSecurity.logging.logger import logging
from scipy.stats import ks_2samp # check 2 samples for data drift 
from NetworkSecurity.constants.training_pipeline import SCHEMA_FILE_PATH
from NetworkSecurity.utils.main_utils.utils import read_yaml_file, write_yaml_file
import pandas as pd 
import os, sys 
from NetworkSecurity.components.data_ingestion import DataIngestion

class DataValidation:
    """Validate the ingested train/test CSV files and report data drift.

    The component reads the two files named by the ingestion artifact, checks
    their column counts against the schema loaded from ``SCHEMA_FILE_PATH``,
    runs a per-column two-sample Kolmogorov-Smirnov test between train and
    test, writes a YAML drift report, and copies both files to the
    "validated" directory from the validation config.
    """

    def __init__(self, data_ingestion_artifact: "DataIngestionArtifact",
                 data_validation_config: "DataValidationConfig"):
        """Store the inputs and load the schema file.

        Args:
            data_ingestion_artifact: Provides ``trained_file_path`` and
                ``test_file_path``.
            data_validation_config: Provides the output paths for this stage.

        Raises:
            NetworkSecurityException: If the schema file cannot be read.
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Load a CSV file into a DataFrame.

        Raises:
            NetworkSecurityException: If ``pd.read_csv`` fails.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def validate_numbers_of_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True when ``dataframe`` has as many columns as the schema.

        Raises:
            NetworkSecurityException: If the comparison cannot be performed.
        """
        try:
            schema = self._schema_config
            # NOTE(review): assumes that when the schema has a top-level
            # 'columns' entry, that entry lists the expected columns (so
            # sibling keys are not counted as columns). A flat schema keeps
            # the original behaviour of counting top-level entries - confirm
            # against the actual schema.yaml.
            declared = schema.get('columns') if isinstance(schema, dict) else None
            if isinstance(declared, (list, tuple, dict)):
                number_of_columns = len(declared)
            else:
                number_of_columns = len(schema)
            logging.info(f'Required number of columns {number_of_columns}')
            logging.info(f'Data frame has columns: {len(dataframe.columns)}')
            return len(dataframe.columns) == number_of_columns
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def detect_dataset_drift(self, base_df, current_df, threshold=0.05) -> bool:
        """Run a two-sample KS test per column and write the drift report.

        Args:
            base_df: Reference DataFrame; its columns drive the comparison.
            current_df: DataFrame compared against ``base_df``.
            threshold: A column is flagged as drifted unless its p-value is
                at least this value.

        Returns:
            True when no column is flagged as drifted, False otherwise.

        Raises:
            NetworkSecurityException: If the test or the report write fails.
        """
        try:
            status = True
            report = {}
            for column in base_df.columns:
                is_same_dist = ks_2samp(base_df[column], current_df[column])
                p_value = float(is_same_dist.pvalue)
                # Written as a negation so a NaN p-value still counts as
                # drift, exactly like the original if/else did.
                is_found = not (threshold <= p_value)
                if is_found:
                    status = False
                report[column] = {
                    'p_value': p_value,
                    'drift_status': is_found,
                }

            drift_report_file_path = self.data_validation_config.drift_report_file_path

            # create directory (skipped when the report path has no parent)
            dir_path = os.path.dirname(drift_report_file_path)
            if dir_path:
                os.makedirs(dir_path, exist_ok=True)
            write_yaml_file(file_path=drift_report_file_path, content=report)
            return status
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_validation(self) -> "DataValidationAritifact":
        """Run the whole validation stage and return its artifact.

        Returns:
            The validation artifact. ``validation_status`` is True only when
            both files have the expected column count and no drift was found.

        Raises:
            NetworkSecurityException: If any step fails.
        """
        try:
            train_file_path = self.data_ingestion_artifact.trained_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            # read the data from train and test
            train_dataframe = DataValidation.read_data(train_file_path)
            test_dataframe = DataValidation.read_data(test_file_path)

            # validate number of columns; start from an empty message so the
            # f-strings below never read an unbound name
            error_message = ''
            if not self.validate_numbers_of_columns(dataframe=train_dataframe):
                error_message = f'{error_message} Train dataframe does not contain all columns \n'
            if not self.validate_numbers_of_columns(dataframe=test_dataframe):
                error_message = f'{error_message} Test dataframe does not contain all columns \n'
            columns_ok = not error_message
            if not columns_ok:
                logging.warning(error_message)

            # check for data drift
            drift_ok = self.detect_dataset_drift(base_df=train_dataframe, current_df=test_dataframe)

            valid_train_file_path = self.data_validation_config.valid_train_file_path
            valid_test_file_path = self.data_validation_config.valid_test_file_path
            dir_path = os.path.dirname(valid_train_file_path)
            os.makedirs(dir_path, exist_ok=True)

            train_dataframe.to_csv(valid_train_file_path, index=False, header=True)
            test_dataframe.to_csv(valid_test_file_path, index=False, header=True)

            # Point the artifact at the copies written above rather than at
            # the ingestion files they were read from.
            data_validation_artifact = DataValidationAritifact(
                validation_status=columns_ok and drift_ok,
                valid_train_file_path=valid_train_file_path,
                valid_test_file_path=valid_test_file_path,
                invalid_train_file_path=None,
                invalid_test_file_path=None,
                drift_report_file_path=self.data_validation_config.drift_report_file_path
            )
            return data_validation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
        
    



In [14]:
# Run data ingestion followed by data validation for one pipeline run.
try:
    trainingpipelineconfig = TrainingPipelineConfig()
    dataingestionconfig = DataIngestionConfig(trainingpipelineconfig)
    data_ingestion = DataIngestion(dataingestionconfig)
    logging.info('Initiate the data ingesion')
    dataingestionartifact = data_ingestion.initiate_data_ingestion()
    logging.info('data initiation completed')
    print(dataingestionartifact)
    data_validation_config = DataValidationConfig(trainingpipelineconfig)
    # DataValidation reads trained_file_path / test_file_path from its first
    # argument, so it needs the ingestion artifact, not the ingestion config.
    data_validation = DataValidation(dataingestionartifact, data_validation_config)
    logging.info('Initiate the data validation')
    data_validation_artifact = data_validation.initiate_data_validation()
    logging.info('Data validation completed')
    print(data_validation_artifact)
except Exception as e:
    raise NetworkSecurityException(e, sys)

DataIngestionArtifact(trained_file_path='Artifacts\\05_14_2025_11_51_52\\data_ingestion\\ingested\\train.csv', test_file_path='Artifacts\\05_14_2025_11_51_52\\data_ingestion\\ingested\\test.csv')


NetworkSecurityException: Error occured in python script name [C:\Users\26amr\AppData\Local\Temp\ipykernel_24348\3967426075.py] line number [10] error message [Error occured in python script name [C:\Users\26amr\AppData\Local\Temp\ipykernel_24348\685533584.py] line number [19] error message [Error occured in python script name [d:\pythonprojects\networksecurity\src\NetworkSecurity\utils\main_utils\utils.py] line number [12] error message [[Errno 2] No such file or directory: 'data_schema\\schema.yaml']]]