In [1]:
import os
import sys

In [2]:
# Show the directory the kernel is currently running in.
os.getcwd()

'f:\\Files\\DS&ML\\FraudGuard\\notebooks'

In [3]:
# Step up to the project root only while the kernel is still inside the
# `notebooks` folder, so re-running this cell does not keep climbing the tree.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

In [4]:
# Show the working directory after the change of directory above.
os.getcwd()

'f:\\Files\\DS&ML\\FraudGuard'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings bundle for the data-validation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory for this stage; created by ConfigurationManager before the
    # config object is built.
    root_dir: Path
    # CSV file that DataValidation.validation() loads with pandas.
    unzip_file: Path
    # Text file that receives the "Validation_status: <bool>" line.
    status_file: Path
    # Mapping of column name -> expected type name (taken from schema.COLUMNS).
    all_schema: dict

In [None]:
from FraudGuard.constants import *
from FraudGuard.utils.helpers import *
from FraudGuard.utils.exception import CustomException

In [8]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    Construction reads the config, params and schema files with ``read_yaml``
    and creates the artifacts root directory named in the config.
    """

    def __init__(
        self,
        config_filepath=CONFIG_PATH,
        params_filepath=PARAMS_PATH,
        schema_filepath=SCHEMA_PATH
    ):
        """Read the three YAML files and ensure the artifacts root exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        # NOTE(review): read_yaml presumably returns an attribute-accessible
        # mapping (the code below relies on dotted access) - confirm in helpers.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings from the loaded YAML.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataValidationConfig whose path fields are ``pathlib.Path`` objects
            and whose ``all_schema`` is the ``COLUMNS`` section of the schema.
        """
        stage = self.config.data_validation
        columns = self.schema.COLUMNS

        create_directories([stage.root_dir])

        # The dataclass declares these fields as Path, so coerce the raw YAML
        # values instead of passing them through untouched.
        return DataValidationConfig(
            root_dir=Path(stage.root_dir),
            status_file=Path(stage.status_file),
            unzip_file=Path(stage.unzip_file),
            all_schema=columns,
        )

In [9]:
import pandas as pd

In [10]:
import pandas as pd
from project import logger

class DataValidation:
    """Validates a CSV file against the expected column schema.

    Two checks are performed: every schema column must be present, and every
    present schema column must have an accepted pandas dtype.
    """

    # Schema type name -> dtype names (as produced by ``str(series.dtype)``)
    # accepted for it. Newer pandas versions can infer a dedicated string dtype
    # reported as 'str' or 'string' instead of 'object', so both are accepted
    # for text columns.
    _TYPE_MAPPING = {
        'int': ('int64', 'int32'),
        'float': ('float64', 'float32'),
        'object': ('object', 'str', 'string'),
        'str': ('object', 'str', 'string'),
    }

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration (paths and schema) for later use."""
        self.config = config

    def validate_data_types(self, data: pd.DataFrame, schema: dict) -> bool:
        """Validates the data types of columns against the schema.

        Columns named in ``schema`` but absent from ``data`` are skipped here;
        ``validate_column_presence`` is the check that reports them.

        Args:
            data: Frame whose column dtypes are inspected.
            schema: Mapping of column name -> expected type name. A type name
                without an entry in the alias table must match the dtype
                string exactly.

        Returns:
            True when every present schema column has an accepted dtype.
            Every mismatching column is logged, not only the first one.
        """
        mismatched = []

        for col, expected_type in schema.items():
            if col not in data.columns:
                continue

            actual_dtype = str(data[col].dtype)
            allowed_dtypes = self._TYPE_MAPPING.get(expected_type, (expected_type,))

            if actual_dtype not in allowed_dtypes:
                logger.error(f"Column '{col}': Expected type '{expected_type}', got '{actual_dtype}'")
                mismatched.append(col)

        return not mismatched

    def validate_column_presence(self, data: pd.DataFrame, schema: dict) -> bool:
        """Validates that all required columns are present in the data.

        Args:
            data: Frame whose column labels are inspected.
            schema: Mapping whose keys are the required column names.

        Returns:
            True when no schema column is missing; extra columns in ``data``
            are not treated as an error.
        """
        missing_cols = set(schema.keys()) - set(data.columns)

        if missing_cols:
            logger.error(f"Missing columns: {missing_cols}")
            return False
        return True

    def validation(self) -> bool:
        """Run every check on the configured CSV and record the outcome.

        Reads ``config.unzip_file``, runs the column-presence and dtype checks
        (both always run, so each result is logged), and writes
        ``Validation_status: <bool>`` to ``config.status_file``.

        Returns:
            True only when all checks passed.
        """
        data = pd.read_csv(self.config.unzip_file, low_memory=False)
        schema = self.config.all_schema

        logger.info(f"Starting validation for data with shape: {data.shape}")

        # Run all validation checks
        validation_results = {
            'column_presence': self.validate_column_presence(data, schema),
            'data_types': self.validate_data_types(data, schema),
        }

        # Overall validation status
        is_valid = all(validation_results.values())

        # Log validation results
        for check, result in validation_results.items():
            logger.info(f"{check}: {'PASSED' if result else 'FAILED'}")

        logger.info(f"Overall validation status: {'PASSED' if is_valid else 'FAILED'}")

        # Write status to file; explicit encoding keeps the output independent
        # of the platform's default locale.
        with open(self.config.status_file, 'w', encoding='utf-8') as f:
            f.write(f"Validation_status: {is_valid}")

        return is_valid

In [11]:
# Run the data-validation stage end to end: load settings, build the stage
# config, then validate the CSV and write the status file.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(data_validation_config)
    data_validation.validation()
except Exception as e:
    # Chain explicitly so the original exception is preserved as __cause__
    # on the project-specific error.
    raise CustomException(str(e), sys) from e

[2025-05-26 16:22:21,769: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-05-26 16:22:21,773: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-05-26 16:22:21,780: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-05-26 16:22:21,782: INFO: helpers: created directory at: artifacts]
[2025-05-26 16:22:21,784: INFO: helpers: created directory at: artifacts/data_validation]
[2025-05-26 16:22:21,875: INFO: 3317476208: Starting validation for data with shape: (51000, 12)]
[2025-05-26 16:22:21,875: INFO: 3317476208: column_presence: PASSED]
[2025-05-26 16:22:21,875: INFO: 3317476208: data_types: PASSED]
[2025-05-26 16:22:21,875: INFO: 3317476208: Overall validation status: PASSED]
