In [1]:
import os
import sys

In [2]:
# Move the working directory one level up from wherever the kernel currently is
# (presumably from the notebook folder to the project root — TODO confirm).
# NOTE: the path is relative, so every re-run of this cell climbs one more level;
# restart the kernel before running the notebook again.
os.chdir('../')

In [3]:
# Make the "src" directory under the current working directory importable.
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path a second time.
src_dir = os.path.join(os.getcwd(), "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
from WattPredictor.utils.helpers import *
from WattPredictor.constants import *
from WattPredictor.utils.exception import *
from WattPredictor.utils.logging import logger
from pathlib import Path
from dataclasses import dataclass
import pandas as pd

In [5]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    The annotations below are not enforced at runtime (dataclasses do not
    type-check), so each field holds whatever value the builder passes in.
    """
    # Directory for this step; ConfigurationManager.get_data_validation_config
    # creates it before building this object.
    root_dir: Path
    # CSV file that Validation.validator reads with pandas.read_csv.
    data_file: Path
    # File that Validation.validator writes the JSON validation status to.
    status_file: Path
    # Mapping of column name -> expected type name, iterated by the checks.
    all_schema: dict

In [6]:
class ConfigurationManager:
    """Load the YAML configuration files and build per-step config objects."""

    def __init__(
        self, 
        config_filepath=CONFIG_PATH, 
        params_filepath=PARAMS_PATH, 
        schema_filepath=SCHEMA_PATH
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings from the loaded YAML.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            DataValidationConfig whose path fields are ``pathlib.Path`` objects,
            matching the types the dataclass declares.
        """
        config = self.config.data_validation
        schema = self.schema.columns

        # The helper receives the raw configured value, exactly as before.
        create_directories([config.root_dir])

        # YAML yields plain values; convert them so the fields really are the
        # Path objects that DataValidationConfig annotates.
        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            status_file=Path(config.status_file),
            data_file=Path(config.data_file),
            all_schema=schema,
        )

In [7]:
import os
import json
import pandas as pd
from WattPredictor.utils.logging import logger
from WattPredictor.entity.config_entity import DataValidationConfig
from WattPredictor.utils.helpers import create_directories
from WattPredictor.utils.exception import CustomException

class Validation:
    """Run schema and data-quality checks on a CSV file and record the verdict.

    The checks are: required columns present, column dtypes match the schema,
    no missing values, and duplicate rows (which are removed, not rejected).
    """

    # Schema type name -> pandas dtype names accepted for it. Text columns may
    # surface as 'object' or as one of the dedicated pandas string dtypes
    # ('str' / 'string') depending on the pandas version and options in use.
    _DTYPE_ALIASES = {
        'int': ('int64', 'int32'),
        'float': ('float64', 'float32'),
        'object': ('object', 'str', 'string'),
        'str': ('object', 'str', 'string'),
    }

    def __init__(self, config: DataValidationConfig):
        """Store the settings (data file, status file, schema) used by the checks."""
        self.config = config

    def validate_data_types(self, data: pd.DataFrame, schema: dict) -> bool:
        """Check that each schema column present in ``data`` has an allowed dtype.

        Columns listed in ``schema`` but absent from ``data`` are skipped here;
        ``validate_column_presence`` reports those. A schema type name that has
        no alias entry is compared literally against the dtype's string form.

        Args:
            data: Frame to inspect.
            schema: Mapping of column name -> expected type name.

        Returns:
            True when every inspected column matches, False otherwise. Every
            mismatch is logged, not only the first one encountered.
        """
        all_match = True

        for col, expected_type in schema.items():
            if col not in data.columns:
                continue

            actual_dtype = str(data[col].dtype)
            allowed_dtypes = self._DTYPE_ALIASES.get(expected_type, (expected_type,))

            if actual_dtype not in allowed_dtypes:
                logger.error(f"Column '{col}': Expected type '{expected_type}', got '{actual_dtype}'")
                # Keep scanning so one run surfaces all offending columns.
                all_match = False

        return all_match

    def validate_column_presence(self, data: pd.DataFrame, schema: dict) -> bool:
        """Check that every column named in ``schema`` exists in ``data``.

        Extra columns in ``data`` are allowed.

        Returns:
            True when nothing is missing, False (after logging) otherwise.
        """
        missing_cols = set(schema.keys()) - set(data.columns)

        if missing_cols:
            logger.error(f"Missing columns: {missing_cols}")
            return False
        return True

    def check_missing_values(self, data: pd.DataFrame) -> bool:
        """Check that ``data`` contains no null values.

        Returns:
            True when no column has nulls; otherwise logs the per-column null
            counts and returns False.
        """
        missing = data.isnull().sum()
        if missing.any():
            logger.error(f"Missing values detected:\n{missing[missing > 0]}")
            return False
        return True

    def check_duplicates(self, data: pd.DataFrame) -> bool:
        """Remove fully duplicated rows and persist the cleaned data.

        Side effects when duplicates exist: ``data`` is modified in place (the
        caller's frame loses the duplicate rows) and the result is written back
        over ``self.config.data_file``.

        Returns:
            Always True; duplicates are repaired rather than treated as a failure.
        """
        duplicates = data.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Duplicate rows detected: {duplicates}. Removing...")
            # In-place on purpose: the caller's frame is expected to be cleaned.
            data.drop_duplicates(inplace=True)
            logger.info(f"{duplicates} duplicate rows removed.")
            data.to_csv(self.config.data_file, index=False)
        return True

    def validator(self) -> bool:
        """Run every check on the configured data file and write the status file.

        Reads ``self.config.data_file`` as CSV, runs the four checks, logs each
        outcome, and writes ``{"validation_status": <bool>}`` as JSON to
        ``self.config.status_file``.

        Returns:
            True only if every check passed.
        """
        data = pd.read_csv(self.config.data_file)
        schema = self.config.all_schema

        logger.info(f"Starting validation for data with shape: {data.shape}")

        validation_results = {
            'column_presence': self.validate_column_presence(data, schema),
            'data_types': self.validate_data_types(data, schema),
            'missing_values': self.check_missing_values(data),
            'duplicates': self.check_duplicates(data)
        }

        is_valid = all(validation_results.values())

        # A bare file name has no directory part; skip creation in that case
        # instead of handing an empty path to the directory helper.
        status_dir = os.path.dirname(self.config.status_file)
        if status_dir:
            create_directories([status_dir])

        for check, result in validation_results.items():
            logger.info(f"{check}: {'PASSED' if result else 'FAILED'}")

        logger.info(f"Overall validation status: {'PASSED' if is_valid else 'FAILED'}")

        with open(self.config.status_file, 'w') as f:
            json.dump({"validation_status": is_valid}, f, indent=4)

        return is_valid

In [8]:
# Driver: load the configuration, build the validation settings, and run all
# checks. validator() also writes the JSON status file.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = Validation(data_validation_config)
    # The boolean returned by validator() is discarded here; the outcome is
    # visible through the log lines and the status file.
    data_validation.validator()
except Exception as e:
    # Re-raise as the project's CustomException, chaining the original error
    # so its traceback is preserved.
    raise CustomException(str(e), sys) from e

[2025-07-16 11:45:27,663: INFO: helpers: yaml file: config_file\config.yaml loaded successfully]
[2025-07-16 11:45:27,670: INFO: helpers: yaml file: config_file\params.yaml loaded successfully]
[2025-07-16 11:45:27,676: INFO: helpers: yaml file: config_file\schema.yaml loaded successfully]
[2025-07-16 11:45:27,679: INFO: helpers: created directory at: artifacts]
[2025-07-16 11:45:27,679: INFO: helpers: created directory at: artifacts/data_validation]
[2025-07-16 11:45:27,796: INFO: 4265189487: Starting validation for data with shape: (41184, 13)]
[2025-07-16 11:45:27,897: INFO: 4265189487: 1639 duplicate rows removed.]
[2025-07-16 11:45:28,168: INFO: helpers: created directory at: artifacts/data_validation]
[2025-07-16 11:45:28,168: INFO: 4265189487: column_presence: PASSED]
[2025-07-16 11:45:28,168: INFO: 4265189487: data_types: PASSED]
[2025-07-16 11:45:28,168: INFO: 4265189487: missing_values: PASSED]
[2025-07-16 11:45:28,168: INFO: 4265189487: duplicates: PASSED]
[2025-07-16 11:45: