In [1]:

import os

# Move from the notebook's folder up to the project root so that the `src`
# package and the relative config/data paths resolve. The guard makes the
# cell safe to re-run: once the working directory already contains `src`,
# a second execution no longer climbs one more level up.
if not os.path.isdir("src"):
    os.chdir("../")
import sys
from dataclasses import dataclass
from pathlib import Path
from src.constants.constants import CONFIG_PATH, PARAMS_PATH
from src.loging import logger
from src.utils.common import get_size, create_directories
from src.utils.common import read_yaml

In [2]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings for the data-validation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory belonging to the data-validation stage.
    root_dir: Path
    # Path of the text file that validation results are written to.
    STATUS_FILE: str
    # File names that the validation stage looks for.
    ALL_REQUIRED_FILES: list

In [8]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(self, config_path, params_path):
        """Load both YAML files and create the data-validation directory.

        Args:
            config_path: Location of the main configuration YAML file.
            params_path: Location of the parameters YAML file.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        # The stage directory is created up front, as soon as the manager exists.
        validation_root = self.config["data_validation"]["root_dir"]
        create_directories([validation_root])

    def get_data_validation_config(self):
        """Return a DataValidationConfig built from the "data_validation" section."""
        section = self.config["data_validation"]
        return DataValidationConfig(
            root_dir=Path(section["root_dir"]),
            STATUS_FILE=section["STATUS_FILE"],
            ALL_REQUIRED_FILES=section["ALL_REQUIRED_FILES"],
        )
        

In [21]:
class DataValidation:
    """Checks that every required data file is present and non-empty.

    The directory tree rooted at the parent of ``config.root_dir`` is searched
    recursively; files are matched against the required names by file name
    only, regardless of the sub-directory they live in. One line per checked
    file is written to the status file.
    """

    # The annotation is quoted so it is resolved lazily: defining this class
    # does not require the config class to exist at definition time.
    def __init__(self, config: "DataValidationConfig"):
        """Store the configuration and cache the fields used during validation."""
        self.config = config
        self.status_file = self.config.STATUS_FILE
        self.all_required_files = self.config.ALL_REQUIRED_FILES

    def _write_validation_status(self, filename, status):
        """Append one ``Validation for <filename> is <status>`` line to the status file."""
        with open(self.status_file, "a") as f:
            f.write(f"Validation for {filename} is {status}\n")

    def validate_all_files(self) -> bool:
        """Validate all required files and record the outcome in the status file.

        A required file passes when at least one file with that name exists in
        the searched tree and every file with that name has a size greater
        than zero. Required names that are not found anywhere are recorded as
        ``False``. All required files are checked even after a failure, so the
        status file is a complete report.

        Returns:
            True when every required file is present and non-empty (also when
            the list of required files is empty); False otherwise, or when an
            error occurs during validation (the error is logged).
        """
        try:
            logger.info("Start: Validate all files")

            # Truncate the status file so each run starts from an empty report.
            with open(self.status_file, "w"):
                pass

            required_names = set(self.all_required_files)
            # Materialise the listing before any status line is appended.
            candidate_files = [
                path
                for path in self.config.root_dir.parent.glob("**/*")
                if path.is_file()
            ]

            # Start from True so the result is always defined, even when no
            # file in the tree matches a required name.
            validation_status = True
            seen_names = set()
            for file in candidate_files:
                if file.name not in required_names:
                    continue
                seen_names.add(file.name)
                file_ok = file.stat().st_size > 0
                self._write_validation_status(file.name, file_ok)
                validation_status = validation_status and file_ok

            # A required file that never showed up in the tree is a failure;
            # dict.fromkeys de-duplicates the names while keeping their order.
            for required_name in dict.fromkeys(self.all_required_files):
                if required_name not in seen_names:
                    self._write_validation_status(required_name, False)
                    validation_status = False

            logger.info("End: Validate all files")
            return validation_status

        except Exception as e:
            logger.error(f"Error during validation: {e}")
            return False


In [22]:
# Run the data-validation stage end to end: load the configuration, build the
# validator and check the required files.
try:
    logger.info("Start: Data Validation")
    config_manager = ConfigurationManager(CONFIG_PATH, PARAMS_PATH)
    config = config_manager.get_data_validation_config()
    data_validation = DataValidation(config)
    # Keep the outcome instead of discarding it, so a failed validation is
    # visible in the log rather than looking like a normal run.
    validation_passed = data_validation.validate_all_files()
    logger.info(f"Data Validation status: {validation_passed}")
    logger.info("End: Data Validation")

except Exception as e:
    logger.error(f"Error: Data Validation failed: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

[2023-09-07 19:03:15,615]: INFO: 1842313794: Start: Data Validation]
[2023-09-07 19:03:15,621]: INFO: common: Successfully read yaml file from config\config.yaml]
[2023-09-07 19:03:15,625]: INFO: common: Successfully read yaml file from params.yaml]
[2023-09-07 19:03:15,628]: INFO: common: Created directory: data/data_validation]
[2023-09-07 19:03:15,630]: INFO: 1811115846: Start: Validate all files]
[WindowsPath('data/corpus.7z'), WindowsPath('data/README.md'), WindowsPath('data/data_ingestion/data.7z'), WindowsPath('data/data_ingestion/raw_data/licence.txt'), WindowsPath('data/data_ingestion/raw_data/README.txt'), WindowsPath('data/data_ingestion/raw_data/test.json'), WindowsPath('data/data_ingestion/raw_data/train.json'), WindowsPath('data/data_ingestion/raw_data/val.json'), WindowsPath('data/data_validation/status.txt')]
[2023-09-07 19:03:15,643]: INFO: 1811115846: End: Validate all files]
[2023-09-07 19:03:15,645]: INFO: 1842313794: End: Data Validation]
