# Data validation

In this notebook we perform the data validation step of our data pipeline. Data validation is the process of ensuring that the data we are working with is correct; here, that means checking that every required dataset file is present.

We first need to make sure that we are working in the correct directory: the working directory must be the project root, `mlopsProject`. Run the cell that changes the directory only once on your local machine, because every run moves one directory further up. If you want to rerun all cells, restart the kernel first.

In [1]:
import os

Assuming that `02_data_validation.ipynb` is in `mlopsProject/research`

In [2]:
# Move one level up from the notebook's folder so that the working directory
# becomes the project root. The path is relative to the current directory, so
# running this cell again climbs one more level each time.
os.chdir('../')

# Keep the resulting directory and print it so it can be checked by eye.
current_path = os.getcwd() 
print(current_path) # Should be /mlopsProject

/home/corti/Desktop/mlopsProject


In [3]:
from dataclasses import dataclass
from pathlib import Path

In [4]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data validation stage.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Root directory of the data validation stage.
    root_dir: Path
    # Path of the file the validation result is written to.
    STATUS_FILE: str
    # Names of the files that must all be present for validation to pass.
    ALL_REQUIRED_FILES: list

In [5]:
from ConversationSummarizer.constants import *
from ConversationSummarizer.utils.common import read_yaml, create_directories

In [6]:
# Define a class for managing configurations
class ConfigurationManager:
    # Initialize the ConfigurationManager with paths to the configuration and parameters files
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        # Read the configuration and parameters files
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        # Create the root directory for storing artifacts
        create_directories([self.config.artifacts_root])
        
    # Define a method for getting the data validation configuration
    def get_data_validation_config(self) -> DataValidationConfig:
        
        # Get the data validation configuration from the config file
        config = self.config.data_validation
        
        # Create the root directory for data validation, if it doesn't already exist
        create_directories([config.root_dir])
        
        # Create a DataIngestionConfig object with the configuration values
        data_validation_config = DataValidationConfig(
            root_dir = config.root_dir,
            STATUS_FILE = config.STATUS_FILE,
            ALL_REQUIRED_FILES = config.ALL_REQUIRED_FILES
        )
        
        # Return the DataValidationConfig object
        return data_validation_config

In [7]:
import os
from ConversationSummarizer.logging import logger

In [8]:
class DataValidation:
    """Check that the ingested dataset contains every required file."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object providing ``STATUS_FILE`` (path the result is
                written to) and ``ALL_REQUIRED_FILES`` (names that must be
                present).
        """
        self.config = config

    def validate_all_files_exists(self, data_dir=None) -> bool:
        """Verify that all required files exist and record the outcome.

        The result is written to ``config.STATUS_FILE`` as
        ``"Validation status: True"`` or ``"Validation status: False"``,
        replacing any previous content.

        Args:
            data_dir: Directory whose entries are checked. Defaults to
                ``artifacts/data_ingestion/samsum_dataset`` relative to the
                current working directory.

        Returns:
            True when no required file is missing, False otherwise.

        Raises:
            Exception: Any error raised while listing the directory or
                writing the status file is logged and re-raised unchanged.
        """
        if data_dir is None:
            data_dir = os.path.join("artifacts", "data_ingestion", "samsum_dataset")

        try:
            # A set makes each membership test below constant-time.
            present = set(os.listdir(data_dir))

            missing_files = [
                name for name in self.config.ALL_REQUIRED_FILES
                if name not in present
            ]

            # Validation passes only when nothing is missing.
            validation_status = not missing_files

            # Explicit encoding keeps the file identical across platforms.
            with open(self.config.STATUS_FILE, "w", encoding="utf-8") as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as e:
            # Log the error message and the stack trace, then let the
            # caller decide how to react.
            logger.error(f"An error occurred during file validation: {e}", exc_info=True)
            raise
        

In [9]:
# Run the data validation stage end to end: load the configuration, build the
# validator and check that the required files are present.
try:
    # Reading the configuration also creates the artifacts and data validation
    # directories (see ConfigurationManager).
    config = ConfigurationManager() 
    data_validation_config = config.get_data_validation_config()

    # The boolean result is not used here; the outcome is also written to the
    # status file by validate_all_files_exists().
    data_validation = DataValidation(config = data_validation_config)
    data_validation.validate_all_files_exists()

except Exception as e:
    # Log the failure and re-raise so the cell still stops with the original
    # traceback. Errors raised inside validate_all_files_exists() have already
    # been logged there, so those appear twice in the log.
    logger.error(f"An error occurred during data validation: {str(e)}")
    raise

[2024-01-31 23:03:58,634: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-31 23:03:58,637: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-31 23:03:58,640: INFO: common: created directory at: artifacts]
[2024-01-31 23:03:58,644: INFO: common: created directory at: artifacts/data_validation]
