In [None]:
#Import necessary libraries
import os
import pandas as pd
from dataclasses import dataclass
from pathlib import Path
from src.constants import *
from src.utils.common import read_yaml, create_directories
from src.logging import logger


In [None]:
#Show the current working directory (IPython %pwd magic)
%pwd

In [None]:
#Move one level up from the current working directory and show the result
#NOTE: the path is relative, so re-running this cell moves up one more level each time
os.chdir("../")
%pwd

In [None]:
#Immutable settings container for the Data Validation component
@dataclass(frozen=True)
class DataValidationConfig:
    """Settings consumed by the Data Validation component.

    Instances are frozen: assigning to a field after construction raises
    an error.
    """

    #Directory passed to create_directories() before this config is built
    root_dir: Path
    #Path handed to pd.read_csv() when the dataset columns are validated
    unzip_data_dir: Path
    #File that receives the "Validation Status: ..." line
    status_file: str
    #Column schema taken from the COLUMNS section of the schema file
    all_schema: dict

In [None]:
#Configuration Manager: loads the YAML settings and builds component configs
class ConfigurationManager:
    """Load the config, params and schema files and build component configs."""

    def __init__(
        self,
        config_filePath = CONFIG_FILE_PATH,
        params_filePath = PARAMS_FILE_PATH,
        schema_filePath = SCHEMA_FILE_PATH):
        """Read the three settings files and prepare the artifacts root.

        Args:
            config_filePath: Location of the main configuration file.
            params_filePath: Location of the parameters file.
            schema_filePath: Location of the schema file.
        """
        self.config = read_yaml(config_filePath)
        self.params = read_yaml(params_filePath)
        self.schema = read_yaml(schema_filePath)

        #presumably creates the directory if it is missing - confirm in src.utils.common
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the DataValidationConfig from the loaded settings.

        Reads the ``data_validation`` section of the config and the
        ``COLUMNS`` section of the schema, passes the component's root
        directory to ``create_directories`` and returns the assembled config.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            unzip_data_dir=validation_section.unzip_data_dir,
            status_file=validation_section.status_file,
            all_schema=column_schema,
        )
        

In [None]:
#Data Validation component: checks the dataset columns against the schema
class DataValidation:
    """Validate the columns of the ingested dataset against the schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the configuration used by the validation steps.

        Args:
            config: Provides ``unzip_data_dir`` (CSV path), ``status_file``
                (where the result is written) and ``all_schema`` (mapping
                whose keys are the expected column names).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Check that every column of the dataset is declared in the schema.

        The result is written once to ``config.status_file`` as
        ``"Validation Status: <True|False>\\n"``.

        Returns:
            True when the dataset has at least one column and all of its
            columns appear in the schema, otherwise False.

        Raises:
            Any exception raised by ``pd.read_csv`` or by opening the status
            file propagates unchanged to the caller.
        """
        #Only the header is needed, so skip parsing the data rows
        header = pd.read_csv(self.config.unzip_data_dir, nrows=0)
        dataset_columns = list(header.columns)
        expected_columns = set(self.config.all_schema.keys())

        #Evaluate all columns together: a single unexpected column must fail
        #the validation no matter where it appears in the file
        validation_status = bool(dataset_columns) and all(
            column in expected_columns for column in dataset_columns
        )

        #Write the final verdict once instead of rewriting the file per column
        with open(self.config.status_file, "w", encoding="utf-8") as status_file:
            status_file.write(f"Validation Status: {validation_status}\n")

        return validation_status

In [None]:
#Pipeline cell: run the Data Validation stage end to end
#NOTE: any exception is logged with its traceback and NOT re-raised, so the cell finishes without an error even when validation could not run
try:
    config = ConfigurationManager()  #Load the config, params and schema files using the default paths
    data_validation_config = config.get_data_validation_config()  #Build the DataValidationConfig
    data_validation = DataValidation(config= data_validation_config)  #Create the validation component
    data_validation.validate_all_columns()  #Check the dataset columns against the schema (return value is not used here)
except Exception as e:
    logger.exception(e)