In [1]:
import os

In [2]:
# Move the working directory up one level so the relative paths used in later
# cells (e.g. "artifacts/...") resolve from the parent directory.
# NOTE(review): the target is relative to the current directory, so re-running
# this cell moves up one more level each time — run it once per kernel session.
os.chdir("../")

In [3]:
# Show the current working directory to confirm where relative paths resolve from.
%pwd

'/Users/mani/Desktop/mlops/datascienceproject'

In [4]:
import pandas as pd

# Load the red-wine quality CSV from the data_ingestion artifacts folder
# (path is relative to the current working directory) and preview the first rows.
data=pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# `data.isnull` without parentheses only references the method and checks nothing;
# call it and total the missing cells per column so the null check actually runs.
print(data.isnull().sum())

# Last expression of the cell: (rows, columns) of the loaded frame.
data.shape

(1599, 12)

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    """Settings bundle for the data validation stage.

    Attributes:
        root_dir: Root directory of the data validation stage.
        STATUS_FILE: Path of the file the validation status is written to.
        unzip_data_dir: Location of the data that gets validated.
        all_schema: Schema details for every column, as defined in schema.yaml.
    """

    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict  # full column schema pulled from schema.yaml

In [9]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml, create_directories

In [13]:
# ConfigurationManager: loads the project's YAML files and builds per-stage config objects.
class ConfigurationManager:
    """Read the config, params and schema YAML files and hand out stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
            schema_filepath: Path of the schema YAML file.
        """
        # Each file is parsed with read_yaml and kept on the instance under
        # its own attribute, in config -> params -> schema order.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute_name, yaml_path in yaml_sources:
            setattr(self, attribute_name, read_yaml(yaml_path))

        # The artifacts root named in the main config must exist up front.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data validation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataValidationConfig filled from the ``data_validation`` section
            of the main config and the ``COLUMNS`` section of the schema file.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        # Make sure the stage's own directory exists before handing out the config.
        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
        )


In [11]:
import os
from src.datascience import logger

In [None]:
# DataValidation: checks the columns of a CSV file against the expected schema.
class DataValidation:
    """Validate that every column of a dataset is declared in the schema."""

    def __init__(self, config: DataValidationConfig):
        """Keep the validation settings for later use.

        Args:
            config: Provides ``unzip_data_dir`` (the CSV file to read),
                ``all_schema`` (a mapping keyed by expected column name) and
                ``STATUS_FILE`` (where the validation result is written).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Check that all columns of the CSV file appear in the schema.

        The outcome is written to ``STATUS_FILE`` as
        ``"Validation status: <True|False>"`` and also returned.

        Returns:
            True when every column of the CSV is a key of ``all_schema``,
            False as soon as at least one column is not.

        Raises:
            Exception: Any error raised while reading the CSV or writing the
                status file propagates unchanged to the caller.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        expected_columns = self.config.all_schema.keys()

        # Collect every offending column first. Deciding per column inside the
        # loop let a valid *last* column overwrite an earlier failure.
        unexpected_columns = [col for col in data.columns if col not in expected_columns]
        validation_status = not unexpected_columns

        # Write the status once, in the same format as before, so anything
        # that reads the status file keeps working.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status
