In [1]:
# Show the kernel's current working directory (the recorded output is the project's research/ folder).
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow\\research'

In [2]:
import os

os.chdir('../')
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow'

#### 1. Update config.yaml

- Open the `config/config.yaml` file and add the `data_validation` configuration

#### 2. Update schema.yaml

* The schema file contains:
- The full list of columns in your dataset
- Each column's name and dtype

In [3]:
import pandas as pd

# Load the ingested loan dataset; the path is relative to the current working directory.
loan_df = pd.read_csv('artifacts/data_ingestion/Loan_Default.csv')
# Independent copy of the loaded frame; it is not referenced again in the cells shown here.
loan_df_copy = loan_df.copy()


In [4]:
# Spot-check the dtype pandas inferred for a single column (presumably to fill in schema.yaml).
loan_df['Credit_Score'].dtype

dtype('int64')

In [5]:
# Print every column with its pandas-inferred dtype, one "name: dtype" pair per line.
for col in loan_df.columns:
    print(f'{col}: {loan_df[col].dtype}')

ID: int64
year: int64
loan_limit: object
Gender: object
approv_in_adv: object
loan_type: object
loan_purpose: object
Credit_Worthiness: object
open_credit: object
business_or_commercial: object
loan_amount: int64
rate_of_interest: float64
Interest_rate_spread: float64
Upfront_charges: float64
term: float64
Neg_ammortization: object
interest_only: object
lump_sum_payment: object
property_value: float64
construction_type: object
occupancy_type: object
Secured_by: object
total_units: object
income: float64
credit_type: object
Credit_Score: int64
co-applicant_credit_type: object
age: object
submission_of_application: object
LTV: float64
Region: object
Security_Type: object
Status: int64
dtir1: float64


## 3) Update entity

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidatioConfig:
    """Immutable bundle of settings consumed by the data-validation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.

    Attributes:
        root_dir (Path): Working directory of the validation stage; it is
            created by ``ConfigurationManager.get_validation_config``.
        unzip_data_dir (Path): Location handed to ``pd.read_csv`` by
            ``DataValidation``, i.e. the CSV file to validate (despite the
            ``_dir`` suffix in the name).
        STATUS_FILE (str): File that ``DataValidation`` opens in write mode
            to record the validation outcome.
        all_schema (dict): Mapping of column name to expected dtype, taken
            from the ``COLUMNS`` section of the schema file.

    Note:
        The annotations are not enforced at runtime; values are stored
        exactly as passed in.
    """

    root_dir: Path
    unzip_data_dir: Path
    STATUS_FILE: str
    all_schema: dict
    

## 4) Update the configuration manager in src config

In [7]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Central access point for the project's YAML-driven settings.

    On construction the three YAML files (main config, parameters, schema)
    are loaded with ``read_yaml`` and the top-level artifacts directory is
    created. Stage-specific getters then turn sections of those files into
    typed configuration objects.

    Attributes:
        config: Loaded main configuration (read by attribute access below).
        params: Loaded parameters file.
        schema: Loaded schema file.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
        ):
        """Load all YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Main configuration YAML. Defaults to ``CONFIG_FILE_PATH``.
            params_filepath: Parameters YAML. Defaults to ``PARAMS_FILE_PATH``.
            schema_filepath: Schema YAML. Defaults to ``SCHEMA_FILE_PATH``.
        """
        # Loaded in this order: config first, then params, then schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_validation_config(self) -> DataValidatioConfig:
        """Build the settings object for the data-validation stage.

        Reads the ``data_validation`` section of the main config and the
        ``COLUMNS`` section of the schema, creates the stage's root
        directory, and packs everything into a ``DataValidatioConfig``.

        Returns:
            DataValidatioConfig: Paths and column schema for validation.
        """
        validation_cfg = self.config.data_validation
        column_schema = self.schema.COLUMNS

        # The stage writes into root_dir, so it has to exist up front.
        create_directories([validation_cfg.root_dir])

        return DataValidatioConfig(
            root_dir=validation_cfg.root_dir,
            unzip_data_dir=validation_cfg.unzip_data_dir,
            STATUS_FILE=validation_cfg.STATUS_FILE,
            all_schema=column_schema,
        )

# 6. Update the components

In [9]:
import os
from mlproject import logger
from typing import Tuple

In [None]:
class DataValidation:
    """Validates an ingested CSV dataset against the configured column schema.

    The check covers three things: the column names must match the schema,
    every column's dtype must match the dtype the schema declares for that
    column, and the dataset must not contain duplicate rows.
    """

    def __init__(self, config: "DataValidatioConfig"):
        """Store the validation settings.

        Args:
            config (DataValidatioConfig): Provides ``unzip_data_dir`` (CSV to
                read), ``STATUS_FILE`` (where results are written) and
                ``all_schema`` (mapping of column name to expected dtype).
        """
        self.config = config

    def validate_all_columns(self) -> Tuple[bool, bool, bool]:
        """Run all validation checks and record the outcome in the status file.

        Checks performed:
            1. Column names: the dataset and the schema must contain exactly
               the same columns (no unexpected and no missing columns).
            2. Dtypes: every dataset column must have the dtype the schema
               declares for that same column. A column without a schema
               entry counts as a mismatch.
            3. Duplicates: the dataset must not contain duplicate rows.

        The three results are written to ``config.STATUS_FILE`` (overwriting
        any previous content).

        Returns:
            Tuple[bool, bool, bool]:
            - True if the column names match the schema, False otherwise.
            - True if all column dtypes match the schema, False otherwise.
            - True if no duplicate rows exist, False otherwise.

        Raises:
            Exception: Errors from reading the CSV or writing the status file
                propagate unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        schema = self.config.all_schema

        # Compare the full sets of names. The status must reflect every
        # column, not only the last one inspected, and a column the schema
        # expects but the data lacks is a failure too.
        validation_status_col = set(data.columns) == set(schema.keys())

        # Compare each column with its own schema entry rather than checking
        # whether its dtype appears anywhere in the schema. is_dtype_equal
        # accepts dtype names given as strings and returns False for names it
        # cannot interpret.
        validation_status_dtype = all(
            col in schema and pd.api.types.is_dtype_equal(dtype, schema[col])
            for col, dtype in data.dtypes.items()
        )

        # bool() turns the numpy boolean into a plain Python bool.
        validation_status_duplicates = not bool(data.duplicated().any())

        # Write final results
        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Columns Validation status: {validation_status_col}\n")
            f.write(f"Dtypes Validation status: {validation_status_dtype}\n")
            f.write(f"No Duplicate Rows status: {validation_status_duplicates}\n")

        return validation_status_col, validation_status_dtype, validation_status_duplicates

# 7. Update the pipeline 

In [None]:
# Test run: execute the validation stage end to end with the default config files.
# No try/except wrapper here: catching an exception only to re-raise it adds
# nothing, and the notebook already shows the full traceback on failure.
config = ConfigurationManager()
data_validation_config = config.get_validation_config()
data_validation = DataValidation(data_validation_config)
# Last expression of the cell, so the returned
# (columns, dtypes, no-duplicates) tuple is displayed instead of discarded.
data_validation.validate_all_columns()

[2024-11-20 02:12:53,263: 35 mlprojectLogger: INFO: common: .yaml file: config\config.yaml loaded successfully.]
[2024-11-20 02:12:53,266: 35 mlprojectLogger: INFO: common: .yaml file: params.yaml loaded successfully.]
[2024-11-20 02:12:53,274: 35 mlprojectLogger: INFO: common: .yaml file: schema.yaml loaded successfully.]
[2024-11-20 02:12:53,276: 54 mlprojectLogger: INFO: common: Created directory at artifacts]
[2024-11-20 02:12:53,278: 54 mlprojectLogger: INFO: common: Created directory at artifacts/data_validation]


## EDA

* Performed in `research/EDA.ipynb`