In [1]:
import os

In [2]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret\\research'

In [3]:
# Step up from the notebook's own folder to its parent so that the relative
# paths used further down (artifacts/..., config/config.yaml, schema.yaml) resolve.
# NOTE(review): this cell is not idempotent - every re-run without a kernel
# restart moves the working directory up one more level.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\layeg\\Desktop\\GitHub\\Holland_Barret'

In [5]:
import pandas as pd

In [6]:
# Load the merged dataset produced by the ingestion step.
# The path is assembled with os.path.join instead of a backslash-separated raw
# string so the cell also resolves the file on non-Windows systems.
data = pd.read_csv(os.path.join("artifacts", "data_ingestion", "merged_data.csv"))
data.head()

Unnamed: 0,Transaction ID,Customer ID,Date,Total Items,Unique Items,Total Sales,Discounted Sales,Browsing Duration (minutes),Number of Clicks,Incomplete Transaction,Age,Gender,Region,Marital Status,Education,Household Income,Loyalty Card,Loyalty Points
0,TRID_21210,CID_12160,2020-03-22,8,3,22.88,7.54,3.11,7,0,19,Female,Rural,Divorced,High School,21000.0,0,
1,TRID_83725,CID_11410,2020-01-08,4,2,22.8,4.79,7.51,15,0,41,Female,Urban,Divorced,High School,20000.0,1,5.0
2,TRID_10532,CID_12776,2020-02-19,5,3,14.8,7.96,8.98,16,0,42,Female,Urban,Married,Graduate,9000.0,0,
3,TRID_88885,CID_9162,2020-02-17,5,3,13.04,6.98,5.61,11,0,77,Male,Urban,Divorced,High School,84500.0,1,21.0
4,TRID_68790,CID_14594,2020-01-06,14,4,9.6,9.25,6.62,14,1,30,Male,Rural,Single,Graduate,53500.0,0,


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction ID               5000 non-null   object 
 1   Customer ID                  5000 non-null   object 
 2   Date                         5000 non-null   object 
 3   Total Items                  5000 non-null   int64  
 4   Unique Items                 5000 non-null   int64  
 5   Total Sales                  5000 non-null   float64
 6   Discounted Sales             5000 non-null   float64
 7   Browsing Duration (minutes)  5000 non-null   float64
 8   Number of Clicks             5000 non-null   int64  
 9   Incomplete Transaction       5000 non-null   int64  
 10  Age                          5000 non-null   int64  
 11  Gender                       5000 non-null   object 
 12  Region                       5000 non-null   object 
 13  Marital Status    

In [8]:
data.isnull().sum()

Transaction ID                    0
Customer ID                       0
Date                              0
Total Items                       0
Unique Items                      0
Total Sales                       0
Discounted Sales                  0
Browsing Duration (minutes)       0
Number of Clicks                  0
Incomplete Transaction            0
Age                               0
Gender                            0
Region                            0
Marital Status                    0
Education                       407
Household Income                  0
Loyalty Card                      0
Loyalty Points                 2565
dtype: int64

In [9]:
data.shape

(5000, 18)

In [10]:
data.columns

Index(['Transaction ID', 'Customer ID', 'Date', 'Total Items', 'Unique Items',
       'Total Sales', 'Discounted Sales', 'Browsing Duration (minutes)',
       'Number of Clicks', 'Incomplete Transaction', 'Age', 'Gender', 'Region',
       'Marital Status', 'Education', 'Household Income', 'Loyalty Card',
       'Loyalty Points'],
      dtype='object')

In [11]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """
    Immutable bundle of settings consumed by the data validation step.

    Attributes:
    root_dir (Path): Directory belonging to the validation step; it is created
        by ConfigurationManager.get_data_validation_config.
    STATUS_FILE (str): File that DataValidation writes the validation status to.
    unzip_data_dir (Path): Location handed to pd.read_csv to load the dataset
        whose columns are checked.
    all_schema (dict): Mapping whose keys are the column names the dataset is
        allowed to contain.

    NOTE(review): the values are taken straight from the parsed YAML, so the
    fields annotated as Path may actually arrive as plain strings - confirm.
    """
    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict

In [12]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    """
    Loads the project's YAML files and builds configuration objects from them.

    Attributes:
    config: Parsed contents of the main configuration file.
    params: Parsed contents of the parameters file.
    schema: Parsed contents of the schema file.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """
        Read the three YAML files and make sure the artifacts root exists.

        Args:
        config_filepath: Location of the main configuration file.
        params_filepath: Location of the parameters file.
        schema_filepath: Location of the schema file.
        """
        # Files are read in the order config -> params -> schema, each result
        # being stored on the instance under the listed attribute name.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        # The artifacts root is taken from the main configuration file.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the settings object for the data validation step.

        The step's root directory is created as a side effect.

        Returns:
        DataValidationConfig: Settings assembled from the ``data_validation``
        section of the main configuration and the ``COLUMNS`` section of the schema.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=column_schema,
        )


In [14]:
import os
from mlProject import logger

In [15]:
class DataValidation:
    """
    Class for data validation tasks.

    Attributes:
    config (DataValidationConfig): The configuration for data validation.
    """

    # The annotation is quoted so that defining this class does not require
    # DataValidationConfig to already exist in the namespace.
    def __init__(self, config: "DataValidationConfig"):
        """
        Initialize DataValidation with a configuration object.

        Args:
        config (DataValidationConfig): The configuration for data validation.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """
        Check that every column of the dataset is declared in the schema.

        The dataset is read from ``config.unzip_data_dir`` and each of its
        column names is looked up among the keys of ``config.all_schema``.
        The outcome is written once to ``config.STATUS_FILE`` as
        ``Validation status: <True|False>`` and also returned.

        Returns:
        bool: True when all dataset columns appear in the schema, False as
        soon as at least one column is not declared there.

        Raises:
        Any exception raised while reading the dataset or writing the status
        file is propagated unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        schema_columns = set(self.config.all_schema.keys())

        # Collect every offending column first: deciding per column inside the
        # loop let a later valid column overwrite an earlier failure.
        unexpected_columns = [
            col for col in data.columns if col not in schema_columns
        ]
        validation_status = not unexpected_columns

        # Single write with the final verdict (same text format as before).
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status


In [16]:
# Build the validation settings from the YAML configuration files.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()

# Run the column check. The former try/except only re-raised the exception it
# caught, so errors propagate exactly as before without it; leaving the call
# as the last expression makes the notebook display the returned status.
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_all_columns()


[2024-02-28 11:16:51,214: INFO: common: YAML file loaded successfully from: config\config.yaml]
[2024-02-28 11:16:51,217: INFO: common: YAML file loaded successfully from: params.yaml]
[2024-02-28 11:16:51,219: INFO: common: YAML file loaded successfully from: schema.yaml]
[2024-02-28 11:16:51,220: INFO: common: Created directory at: artifacts]
[2024-02-28 11:16:51,222: INFO: common: Created directory at: artifacts/data_validation]
