In [2]:
import os 
# Show the directory the notebook kernel starts in.
%pwd

'/home/mrafi/Desktop/Books/Bootcamp/E2EMLOps/18-e2e/MLops_Ds1/research'

In [3]:
# Step up from research/ to the project root so that the relative data path and
# the `src...` imports used in later cells resolve.
# The guard keeps this cell idempotent: re-running it no longer climbs one more
# directory above the project root each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir in the previous cell.
%pwd

'/home/mrafi/Desktop/Books/Bootcamp/E2EMLOps/18-e2e/MLops_Ds1'

In [5]:
import pandas as pd

# The file uses ';' as its field delimiter, so the separator is passed explicitly.
data = pd.read_csv(
    "artifacts/data_ingestion/winequality-red.csv",
    sep=";",
)

In [6]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [7]:
# Number of missing values in each column.
data.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
# Look up the dtype pandas inferred for one column.
data.dtypes["fixed acidity"]

dtype('float64')

In [9]:
# Map every column name to its dtype rendered as a plain string (e.g. "float64").
# Built in a single step: the earlier intermediate assignment was overwritten
# immediately and never used.
dtps = {column: str(dtype) for column, dtype in data.dtypes.items()}

In [10]:
# Display the column-name -> dtype-string mapping.
dtps

{'fixed acidity': 'float64',
 'volatile acidity': 'float64',
 'citric acid': 'float64',
 'residual sugar': 'float64',
 'chlorides': 'float64',
 'free sulfur dioxide': 'float64',
 'total sulfur dioxide': 'float64',
 'density': 'float64',
 'pH': 'float64',
 'sulphates': 'float64',
 'alcohol': 'float64',
 'quality': 'int64'}

In [14]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_dataingestion_config``
    from the ``data_ingestion`` section of the project configuration.
    """

    # Stage directory; ConfigurationManager calls create_dir on it before
    # building this object.
    root_dir: Path
    # Presumably the download location of the raw dataset -- confirm in config.yaml.
    source_URL: str
    # Presumably where the downloaded file is stored locally -- confirm.
    local_data_file: Path
    # Presumably the directory the archive is extracted into -- confirm.
    unzip_dir: Path

In [12]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfig:
    """Settings for the data-validation stage.

    Instances are built by ``ConfigurationManager.get_datavalidation_config``
    from the ``data_validation`` section of the project configuration plus the
    ``COLUMNS`` section of the schema file.
    """

    # Stage directory; ConfigurationManager calls create_dir on it before
    # building this object.
    root_dir: Path
    # File that DataValidation.validate_all_columns writes its outcome to.
    STATUS_FILE: Path
    # Despite the name, this value is passed straight to pd.read_csv, i.e. it
    # is used as the path of the CSV file to validate.
    unzip_data_dir: Path
    # Mapping of expected column name -> dtype name, compared against the
    # stringified pandas dtypes of the CSV.
    all_schema: dict

In [15]:
from src.data_science.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH
from src.data_science.utils.common import read_yaml, create_dir
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects.

    The parsed config, params and schema are kept on the instance as
    ``self.config``, ``self.params`` and ``self.schema``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
    ):
        """Read the three YAML files and prepare the top-level artifacts directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_file_path: Path of the schema YAML.
            params_file_path: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        # NOTE(review): "artifacts_toot" looks like a typo for "artifacts_root",
        # but it has to match the key actually present in the config YAML --
        # confirm there before renaming it here.
        create_dir([self.config.artifacts_toot])

    def get_dataingestion_config(self) -> DataIngestionConfig:
        """Return the data-ingestion settings, calling create_dir on the stage directory first."""
        ingestion_section = self.config.data_ingestion
        create_dir([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )

    def get_datavalidation_config(self) -> DataValidationConfig:
        """Return the data-validation settings, calling create_dir on the stage directory first.

        The expected columns are taken from the ``COLUMNS`` section of the schema file.
        """
        validation_section = self.config.data_validation
        expected_columns = self.schema.COLUMNS
        create_dir([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=expected_columns,
        )

In [16]:
from urllib import request
from src.data_science import logger
import zipfile
import pandas as pd
class DataValidation:
    """Check an ingested CSV file against the expected column/dtype schema.

    The outcome is written to ``config.STATUS_FILE`` and is also returned by
    ``validate_all_columns`` so callers do not need to re-read that file.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``unzip_data_dir`` (path of the CSV to
                read), ``STATUS_FILE`` (path of the status file to write) and
                ``all_schema`` (mapping of column name -> dtype name).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Verify that every schema column exists in the CSV with the expected dtype.

        The semicolon-separated file at ``config.unzip_data_dir`` is loaded and
        each ``column -> dtype name`` entry of ``config.all_schema`` is compared
        with the stringified dtype pandas inferred for that column. Checking
        stops at the first missing or mismatched column. Columns that are in
        the CSV but not in the schema are not checked.

        Returns:
            True when every schema column is present with a matching dtype
            (an empty schema therefore passes), False otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir, sep=";")
        actual_dtypes = {column: str(dtype) for column, dtype in data.dtypes.items()}

        for column, expected_dtype in self.config.all_schema.items():
            # Membership is tested first so a missing column is reported as a
            # validation failure instead of raising KeyError on the lookup.
            if column not in actual_dtypes or actual_dtypes[column] != expected_dtype:
                validation_status = False
                logger.info(f"Missing column {column} in input file or column type mismatch")
                with open(self.config.STATUS_FILE, "w") as f:
                    f.write(f" Issue with {column} in input data, setting Validation Status:{validation_status}")
                return validation_status

        validation_status = True
        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation Sucessful. Validation Status:{validation_status}")
        logger.info("Input file passed validation")
        return validation_status
    

In [17]:
# Run the validation stage end to end: load the YAML configuration, build the
# stage settings, then check the CSV against the schema.
# The former try/except wrapper only re-raised the caught exception, so it is
# dropped; any error still propagates, now with an unmodified traceback.
config = ConfigurationManager()
data_validation_config = config.get_datavalidation_config()
data_validation = DataValidation(data_validation_config)
data_validation.validate_all_columns()


[2025-05-03 07:25:47,187: INFO: common: config/config.yaml file loaded sucessfully]
[2025-05-03 07:25:47,189: INFO: common: params.yaml file loaded sucessfully]
[2025-05-03 07:25:47,192: INFO: common: schema.yaml file loaded sucessfully]
[2025-05-03 07:25:47,193: INFO: common: Directory artifacts created]
[2025-05-03 07:25:47,194: INFO: common: Directory artifacts/data_validation created]
[2025-05-03 07:25:47,200: INFO: 3693007339: Input file passed validation]
