In [1]:
import os 
import pandas as pd 
import numpy as np 

In [2]:
# Show the starting directory, move one level up, then show the result.
# NOTE: os.chdir('../') is relative to the current working directory, so
# re-running this cell moves up one more level each time; run it once per
# kernel session.
%pwd
os.chdir('../')
%pwd

'd:\\pythonProjects\\SurgeSense'

In [3]:
# Load the cleaned cab dataset. The path is relative, so it is resolved
# against the current working directory set by the os.chdir cell.
data=pd.read_csv('artifacts/data_ingestion/cab_data/cleaned_dataset.csv')
# Preview the first rows of the frame.
data.head()

Unnamed: 0,distance,cab_type,destination,source,price,surge_multiplier,name,date_time,temp,location,clouds,pressure,rain,humidity,wind,day,hour,month
0,0.44,Lyft,North Station,Haymarket Square,5.0,1.0,Shared,2018-12-16 09:30:07.890000105,38.46,Haymarket Square,0.29,1022.25,0.0,0.76,7.68,6,9,12
1,0.44,Lyft,North Station,Haymarket Square,11.0,1.0,Lux,2018-11-27 02:00:23.677000046,44.31,Haymarket Square,1.0,1003.17,0.1123,0.9,13.69,1,2,11
2,0.44,Lyft,North Station,Haymarket Square,11.0,1.0,Lux,2018-11-27 02:00:23.677000046,43.82,Haymarket Square,0.99,1002.59,0.0997,0.89,11.57,1,2,11
3,0.44,Lyft,North Station,Haymarket Square,26.0,1.0,Lux Black XL,2018-11-30 04:53:02.749000072,35.08,Haymarket Square,0.0,1013.71,0.0,0.7,5.25,4,4,11
4,0.44,Lyft,North Station,Haymarket Square,9.0,1.0,Lyft XL,2018-11-29 03:49:20.223000050,37.58,Haymarket Square,0.42,998.64,0.0,0.71,11.3,3,3,11


In [4]:
# Count missing values per column.
data.isnull().sum()

distance            0
cab_type            0
destination         0
source              0
price               0
surge_multiplier    0
name                0
date_time           0
temp                0
location            0
clouds              0
pressure            0
rain                0
humidity            0
wind                0
day                 0
hour                0
month               0
dtype: int64

In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(1164996, 18)

In [6]:
# Column labels of the loaded frame.
data.columns

Index(['distance', 'cab_type', 'destination', 'source', 'price',
       'surge_multiplier', 'name', 'date_time', 'temp', 'location', 'clouds',
       'pressure', 'rain', 'humidity', 'wind', 'day', 'hour', 'month'],
      dtype='object')

In [7]:
# Entity: immutable settings object for the data-validation stage
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage."""

    # Directory created for the validation stage's outputs.
    root_dir: Path
    # File the validation verdict is written to.
    STATUS_FILE: str
    # CSV file that is read and validated (handed to pd.read_csv).
    unzip_data_dir: Path
    # Expected columns: mapping of column name to expected dtype.
    all_schema: dict

In [8]:
# Configuration manager: reads the YAML files and builds the stage config entity
from SurgeSense.constants import * 
from SurgeSense.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 param_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Parse the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            param_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Files are parsed in parameter order: config, params, schema.
        self.config = read_yaml(config_filepath)
        self.param = read_yaml(param_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root comes from the main configuration file.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the configuration and the ``COLUMNS`` section of the
            schema.
        """
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            unzip_data_dir=validation_section.unzip_data_dir,
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            all_schema=column_schema,
        )
        

In [37]:
# Component: checks the dataset's columns and dtypes against the schema
import os 
from SurgeSense import logger 


class DataValidation:
    """Checks a CSV file's columns and dtypes against the configured schema."""

    def __init__(self, config: DataValidationConfig):
        """Keep the validation settings.

        Args:
            config: Supplies ``unzip_data_dir`` (the CSV to read),
                ``STATUS_FILE`` (where the verdict is written) and
                ``all_schema`` (column name -> expected dtype).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate column names, column order and dtypes of the dataset.

        The CSV at ``config.unzip_data_dir`` is read in full. Validation
        passes only when its column names equal the schema's keys in the same
        order and every column's dtype compares equal to the schema's value
        for that column. The verdict is written to ``config.STATUS_FILE`` as
        ``Validation status: <True|False>`` and logged; on failure the
        offending columns/dtypes are logged as well.

        Returns:
            True when the dataset matches the schema, otherwise False.

        Raises:
            Any exception from reading the CSV or writing the status file
            propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        actual_dtypes = data.dtypes.to_dict()
        expected_dtypes = self.config.all_schema

        actual_columns = list(actual_dtypes.keys())
        expected_columns = list(expected_dtypes.keys())

        if actual_columns != expected_columns:
            # Report why the name check failed instead of a bare False.
            missing = [col for col in expected_columns if col not in actual_dtypes]
            unexpected = [col for col in actual_columns if col not in expected_dtypes]
            if missing or unexpected:
                logger.warning(
                    f'Column mismatch: missing={missing}, unexpected={unexpected}'
                )
            else:
                logger.warning(
                    f'Column order mismatch: expected {expected_columns}, '
                    f'got {actual_columns}'
                )
            validate_status = False
        else:
            # Equal name lists already imply equal column counts, so only the
            # dtypes remain to be checked. `==` is used (rather than `!=`) to
            # keep the original dtype-vs-schema comparison semantics.
            dtype_mismatches = {
                col: (str(actual_dtypes[col]), str(expected_dtypes[col]))
                for col in expected_columns
                if not (actual_dtypes[col] == expected_dtypes[col])
            }
            if dtype_mismatches:
                logger.warning(
                    f'Dtype mismatch (actual, expected): {dtype_mismatches}'
                )
            validate_status = not dtype_mismatches

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f'Validation status: {validate_status}')
        logger.info(f'Validation status: {validate_status}')
        return validate_status
            

In [38]:
# Pipeline: build the configuration, create the component and run the validation
# Run the validation stage end to end. Exceptions propagate on their own, so
# no try/except wrapper that only re-raises is needed.
config=ConfigurationManager()
data_validation_config=config.get_data_validation_config()
data_validation=DataValidation(config=data_validation_config)
# Trailing ';' suppresses the cell's Out[...] display; the verdict is already
# logged and written to the status file by validate_all_columns().
data_validation.validate_all_columns();

[2025-03-20 07:09:06,544: INFO :common : yaml file: config\config.yaml loaded successfully]
[2025-03-20 07:09:06,546: INFO :common : yaml file: params.yaml loaded successfully]
[2025-03-20 07:09:06,549: INFO :common : yaml file: schema.yaml loaded successfully]
[2025-03-20 07:09:06,550: INFO :common : created directory at: artifacts]
[2025-03-20 07:09:06,552: INFO :common : created directory at: artifacts/data_validation]
[2025-03-20 07:09:08,709: INFO :542472409 : Validation status: True]
