In [1]:
import os
# Move the working directory one level up from the notebook's folder.
# NOTE: this is relative to the *current* directory, so re-running this cell
# without restarting the kernel moves up one more level each time.
os.chdir('../')
%pwd

'/home/paladin/Downloads/Consumer-Finance-Complaint-Analysis'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of locations used by the data-validation stage.

    The annotations below are not enforced at runtime; ``ConfigurationManager``
    in this notebook fills the path fields with plain strings built by
    ``os.path.join``.
    """

    # Root directory of the data-validation artifacts.
    root_dir: Path
    # Parquet location read as the input of the validation stage.
    feature_store_file_path: Path
    # Directory that receives the data kept after validation.
    accepted_data_dir: Path
    # Directory that receives the data removed during validation.
    rejected_data_dir: Path
    # File name used for the outputs written under the two directories above.
    file_name: str

In [3]:
from datetime import datetime
from financeComplaint.constants import *
from financeComplaint.utils import read_yaml_file, create_directories

In [4]:
class ConfigurationManager:
    """Loads the project YAML files and derives per-stage configuration objects.

    Every instance stamps itself with its creation time
    (``%Y-%m-%d-%H-%M-%S``); that stamp is used as a sub-directory name so each
    instance works in its own run directory.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 saved_modelpath=SAVED_MODEL_PATH,
                 ):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            saved_modelpath: Stored as-is on the instance; not used further here.
        """
        self.config = read_yaml_file(config_filepath)
        self.params = read_yaml_file(params_filepath)
        self.saved_modelpath = saved_modelpath

        create_directories([self.config.artifacts_root])

        # One timestamp per manager instance; reused by every stage config.
        self.timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation configuration for this run.

        The accepted/rejected directories live under
        ``<data_validation.ROOT_DIR>/<timestamp>/`` and are created here. The
        input parquet path is taken from the data-ingestion section of the
        configuration.

        Returns:
            DataValidationConfig: populated with plain string paths.
        """
        validation_cfg = self.config.data_validation

        run_dir = os.path.join(validation_cfg.ROOT_DIR, self.timestamp)
        accepted_dir = os.path.join(run_dir, 'accepted_data')
        rejected_dir = os.path.join(run_dir, 'rejected_data')

        ingestion_cfg = self.config.data_ingestion
        feature_store_path = os.path.join(ingestion_cfg.FEATURE_STORE_DIR,
                                          ingestion_cfg.FILE_NAME)

        create_directories([validation_cfg.ROOT_DIR, accepted_dir, rejected_dir])

        # root_dir is the stage root, not the timestamped run directory.
        return DataValidationConfig(
            root_dir=validation_cfg.ROOT_DIR,
            feature_store_file_path=feature_store_path,
            accepted_data_dir=accepted_dir,
            rejected_data_dir=rejected_dir,
            file_name=validation_cfg.FILE_NAME,
        )

In [5]:
import os
import sys
from dataclasses import dataclass
from financeComplaint.logger import logging
from financeComplaint.exception import CustomException
from financeComplaint.config.spark_manager import spark_session
from pyspark.sql.functions import lit, col
from financeComplaint.entity.schema import FinanceDataSchema
from financeComplaint.entity.artifact_entity import DataValidationArtifact
from pyspark.sql import DataFrame
from typing import List, Dict

23/10/12 15:54:17 WARN Utils: Your hostname, ds-xps resolves to a loopback address: 127.0.1.1; using 192.168.2.16 instead (on interface wlp2s0)
23/10/12 15:54:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/paladin/Downloads/Consumer-Finance-Complaint-Analysis/venv/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/paladin/.ivy2/cache
The jars for the packages stored in: /home/paladin/.ivy2/jars
com.amazonaws#aws-java-sdk added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f79a42e3-595b-4a4d-9069-c5348c381c32;1.0
	confs: [default]
	found com.amazonaws#aws-java-sdk;1.7.4 in central
	found commons-logging#commons-logging;1.1.1 in central
	found org.apache.httpcomponents#httpclient;4.2 in central
	found org.apache.httpcomponents#httpcore;4.2 in central
	found commons-codec#commons-codec;1.3 in central
	found com.fasterxml.jackson.core#jackson-core;2.1.1 in central
	found com.fasterxml.jackson.core#jackson-databind;2.1.1 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.1.1 in central
	found joda-time#joda-time;2.12.5 in central
	[2.12.5] joda-time#joda-time;[2.2,)
	found org.apache.hadoop#hadoop-aws;2.7.3 in central
	found org.apache.hadoop#hadoop-common;2.7.3 in ce

23/10/12 15:54:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
@dataclass(frozen=True)
class MissingReport:
    """Null-value statistics for a single column."""

    # Number of rows in the inspected DataFrame.
    total_row: int
    # Number of rows in which the column is null (a count, not a list of rows).
    missing_row: int
    # missing_row expressed as a percentage of total_row (0-100).
    missing_percentage: float

# Default value of DataValidation's ``table_name`` argument.
COMPLAINT_TABLE = "complaint"
# Name of the column added to rejected data to hold the rejection reason.
ERROR_MESSAGE = "error_msg"

class DataValidation:
    """Validates the ingested data and writes accepted / rejected outputs.

    The stage reads the feature-store parquet, drops columns that are either
    listed as unwanted by the schema or contain too many nulls (persisting the
    dropped columns under the rejected directory), checks that every required
    column is still present and finally writes the remaining data to the
    accepted directory.
    """

    def __init__(self, config: DataValidationConfig, table_name: str = COMPLAINT_TABLE, schema=FinanceDataSchema()):
        """Store the stage configuration, table name and schema.

        Args:
            config: Locations used by this stage.
            table_name: Stored on the instance; not used by the methods below.
            schema: Object exposing ``unwanted_columns`` and ``required_columns``.
                The default instance is created once, when the class is
                defined, and is shared by every ``DataValidation`` built without
                an explicit schema, so methods must never mutate it.
        """
        self.config = config
        self.table_name = table_name
        self.schema = schema

    def read_data(self) -> DataFrame:
        """Load the feature-store parquet and return a random sample of it.

        Returns:
            DataFrame: roughly 10 percent of the rows (``randomSplit`` is called
            without a seed, so the sample differs between runs).

        Raises:
            CustomException: wrapping any error raised while reading.
        """
        try:
            dataframe: DataFrame = spark_session.read.parquet(self.config.feature_store_file_path)
            logging.info(f"Data frame is created using file: {self.config.feature_store_file_path}")
            logging.info(f"Number of row: {dataframe.count()} and column: {len(dataframe.columns)}")
            # Why: only for cutting data size for testing
            # Here, only 10 percent of data will be used
            dataframe, _ = dataframe.randomSplit([0.1, 0.90])
            return dataframe

        except Exception as e:
            raise CustomException(e, sys)

    @staticmethod
    def get_missing_report(dataframe: DataFrame) -> Dict[str, MissingReport]:
        """Compute null statistics for every column of ``dataframe``.

        Args:
            dataframe: DataFrame to inspect.

        Returns:
            Mapping of column name to its ``MissingReport``. For an empty
            DataFrame the missing percentage is reported as ``0.0``.

        Raises:
            CustomException: wrapping any error raised by Spark.
        """
        try:
            missing_report: Dict[str, MissingReport] = dict()
            logging.info(f"Preparing missing reports for each column")
            number_of_row = dataframe.count()
            for column in dataframe.columns:
                # Column expression instead of a SQL string so that names which
                # are not bare SQL identifiers (e.g. containing spaces) work.
                missing_row = dataframe.filter(col(column).isNull()).count()
                # Guard against ZeroDivisionError on an empty DataFrame.
                missing_percentage = (missing_row * 100) / number_of_row if number_of_row else 0.0
                missing_report[column] = MissingReport(total_row=number_of_row,
                                                       missing_row=missing_row,
                                                       missing_percentage=missing_percentage
                                                       )
            logging.info(f"Missing report prepared: {missing_report}")
            return missing_report

        except Exception as e:
            raise CustomException(e, sys)

    def get_unwanted_and_high_missing_value_columns(self, dataframe: DataFrame, threshold: float= 0.3) -> List[str]:
        """List the columns that should be removed from ``dataframe``.

        Args:
            dataframe: DataFrame to inspect.
            threshold: Fraction of nulls (``0.3`` means 30 percent) above which
                a column is considered unusable.

        Returns:
            De-duplicated, sorted list made of the schema's unwanted columns
            plus every column whose null percentage exceeds the threshold.

        Raises:
            CustomException: wrapping any error raised while building the list.
        """
        try:
            missing_report: Dict[str, MissingReport] = self.get_missing_report(dataframe=dataframe)
            # Work on a copy: appending to self.schema.unwanted_columns directly
            # would silently grow the (possibly shared) schema's own list.
            unwanted_column = set(self.schema.unwanted_columns)
            for column, report in missing_report.items():
                if report.missing_percentage > (threshold * 100):
                    unwanted_column.add(column)
                    logging.info(f"Missing report {column}: [{report}]")
            # Sorted so the result (and the rejected file's column order) is deterministic.
            return sorted(unwanted_column)
        except Exception as e:
            raise CustomException(e, sys)

    def drop_unwanted_columns(self, dataframe: DataFrame) -> DataFrame:
        """Remove unwanted columns and persist them as rejected data.

        The removed columns, plus an ``ERROR_MESSAGE`` column holding the
        reason, are appended to
        ``<rejected_data_dir>/missing_data/<file_name>`` in parquet format.

        Args:
            dataframe: DataFrame to clean.

        Returns:
            DataFrame: ``dataframe`` without the unwanted columns.

        Raises:
            CustomException: wrapping any error raised while selecting, writing
            or dropping.
        """
        try:
            unwanted_columns: List = self.get_unwanted_and_high_missing_value_columns(dataframe=dataframe, )
            logging.info(f"Dropping feature: {','.join(unwanted_columns)}")
            unwanted_dataframe: DataFrame = dataframe.select(unwanted_columns)

            unwanted_dataframe = unwanted_dataframe.withColumn(ERROR_MESSAGE, lit("Contains many missing values"))

            rejected_dir = os.path.join(self.config.rejected_data_dir, "missing_data")
            os.makedirs(rejected_dir, exist_ok=True)
            file_path = os.path.join(rejected_dir, self.config.file_name)

            logging.info(f"Writing dropped column into file: [{file_path}]")
            unwanted_dataframe.write.mode("append").parquet(file_path)
            dataframe: DataFrame = dataframe.drop(*unwanted_columns)
            logging.info(f"Remaining number of columns: [{dataframe.columns}]")
            return dataframe
        except Exception as e:
            raise CustomException(e, sys)

    @staticmethod
    def get_unique_values_of_each_column(dataframe: DataFrame) -> None:
        """Log the distinct-value count and null percentage of every column.

        Args:
            dataframe: DataFrame to inspect.

        Raises:
            CustomException: wrapping any error raised by Spark.
        """
        try:
            # The row count does not change per column; compute it once.
            total_row: int = dataframe.count()
            for column in dataframe.columns:
                n_unique: int = dataframe.select(col(column)).distinct().count()
                n_missing: int = dataframe.filter(col(column).isNull()).count()
                # Guard against ZeroDivisionError on an empty DataFrame.
                missing_percentage: float = (n_missing * 100) / total_row if total_row else 0.0
                logging.info(f"Column: {column} contains {n_unique} value and missing perc: {missing_percentage} %.")
        except Exception as e:
            raise CustomException(e, sys)


    def is_required_columns_exist(self, dataframe: DataFrame):
        """Ensure every column required by the schema is present.

        Args:
            dataframe: DataFrame to check.

        Raises:
            CustomException: when at least one required column is absent (the
            message lists the expected and the found required columns).
        """
        try:
            columns = list(filter(lambda x: x in self.schema.required_columns,
                                  dataframe.columns))

            # Compare by name rather than by list length: duplicate column
            # names would otherwise mask a missing column or raise a false alarm.
            missing_columns = set(self.schema.required_columns) - set(columns)

            if missing_columns:
                raise Exception(f"Required column missing\n\
                 Expected columns: {self.schema.required_columns}\n\
                 Found columns: {columns}\
                 ")

        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Run the whole validation stage.

        Reads the data, drops unwanted columns, verifies the required columns
        and writes the result as parquet to
        ``<accepted_data_dir>/<file_name>``.

        Returns:
            DataValidationArtifact: the accepted file path and the rejected
            data directory.

        Raises:
            CustomException: wrapping any error raised by one of the steps.
        """
        try:
            logging.info(f"Initiating data preprocessing.")
            dataframe: DataFrame = self.read_data()

            logging.info(f"Dropping unwanted columns")
            dataframe: DataFrame = self.drop_unwanted_columns(dataframe=dataframe)

            # validation to ensure that all require column available
            self.is_required_columns_exist(dataframe=dataframe)

            logging.info("Saving preprocessed data.")
            print(f"Row: [{dataframe.count()}] Column: [{len(dataframe.columns)}]")
            print(f"Expected Column: {self.schema.required_columns}\nPresent Columns: {dataframe.columns}")

            os.makedirs(self.config.accepted_data_dir, exist_ok=True)
            accepted_file_path = os.path.join(self.config.accepted_data_dir,
                                              self.config.file_name
                                              )
            # No write mode is set, so Spark's default applies to an existing target.
            dataframe.write.parquet(accepted_file_path)

            artifact = DataValidationArtifact(accepted_data_file_path= accepted_file_path,
                                              rejected_data_dir= self.config.rejected_data_dir
                                              )
            logging.info(f"Data validation artifact: [{artifact}]")
            return artifact
        except Exception as e:
            raise CustomException(e, sys)

                 

In [7]:
import sys
from financeComplaint.exception import CustomException

In [8]:
# Run the data-validation stage end to end: load the configuration, derive the
# stage-specific config, then execute the validation.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.initiate_data_validation()
except Exception as e:
    # Any failure (including errors from the configuration steps above) is
    # re-raised as a CustomException.
    raise CustomException(e, sys)

                                                                                

Row: [87675] Column: [13]
Expected Column: ['consumer_disputed', 'company_response', 'consumer_consent_provided', 'submitted_via', 'issue', 'date_sent_to_company', 'date_received']
Present Columns: ['company', 'company_response', 'consumer_consent_provided', 'consumer_disputed', 'date_received', 'date_sent_to_company', 'issue', 'product', 'state', 'sub_issue', 'submitted_via', 'timely', 'zip_code']


                                                                                