In [1]:
# Show the directory the kernel is currently running in.
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow\\research'

In [2]:
import os

os.chdir('../')
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow'

# 1. Update config.yaml

# 2. Update the entity

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class MissingValuesConfig:
    """
    Immutable bundle of the paths used by the missing-value handling step.

    `frozen=True` makes instances read-only: assigning to a field after
    construction raises an error.

    Note that, despite their `_dir` suffixes, `unzip_data_dir` and
    `cleaned_data_dir` are used as *file* paths by `HandleMissingValues`
    in this notebook.

    Attributes:
        root_dir (Path): Working directory of this step; created by
            `ConfigurationManager.get_missing_values_config`.
        unzip_data_dir (Path): Path of the raw CSV file; passed straight to
            `pd.read_csv` by `HandleMissingValues.handle_missing_values`.
        cleaned_data_dir (str): Path of the cleaned CSV file; written with
            `DataFrame.to_csv` and read back by
            `HandleMissingValues.list_missing_values`.
        STATUS_FILE (str): Path of the text file that status messages are
            appended to.
    """
    root_dir: Path  # directory for this step's artifacts
    unzip_data_dir: Path  # raw CSV file that is read (a file path, not a directory)
    cleaned_data_dir: str  # cleaned CSV file that is written (a file path, not a directory)
    STATUS_FILE: str  # status log, opened in append mode

# 3. Update the configuration manager in src config

In [4]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """
    Loads the project's YAML settings and builds per-step config objects.

    On construction the main configuration file and the parameters file are
    read with `read_yaml`, and the artifacts root named in the configuration
    is created with `create_directories`.

    Attributes:
        config: Parsed content of the main configuration file, as returned by
            `read_yaml`. It is accessed by attribute (e.g.
            `config.artifacts_root`), so it is not a plain `dict`.
        params: Parsed content of the parameters file, as returned by
            `read_yaml`.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
        ) -> None:
        """
        Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path to the main configuration YAML file.
                Defaults to `CONFIG_FILE_PATH`.
            params_filepath: Path to the parameters YAML file.
                Defaults to `PARAMS_FILE_PATH`.

        Raises:
            Whatever `read_yaml` or `create_directories` raise; nothing is
            caught here.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_missing_values_config(self) -> MissingValuesConfig:
        """
        Build the `MissingValuesConfig` for the missing-value handling step.

        Reads the `handle_missing_values` section of the main configuration,
        creates that section's `root_dir`, and copies the section's four
        entries into a `MissingValuesConfig`.

        Returns:
            MissingValuesConfig: The paths configured for this step.

        Raises:
            Any error raised while reading the section or creating the
            directory propagates unchanged to the caller.
        """
        # Errors are deliberately not caught: a `try/except ... raise e`
        # wrapper would only re-raise the same exception.
        section = self.config.handle_missing_values
        create_directories([section.root_dir])

        return MissingValuesConfig(
            root_dir=section.root_dir,
            unzip_data_dir=section.unzip_data_dir,
            cleaned_data_dir=section.cleaned_data_dir,
            STATUS_FILE=section.STATUS_FILE,
        )
    

# 4. Update the components

In [6]:
from mlproject import logger
import pandas as pd
from mlproject.utils.common import get_missing_columns

In [7]:
class HandleMissingValues:
    """
    Cleans missing values from the raw CSV and reports what is left.

    The paths of the raw file, the cleaned file and the status log all come
    from the `MissingValuesConfig` passed to the constructor.
    """

    # A row is dropped entirely when any of these columns is missing.
    _ROW_DROP_COLUMNS = (
        'loan_limit',
        'approv_in_adv',
        'loan_purpose',
        'Neg_ammortization',
        'age',
        'submission_of_application',
        'term',
    )

    def __init__(self, config: MissingValuesConfig):
        """Store the configuration that supplies the input and output paths."""
        self.config = config

    def handle_missing_values(self) -> pd.DataFrame:
        """
        Load the raw CSV, remove or fill missing values, and save the result.

        Steps, in order:
          1. Drop rows that miss a value in any of `_ROW_DROP_COLUMNS`.
          2. Fill `income` with the median of its `age` group and
             `property_value` with the median of its `Region` group.
          3. Fill `rate_of_interest` and `Interest_rate_spread` with the
             column mean, and `Upfront_charges` with the column median.
          4. Fill `LTV` with `loan_amount / property_value * 100`, using the
             already-filled `property_value`.
          5. Linearly interpolate `dtir1`.

        The cleaned frame is written to `config.cleaned_data_dir` (a file
        path) and a success line is appended to `config.STATUS_FILE`.

        Returns:
            pd.DataFrame: The cleaned data.

        Raises:
            Any error from reading, transforming or writing propagates
            unchanged to the caller.
        """
        df = pd.read_csv(self.config.unzip_data_dir)

        # Drop rows with missing values in these columns
        df = df.dropna(subset=list(self._ROW_DROP_COLUMNS))

        # Group-wise median fill.
        df['income'] = df.groupby('age')['income'].transform(lambda x: x.fillna(x.median()))
        df['property_value'] = df.groupby('Region')['property_value'].transform(lambda x: x.fillna(x.median()))

        # Whole-column fills: call fillna directly instead of going through
        # Series.transform, which first tries the function element-wise.
        for column in ('rate_of_interest', 'Interest_rate_spread'):
            df[column] = df[column].fillna(df[column].mean())
        df['Upfront_charges'] = df['Upfront_charges'].fillna(df['Upfront_charges'].median())

        df['LTV'] = df['LTV'].fillna((df['loan_amount'] / df['property_value']) * 100)

        # limit_direction='both' also fills NaNs that precede the first valid
        # value; the default (forward only) would leave them missing.
        df['dtir1'] = df['dtir1'].interpolate(method='linear', limit_direction='both')

        # Save the cleaned DataFrame
        cleaned_file_path = self.config.cleaned_data_dir  # File path for saving
        # Make sure the folder that will hold the cleaned file exists.
        create_directories([os.path.dirname(cleaned_file_path)])
        df.to_csv(cleaned_file_path, index=False)

        # Log status
        with open(self.config.STATUS_FILE, "a") as status_file:
            status_file.write("Missing values handled and data saved successfully.\n\n")
        return df

    def list_missing_values(self) -> None:
        """
        Append the columns that still contain missing values to the status file.

        Reads the cleaned CSV back from `config.cleaned_data_dir` and passes
        it to `get_missing_columns`. The first two values that helper returns
        are taken to be the numeric and the categorical columns with missing
        values (the helper is defined in `mlproject.utils.common` and is not
        shown here); the remaining two are ignored.

        Raises:
            Any error from reading the file or from the helper propagates
            unchanged to the caller.
        """
        df = pd.read_csv(self.config.cleaned_data_dir)
        # Filter columns with missing values
        numeric_columns_with_na, category_columns_with_na, _, _ = get_missing_columns(df)

        # Write final results
        with open(self.config.STATUS_FILE, "a") as f:
            f.write("Numeric Columns.\n")
            f.write(f"Columns with Missing Values: {numeric_columns_with_na}")
            f.write("\nCategorical Columns.\n")
            # The file is opened in append mode, so end with a newline to keep
            # the next run's first line from being glued onto this one.
            f.write(f"Columns with Missing Values: {category_columns_with_na}\n")

# 5. Update the pipeline 

In [9]:
# test run
# No try/except here: catching an exception only to re-raise it adds nothing,
# and a failure should surface with its original traceback.

# Initialize configuration manager
config = ConfigurationManager()

# Get missing values configuration
missing_values_config = config.get_missing_values_config()

# Handle missing values
missing_values_handler = HandleMissingValues(missing_values_config)
cleaned_df = missing_values_handler.handle_missing_values()

# List missing values after handling
missing_values_handler.list_missing_values()

[2024-11-20 10:50:43,280: 35 mlprojectLogger: INFO: common: .yaml file: config\config.yaml loaded successfully.]
[2024-11-20 10:50:43,283: 35 mlprojectLogger: INFO: common: .yaml file: params.yaml loaded successfully.]
[2024-11-20 10:50:43,284: 54 mlprojectLogger: INFO: common: Created directory at artifacts]
[2024-11-20 10:50:43,286: 54 mlprojectLogger: INFO: common: Created directory at artifacts/handle_missing_values]
[2024-11-20 10:50:44,630: 54 mlprojectLogger: INFO: common: Created directory at artifacts/handle_missing_values]


In [10]:
# Count the missing values left in each column of the cleaned frame.
cleaned_df.isna().sum()

ID                           0
year                         0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
income                       0
credit_type                  0
Credit_Score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
LTV                          0
Region                       0
Security_Type                0
Status  