In [1]:
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow\\research'

In [2]:
import os

os.chdir('../')
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow'

# 1. Update config.yaml

# 2. Update the entity

In [4]:
from dataclasses import dataclass
from pathlib import Path


In [None]:
@dataclass(frozen=True)
class OutlierDetectionConfig:
    """
    Immutable bundle of paths used by the outlier-detection stage.

    Instances are frozen: assigning to a field after construction raises an error,
    so a config object can be shared without risk of accidental modification.

    Note on naming: although two fields end in ``_dir``, this notebook uses them as
    CSV *file* paths (one is read with ``pd.read_csv``, the other written with
    ``DataFrame.to_csv``).

    Attributes:
        root_dir (Path): Directory that holds this stage's outputs.
        manipulated_data_dir (str): Path of the input CSV to be screened for outliers.
        data_without_outliers_dir (str): Path where the CSV without outliers is written.
        STATUS_FILE (str): Path of the text file that receives a status line after a run.
    """
    # Field order is part of the interface (positional construction); keep it stable.
    root_dir: Path
    manipulated_data_dir: str
    data_without_outliers_dir: str
    STATUS_FILE: str

# 3. Update the configuration manager in src config

In [6]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """
    Loads the project's YAML settings and builds per-stage configuration objects.

    On construction it reads the main configuration file and the parameters file and
    makes sure the root artifacts directory exists. Stage-specific getters then turn
    the relevant section of the configuration into a typed config object.

    Attributes:
        config: Parsed main configuration, as returned by `read_yaml`
                (accessed here via attribute lookup, e.g. `config.artifacts_root`).
        params: Parsed parameters file, as returned by `read_yaml`.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
        ) -> None:
        """
        Reads both YAML files and creates the root artifacts directory.

        Args:
            config_filepath: Path to the main configuration YAML file.
                Defaults to `CONFIG_FILE_PATH`.
            params_filepath: Path to the parameters YAML file.
                Defaults to `PARAMS_FILE_PATH`.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_outlier_detection_config(self) -> OutlierDetectionConfig:
        """
        Builds the configuration object for the outlier-detection stage.

        Reads the `outlier_detection` section of the main configuration, creates that
        stage's root directory, and copies the section's values into an
        `OutlierDetectionConfig`. Values are passed through unchanged, exactly as
        they were read from the configuration.

        Returns:
            OutlierDetectionConfig: Paths needed by the outlier-detection stage.

        Raises:
            Exception: Any error raised while reading the section or creating the
                directory propagates to the caller unchanged.
        """
        # No try/except here: catching an exception only to re-raise it adds nothing
        # and just pads the traceback.
        section = self.config.outlier_detection
        create_directories([section.root_dir])

        return OutlierDetectionConfig(
            root_dir=section.root_dir,
            manipulated_data_dir=section.manipulated_data_dir,
            data_without_outliers_dir=section.data_without_outliers_dir,
            STATUS_FILE=section.STATUS_FILE
        )
        
        

# 4. Update the components

In [9]:
from mlproject import logger
import pandas as pd
from mlproject.utils.common import get_outliers

In [None]:
class OutlierDetection:
    """
    Removes outlier rows from a CSV dataset and saves the cleaned result.

    The input CSV is read from the path in the config, rows flagged by `get_outliers`
    are dropped column by column, the remaining rows are written to the configured
    output path, and a status line is appended to the status file.

    Attributes:
        config (OutlierDetectionConfig): Paths used by this stage.
        DEFAULT_OUTLIER_COLUMNS (tuple): Columns screened when `handle_outliers`
            is called without an explicit column list.
    """

    # Columns screened for outliers by default; kept in the original order because
    # each column is evaluated on the rows that survived the previous ones.
    DEFAULT_OUTLIER_COLUMNS = (
        'loan_amount', 'income', 'Upfront_charges', 'Interest_rate_spread',
        'rate_of_interest', 'property_value', 'dtir1', 'LTV',
    )

    def __init__(self, config: OutlierDetectionConfig) -> None:
        """
        Stores the configuration for later use.

        Args:
            config (OutlierDetectionConfig): Paths for the input CSV, the output CSV
                and the status file.
        """
        self.config = config

    def handle_outliers(self, columns=None) -> pd.DataFrame:
        """
        Drops outlier rows, saves the cleaned dataset and records the status.

        Steps:
        - Read the CSV at `config.manipulated_data_dir`.
        - For each column in turn, call `get_outliers(df, col)` and drop the rows it
          selects. Later columns are evaluated on the already-reduced frame.
        - Write the result to `config.data_without_outliers_dir` (without the index).
        - Append a status line to `config.STATUS_FILE`.

        Args:
            columns: Optional iterable of column names to screen. When omitted,
                `DEFAULT_OUTLIER_COLUMNS` is used, which matches the previous
                hard-coded behaviour.

        Returns:
            pd.DataFrame: The DataFrame after removing outliers.

        Raises:
            Exception: Any error from reading, filtering or writing propagates to
                the caller unchanged.
        """
        # No try/except wrapper: catching only to re-raise would add nothing.
        df = pd.read_csv(self.config.manipulated_data_dir)
        if columns is None:
            columns = self.DEFAULT_OUTLIER_COLUMNS

        for col in columns:
            # Presumably a boolean mask / indexer usable as df[...] -- TODO confirm
            # against get_outliers in mlproject.utils.common.
            outliers = get_outliers(df, col)
            # Reassign instead of inplace=True; the rows removed are the same.
            df = df.drop(index=df[outliers].index)

        # Despite its name, data_without_outliers_dir is used as the output file path.
        output_path = self.config.data_without_outliers_dir
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            # A bare filename has no parent directory component, so there is
            # nothing to create in that case.
            create_directories([parent_dir])
        df.to_csv(output_path, index=False)

        # NOTE(review): the message contains a typo ("handles"); it is left unchanged
        # in case anything downstream matches on the exact status text.
        with open(self.config.STATUS_FILE, "a") as status_file:
            status_file.write("Outliers handles and data saved successfully.\n")
        return df

# 5. Update pipeline

In [11]:
# Run the outlier-detection stage end to end: load the configuration, build the
# stage config, then execute the component. Failures propagate with their original
# traceback, so no catch-and-re-raise wrapper is needed.
config = ConfigurationManager()
outlier_detection_config = config.get_outlier_detection_config()
outlier_handler = OutlierDetection(outlier_detection_config)
# Keep the cleaned frame under a name so it can be inspected afterwards, without
# echoing the whole DataFrame as cell output.
df_without_outliers = outlier_handler.handle_outliers()

[2024-11-20 12:57:02,327: 35 mlprojectLogger: INFO: common: .yaml file: config\config.yaml loaded successfully.]
[2024-11-20 12:57:02,330: 35 mlprojectLogger: INFO: common: .yaml file: params.yaml loaded successfully.]
[2024-11-20 12:57:02,330: 54 mlprojectLogger: INFO: common: Created directory at artifacts]
[2024-11-20 12:57:02,330: 54 mlprojectLogger: INFO: common: Created directory at artifacts/outlier_detection]
[2024-11-20 12:57:05,140: 54 mlprojectLogger: INFO: common: Created directory at artifacts/outlier_detection]
