In [1]:
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow\\research'

In [2]:
import os

os.chdir('../')
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow'

# 1. Update config.yaml

# 2. Update the entity

In [3]:
from dataclasses import dataclass
from pathlib import Path

In [4]:
@dataclass
class DataManipulationConfig:
    root_dir: Path
    cleaned_data_dir: str
    manipulated_data_dir: str
    STATUS_FILE: str


# 3. Update the configuration manager in src config

In [5]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """
    Manages the configuration and setup for the project.

    This class is responsible for reading configuration files, creating required directories, 
    and providing specific configuration objects needed for various components of the project.

    Attributes:
        config (dict): Parsed content of the main configuration file.
        params (dict): Parsed content of the parameters file.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
        ) -> None:
        """
        Initializes the ConfigurationManager.

        Reads YAML configuration files for main configuration, parameters, and schema. 
        Also ensures that the root artifacts directory specified in the configuration is created.

        Args:
            config_filepath (str): Path to the main configuration YAML file. Default is `CONFIG_FILE_PATH`.
            params_filepath (str): Path to the parameters YAML file. Default is `PARAMS_FILE_PATH`.           
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])
        
    def get_data_manipulation_config(self) -> DataManipulationConfig:
        """
        Creates and returns a `DataManipulationConfig` object for manipulating data.

        This method retrieves the configuration details specific to manipulating data values from 
        the main configuration file. It also ensures that the directories required for this step 
        are created.

        Returns:
            DataManipulationConfig: An instance of `DataManipulationConfig` containing paths 
            and other configuration details for manipulating data.

        Raises:
            Exception: If an error occurs while retrieving the manipulating data configuration.
        """
        try:
            config = self.config.data_manipulation
            create_directories([config.root_dir])
            
            data_manipulation_config = DataManipulationConfig(
                root_dir=config.root_dir,
                cleaned_data_dir=config.cleaned_data_dir,
                manipulated_data_dir=config.manipulated_data_dir,
                STATUS_FILE=config.STATUS_FILE
            )
            return data_manipulation_config
        except Exception as e:
            raise e

# 4. Update the components

In [7]:
from mlproject import logger
import pandas as pd

In [None]:
class ManipulateData:
    """
    Handles data manipulation tasks for a dataset.

    This class is responsible for correcting data types and modifying values in a dataset 
    based on specific rules. It reads the data from configured directories, applies the 
    changes, saves the modified data, and logs the status of operations.

    Attributes:
        config (DataManipulationConfig): Configuration object containing file paths and 
                                         directories required for data manipulation.

    """
    def __init__(self, config: DataManipulationConfig) -> None:
        """
        Initializes the ManipulateData object.

        Args:
            config (DataManipulationConfig): Configuration object containing file paths and 
                                             directories required for data manipulation.
        """
        self.config = config
        
    def correct_dtype(self) -> pd.DataFrame:
        """
        Corrects the data types of specific columns in the dataset.

        - Converts the `term` column to an integer type.
        - Converts the `Status` column to an object type.
        - Saves the updated dataset to the configured directory.
        - Logs the success status to the status file.

        Returns:
            pd.DataFrame: The DataFrame with corrected data types.

        Raises:
            Exception: If an error occurs during the process.
        """
        try:
            df = pd.read_csv(self.config.cleaned_data_dir)
            
            df['term'] = df['term'].astype('int')
            df['Status'] = df['Status'].astype('object')
            
            # Save the corrected DataFrame
            corrected_df_file_path = self.config.manipulated_data_dir  # File path for saving
            create_directories([os.path.dirname(corrected_df_file_path)])  # creates this directory "artifacts/handle_missing_values" if it doesn't already exist.
            df.to_csv(corrected_df_file_path, index=False)
            
            # Log status
            with open(self.config.STATUS_FILE, "a") as status_file:
                status_file.write("Corrected dtypes and data saved successfully.\n")
            return df
        except Exception as e:
            raise e
        
    def change_values(self) -> pd.DataFrame:
        """
        Replaces specific values in the dataset based on predefined rules.

        - Fixes typos in the `Security_Type` column.
        - Replaces codes in the `occupancy_type` column with descriptive labels.
        - Saves the updated dataset to the configured directory.
        - Logs the success status to the status file.

        Returns:
            pd.DataFrame: The DataFrame with updated values.

        Raises:
            Exception: If an error occurs during the process.
        """
        try:
            df = pd.read_csv(self.config.manipulated_data_dir)
            
            df['Security_Type'] = df['Security_Type'].replace({'Indriect':'Indirect'}) 
            df['occupancy_type'] = df['occupancy_type'].replace({'pr':'Primary Residential', 'sr':'Secondary Residdential', 'ir':'Investment Residential'}) 
            
            # Save the corrected DataFrame
            corrected_df_file_path = self.config.manipulated_data_dir  # File path for saving
            create_directories([os.path.dirname(corrected_df_file_path)])  # creates this directory "artifacts/handle_missing_values" if it doesn't already exist.
            df.to_csv(corrected_df_file_path, index=False)
            
            # Log status
            with open(self.config.STATUS_FILE, "a") as status_file:
                status_file.write("Corrected values and data saved successfully.\n")
            return df
        except Exception as e:
            raise e
        

# 5. Update pipeline

In [9]:
try:
    config = ConfigurationManager()
    data_manipulation_config = config.get_data_manipulation_config()
    
    manipulator = ManipulateData(data_manipulation_config)
    manipulator.correct_dtype()
    manipulator.change_values()
    
except Exception as e:
    raise e

[2024-11-20 12:01:47,943: 35 mlprojectLogger: INFO: common: .yaml file: config\config.yaml loaded successfully.]
[2024-11-20 12:01:47,945: 35 mlprojectLogger: INFO: common: .yaml file: params.yaml loaded successfully.]
[2024-11-20 12:01:47,947: 54 mlprojectLogger: INFO: common: Created directory at artifacts]
[2024-11-20 12:01:47,949: 54 mlprojectLogger: INFO: common: Created directory at artifacts/data_manipulation]
[2024-11-20 12:01:48,875: 54 mlprojectLogger: INFO: common: Created directory at artifacts/data_manipulation]
[2024-11-20 12:01:52,663: 54 mlprojectLogger: INFO: common: Created directory at artifacts/data_manipulation]


In [10]:
a = pd.read_csv('artifacts/data_manipulation/Manipulated_Loan_Default.csv')
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143942 entries, 0 to 143941
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         143942 non-null  int64  
 1   year                       143942 non-null  int64  
 2   loan_limit                 143942 non-null  object 
 3   Gender                     143942 non-null  object 
 4   approv_in_adv              143942 non-null  object 
 5   loan_type                  143942 non-null  object 
 6   loan_purpose               143942 non-null  object 
 7   Credit_Worthiness          143942 non-null  object 
 8   open_credit                143942 non-null  object 
 9   business_or_commercial     143942 non-null  object 
 10  loan_amount                143942 non-null  int64  
 11  rate_of_interest           143942 non-null  float64
 12  Interest_rate_spread       143942 non-null  float64
 13  Upfront_charges            14

In [11]:
a['Security_Type'].unique()

array(['direct', 'Indirect'], dtype=object)

In [12]:
a['occupancy_type'].unique()

array(['Primary Residential', 'Secondary Residdential',
       'Investment Residential'], dtype=object)