In [1]:
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow\\research'

In [2]:
import os

# Step up one directory so that the relative paths used later in this notebook
# (e.g. 'artifacts/data_split/...') are resolved from the parent of the starting directory.
# NOTE: the target is relative, so every re-run of this cell climbs one more level;
# restart the kernel before executing it again.
os.chdir('../')
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow'

# 1. Update config.yaml

# 2. Update the entity

In [3]:
from dataclasses import dataclass
from pathlib import Path

In [5]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Read-only settings consumed by the data-transformation encoders.

    The dataclass is frozen, so fields cannot be reassigned after construction.

    Attributes:
        root_dir: Value of `root_dir` from the `data_transformation` config section.
            It is not read by any encoder in this notebook.
        X_train_data_path: CSV with the training features; read and overwritten in place
            by the feature encoders.
        X_test_data_path: CSV with the test features; read and overwritten in place
            by the feature encoders.
        y_train_data_path: CSV with the training labels; read by the label and target
            encoders and overwritten by the label encoder.
        y_test_data_path: CSV with the test labels; read and overwritten by the label encoder.
    """
    root_dir: Path
    X_train_data_path: str
    X_test_data_path: str
    y_train_data_path: str
    y_test_data_path: str
    

# 3. Update the configuration manager in src config

In [6]:
from mlproject.constants import *
from mlproject.utils.common import create_directories, read_yaml

In [None]:
class ConfigurationManager:
    """
    Single entry point to the project's YAML-driven settings.

    On construction the main configuration file and the parameters file are parsed and
    the artifacts root directory named in the configuration is created. Component-specific
    configuration objects are then assembled on request.

    Attributes:
        config: Parsed main configuration, exactly as returned by `read_yaml`
            (it is accessed through attributes, e.g. `config.artifacts_root`).
        params: Parsed parameters file, exactly as returned by `read_yaml`.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
        ) -> None:
        """
        Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML file.
                Defaults to `CONFIG_FILE_PATH`.
            params_filepath: Location of the parameters YAML file.
                Defaults to `PARAMS_FILE_PATH`.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the settings object for the data-transformation stage.

        Returns:
            DataTransformationConfig: Populated from the `data_transformation` section
            of the main configuration.

        Raises:
            Exception: Whatever is raised while reading the section or its keys is
            propagated unchanged.
        """
        section = self.config.data_transformation

        # `section.root_dir` is deliberately not created here
        # (no `create_directories([section.root_dir])` call): this stage saves no new
        # files, the transformed data is written back to the `data_split` directory.
        return DataTransformationConfig(
            root_dir=section.root_dir,
            X_train_data_path=section.X_train_data_path,
            X_test_data_path=section.X_test_data_path,
            y_train_data_path=section.y_train_data_path,
            y_test_data_path=section.y_test_data_path,
        )

# 4. Update the components

In [9]:
import pandas as pd
from typing import Tuple
from mlproject import logger
from sklearn.preprocessing import LabelEncoder

In [16]:
class LabelEncoding:
    """Integer-encodes the labels stored in the train/test label CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the configuration that holds the label file paths."""
        self.config = config

    def apply_label_encoding(self) -> Tuple[pd.Series, pd.Series]:
        """
        Label-encode the first column of the train and test label files.

        The encoder is fitted on the training labels only and then reused for the test
        labels. Both encoded series are named 'Status' and are written back over the
        files they were read from (without the index).

        Returns:
            Tuple[pd.Series, pd.Series]: Encoded `(y_train, y_test)`.

        Raises:
            Exception: Any error raised while reading, encoding or writing is
            propagated unchanged.
        """
        train_path = self.config.y_train_data_path
        test_path = self.config.y_test_data_path

        # Only the first column of each file is treated as the label column.
        raw_train = pd.read_csv(train_path).iloc[:, 0]
        raw_test = pd.read_csv(test_path).iloc[:, 0]

        encoder = LabelEncoder()
        y_train = pd.Series(encoder.fit_transform(raw_train), name='Status')
        y_test = pd.Series(encoder.transform(raw_test), name='Status')

        # Nothing is written until both splits have been encoded successfully.
        y_train.to_csv(train_path, index=False)
        y_test.to_csv(test_path, index=False)

        return y_train, y_test

In [17]:
class TargetEncoding:
    """Mean (target) encoding of the `Security_Type` feature."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the configuration that holds the feature and label file paths."""
        self.config = config

    def apply_target_encoding(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Replace `Security_Type` with the mean of `Status` observed per category.

        The per-category means are computed from the training split only and applied to
        both splits. Test categories that never occur in the training split are filled
        with the overall training mean of `Status`. Both feature files are overwritten
        in place (without the index).

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Transformed `(X_train, X_test)`.

        Raises:
            Exception: Any error raised while reading, encoding or writing is
            propagated unchanged.
        """
        X_train = pd.read_csv(self.config.X_train_data_path)
        X_test = pd.read_csv(self.config.X_test_data_path)
        y_train = pd.read_csv(self.config.y_train_data_path)
        target_column = 'Status'

        # Step 1: Combine the feature with the target for mean encoding
        security_type_df = pd.concat([X_train['Security_Type'], y_train], axis=1)

        # Step 2: Calculate the mean target for each category in the training set
        security_type_mean = security_type_df.groupby('Security_Type')[target_column].mean()

        # Step 3: Map the mean encoding onto both splits
        X_train['Security_Type'] = X_train['Security_Type'].map(security_type_mean)
        X_test['Security_Type'] = X_test['Security_Type'].map(security_type_mean)

        # Step 4: Handle categories in the test set that are missing in training.
        # The fallback must be a scalar: `y_train` is a DataFrame, so `y_train.mean()`
        # would be a Series indexed by column name, which `fillna` aligns on the row
        # index and therefore fills nothing.
        fallback_value = y_train[target_column].mean()
        X_test['Security_Type'] = X_test['Security_Type'].fillna(fallback_value)

        X_train.to_csv(self.config.X_train_data_path, index=False)
        X_test.to_csv(self.config.X_test_data_path, index=False)

        return X_train, X_test

In [18]:
class OneHotEncoding:
    """One-hot encodes a fixed list of categorical feature columns."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the configuration that holds the feature file paths."""
        self.config = config

    def apply_one_hot_encoding(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        One-hot encode the listed columns in the train and test feature files.

        The test frame is re-indexed to the training frame's columns: dummy columns that
        exist only in training are added to the test frame filled with 0, and columns
        that exist only in the test frame are dropped. Both feature files are
        overwritten in place (without the index).

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Transformed `(X_train, X_test)`.

        Raises:
            Exception: Any error raised while reading, encoding or writing is
            propagated unchanged.
        """
        train_path = self.config.X_train_data_path
        test_path = self.config.X_test_data_path

        train_frame = pd.read_csv(train_path)
        test_frame = pd.read_csv(test_path)

        # 'Security_Type' is intentionally absent: it is not one-hot encoded.
        one_hot_columns = [
            'loan_limit',
            'approv_in_adv',
            'loan_type',
            'Credit_Worthiness',
            'open_credit',
            'business_or_commercial',
            'Neg_ammortization',
            'interest_only',
            'lump_sum_payment',
            'construction_type',
            'occupancy_type',
            'Secured_by',
            'co-applicant_credit_type',
            'submission_of_application',
        ]

        train_frame = pd.get_dummies(train_frame, columns=one_hot_columns)

        # Encode the test split, then force the same column set and order as training.
        test_frame = pd.get_dummies(test_frame, columns=one_hot_columns)
        test_frame = test_frame.reindex(columns=train_frame.columns, fill_value=0)

        train_frame.to_csv(train_path, index = False)
        test_frame.to_csv(test_path, index = False)

        return train_frame, test_frame

In [None]:
class FrequencyEncoding:
    """Replaces selected categorical features with their training-set occurrence counts."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the configuration that holds the feature file paths."""
        self.config = config

    def apply_frequency_encoding(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Frequency-encode the listed columns in the train and test feature files.

        For every listed column the value counts are taken from the training split and
        mapped onto both splits. Test values that never occur in the training split
        become 0. Both feature files are overwritten in place (without the index).

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Transformed `(X_train, X_test)`.

        Raises:
            Exception: Any error raised while reading, encoding or writing is
            propagated unchanged.
        """
        train_path = self.config.X_train_data_path
        test_path = self.config.X_test_data_path

        train_frame = pd.read_csv(train_path)
        test_frame = pd.read_csv(test_path)

        count_encoded_columns = (
            'Gender', 'loan_purpose', 'total_units', 'credit_type', 'age', 'Region',
        )

        for column in count_encoded_columns:
            # Counts always come from the training split, also for the test split.
            counts = train_frame[column].value_counts().to_dict()
            train_frame[column] = train_frame[column].map(counts)
            # Categories unseen in training map to NaN and are then set to 0.
            test_frame[column] = test_frame[column].map(counts).fillna(0)

        # Save the transformed data back to the files it was read from.
        train_frame.to_csv(train_path, index = False)
        test_frame.to_csv(test_path, index = False)

        return train_frame, test_frame

# 5. Update pipeline

In [None]:
# # Run this here first to check that it works without error.
# # If it does, convert it into the ML pipeline.

# try:
#     config = ConfigurationManager()
#     data_transformation_config = config.get_data_transformation_config()
    
#     y_train, y_test = LabelEncoding(data_transformation_config).apply_label_encoding()
#     X_train, X_test = TargetEncoding(data_transformation_config).apply_target_encoding()
#     X_train, X_test = OneHotEncoding(data_transformation_config).apply_one_hot_encoding()
#     X_train, X_test = FrequencyEncoding(data_transformation_config).apply_frequency_encoding()
        
# except Exception as e:
#     raise e

[2024-11-21 12:36:07,074: 35 mlprojectLogger: INFO: common: .yaml file: config\config.yaml loaded successfully.]
[2024-11-21 12:36:07,074: 35 mlprojectLogger: INFO: common: .yaml file: params.yaml loaded successfully.]
[2024-11-21 12:36:07,074: 54 mlprojectLogger: INFO: common: Created directory at artifacts]
[2024-11-21 12:36:07,074: 54 mlprojectLogger: INFO: common: Created directory at artifacts/data_transformation]


In [26]:
# Load the four split CSV files for a manual sanity check.
a = pd.read_csv('artifacts/data_split/X_train.csv')  # X_train file
b = pd.read_csv('artifacts/data_split/X_test.csv')   # X_test file
c = pd.read_csv('artifacts/data_split/y_train.csv')  # y_train file
d = pd.read_csv('artifacts/data_split/y_test.csv')   # y_test file

# Converting first column to Series 
c = c.iloc[:,0]
d = d.iloc[:,0]

# Column names, non-null counts and dtypes of the X_test frame.
b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19383 entries, 0 to 19382
Data columns (total 49 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   ID                                     19383 non-null  int64  
 1   year                                   19383 non-null  int64  
 2   Gender                                 19383 non-null  int64  
 3   loan_purpose                           19383 non-null  int64  
 4   loan_amount                            19383 non-null  int64  
 5   rate_of_interest                       19383 non-null  float64
 6   Interest_rate_spread                   19383 non-null  float64
 7   Upfront_charges                        19383 non-null  float64
 8   term                                   19383 non-null  int64  
 9   property_value                         19383 non-null  float64
 10  total_units                            19383 non-null  int64  
 11  in

In [25]:
# Display `d`, the first column of artifacts/data_split/y_test.csv.
d

0        0
1        1
2        1
3        0
4        0
        ..
19378    0
19379    0
19380    0
19381    0
19382    1
Name: Status, Length: 19383, dtype: int64