In [42]:
import os

# Directory the notebook switches into before anything else runs.
# The original machine-specific location stays as the default so existing
# behaviour is unchanged; set FORBES_PROJECT_ROOT to run the notebook from a
# different checkout without editing this cell.
PROJECT_ROOT = os.environ.get(
    "FORBES_PROJECT_ROOT",
    "d:/vscode_machineLearning/BEST_PROJECTS/ForbsBillionaires",
)

print(os.getcwd())  # working directory before the switch
os.chdir(PROJECT_ROOT)
print(os.getcwd())  # working directory after the switch

d:\vscode_machineLearning\BEST_PROJECTS\ForbsBillionaires
d:\vscode_machineLearning\BEST_PROJECTS\ForbsBillionaires


## Entity

In [43]:
from dataclasses import dataclass
from pathlib import Path

In [44]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings for the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Output directory of the transformation stage.
    root_dir: Path
    # CSV file that is loaded and split into train/test sets.
    data_path: Path
    # Location of the status file (taken from the ``STATUS_FILE`` config key).
    Status_file: str
    # Destination CSV for the transformed training features.
    transform_train_df_path: Path
    # Destination CSV for the transformed test features.
    transform_test_df_path: Path
    # Mapping from transformer-generated column names to plain feature names.
    feature_renamer_scehma: dict

## Configuration

In [45]:
from predictor.constatns import *
from predictor.utils import *
from pprint import pprint

In [46]:
class ConfigurationManager:
    """Load the project's YAML config and schema and build stage configs.

    The parsed files are kept on ``self.config`` and ``self.schema`` and are
    accessed by attribute (e.g. ``self.config.data_transformation``).
    """

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH):
        """Read both YAML files and prepare the artifacts root directory.

        Args:
            config_file_path: Path of the main config YAML
                (defaults to ``CONFIG_FILE_PATH``).
            schema_file_path: Path of the schema YAML
                (defaults to ``SCHEMA_FILE_PATH``).
        """
        self.config = read_yaml(config_file_path)
        self.schema = read_yaml(schema_file_path)

        # Presumably creates the directory when it is missing -- confirm
        # against predictor.utils.create_dirs.
        create_dirs([self.config.artifacts_root])

    def get_data_transformation_config(self) -> "DataTransformationConfig":
        """Build the settings object for the data-transformation stage.

        Also calls ``create_dirs`` on the stage's ``root_dir``.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the config and the ``columns_renamer`` section of the
            schema.
        """
        config = self.config.data_transformation
        schema = self.schema.columns_renamer

        create_dirs([config.root_dir])

        # The YAML values are plain strings; wrap them in Path so the
        # instance matches the Path-typed fields of DataTransformationConfig.
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            Status_file=config.STATUS_FILE,
            transform_train_df_path=Path(config.transformed_train_df_path),
            transform_test_df_path=Path(config.transformed_test_df_path),
            feature_renamer_scehma=schema
        )
        

In [47]:
# Sanity check: pretty-print the parsed config file to see which keys exist.
loaded_config = read_yaml(CONFIG_FILE_PATH)
pprint(loaded_config)

[2023-09-15 11:36:43,474: INFO: utils: yaml file: config\config.yaml loaded successfully]
ConfigBox({'artifacts_root': 'artifacts', 'data_validation': {'root_dir': 'artifacts\\validation', 'data_dir': 'artifacts\\cleand_data\\billionaires.csv', 'STATUS_FILE': 'artifacts\\cleand_data\\data_validation_status.txt'}, 'data_transformation': {'root_dir': 'artifacts\\transformed_data', 'data_path': 'artifacts\\cleand_data\\encoded_data.csv', 'STATUS_FILE': 'artifacts\\cleand_data\\data_validation_status.txt', 'transformed_train_df_path': 'artifacts\\transformed_data\\train_transformed.csv', 'transformed_test_df_path': 'artifacts\\transformed_data\\test_transformed.csv'}})


In [48]:
# Sanity check: display the column-rename mapping stored in the schema file.
loaded_schema = read_yaml(SCHEMA_FILE_PATH)
loaded_schema.columns_renamer

[2023-09-15 11:36:43,485: INFO: utils: yaml file: schema.yaml loaded successfully]


ConfigBox({'scale_values__rank': 'rank', 'scale_values__age': 'age', 'scale_values__country_of_citizenship': 'country_of_citizenship', 'scale_values__business_category': 'business_category', 'scale_values__wealth_status': 'wealth_status', 'remainder__gender': 'gender', 'remainder__self_made': 'self_made'})

## Components

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

In [50]:
class TransformData:
    """Split the input dataset and min-max scale its feature columns.

    The scaled train and test feature tables are written as CSV files to the
    paths given by the supplied ``DataTransformationConfig``.
    """

    # Columns removed from the feature matrix before scaling.
    _DROPPED_COLUMNS = ['net_worth', 'full_name']

    # Columns passed through MinMaxScaler; all others are passed through as-is.
    _SCALED_COLUMNS = ['rank', 'age', 'country_of_citizenship', 'business_category', 'wealth_status']

    # Fallback mapping used when the config carries no rename schema.
    _DEFAULT_RENAMER = {
        'scale_values__rank': 'rank',
        'scale_values__age': 'age',
        'scale_values__country_of_citizenship': 'country_of_citizenship',
        'scale_values__business_category': 'business_category',
        'scale_values__wealth_status': 'wealth_status',
        'remainder__gender': 'gender',
        'remainder__self_made': 'self_made',
    }

    def __init__(self,config_transform_data:DataTransformationConfig) -> None:
        """Store the stage configuration.

        Args:
            config_transform_data: Paths and rename schema for this stage.
        """
        self.trans_config = config_transform_data

    def train_test_split_func(self, test_size=0.23, random_state=42):
        """Read the input CSV and split it into training and test frames.

        Args:
            test_size: Fraction of rows assigned to the test set.
            random_state: Seed for the shuffle. A fixed default makes repeated
                runs produce the same split; pass ``None`` for a fresh random
                split on every call.

        Returns:
            Tuple ``(train_df, test_df)`` of pandas DataFrames.
        """
        df = pd.read_csv(self.trans_config.data_path)
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        logger.info("Splited data into training and test sets")
        logger.info(train_df.shape)
        logger.info(test_df.shape)

        return train_df, test_df

    def scale_data(self):
        """Scale the feature columns and write train/test feature CSVs.

        Steps: split the data, drop ``_DROPPED_COLUMNS``, min-max scale
        ``_SCALED_COLUMNS`` (remaining columns pass through unchanged), rename
        the transformer-prefixed columns back to plain names, and save both
        frames without their index.

        Returns:
            None. The results are the two CSV files on disk.
        """
        train_df, test_df = self.train_test_split_func()

        # NOTE(review): 'net_worth' is dropped here and is not written to the
        # output files -- confirm that downstream stages obtain it elsewhere.
        X_train = train_df.drop(columns=self._DROPPED_COLUMNS)
        X_test = test_df.drop(columns=self._DROPPED_COLUMNS)

        transformer = ColumnTransformer(
            transformers=[('scale_values', MinMaxScaler(), self._SCALED_COLUMNS)],
            remainder='passthrough',
        )

        # Fit on the training split only and reuse those min/max values for
        # the test split. Re-fitting on the test data would scale the two sets
        # differently and leak test statistics into preprocessing. Test values
        # may therefore fall slightly outside [0, 1].
        X_train_transformed = transformer.fit_transform(X_train)
        X_test_transformed = transformer.transform(X_test)

        feature_names = transformer.get_feature_names_out()

        # Prefer the mapping from the schema file; fall back to the built-in
        # mapping when the config provides none.
        renamer = dict(self.trans_config.feature_renamer_scehma or self._DEFAULT_RENAMER)

        # ``columns=`` is required: a bare mapping passed to ``rename`` is
        # applied to the row index, not to the column labels.
        train_transformed = pd.DataFrame(
            data=X_train_transformed, columns=feature_names
        ).rename(columns=renamer)
        test_transformed = pd.DataFrame(
            data=X_test_transformed, columns=feature_names
        ).rename(columns=renamer)
        logger.info('renamed the features name')

        train_transformed.to_csv(self.trans_config.transform_train_df_path,index=False)
        test_transformed.to_csv(self.trans_config.transform_test_df_path,index=False)

        logger.info("transformed data")
        logger.info(train_transformed.shape)
        logger.info(test_transformed.shape)

        

## Pipeline

In [51]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage settings, then split, scale and save the data.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = TransformData(config_transform_data=data_transformation_config)
    data_transformation.scale_data()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise
    # the same exception; a bare ``raise`` keeps the original traceback intact.
    logger.exception("Data transformation stage failed")
    raise

[2023-09-15 11:36:43,522: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2023-09-15 11:36:43,524: INFO: utils: yaml file: schema.yaml loaded successfully]
[2023-09-15 11:36:43,526: INFO: utils: Created artifacts]
[2023-09-15 11:36:43,528: INFO: utils: Created artifacts\transformed_data]
[2023-09-15 11:36:43,534: INFO: 1091492594: Splited data into training and test sets]
[2023-09-15 11:36:43,535: INFO: 1091492594: (1800, 9)]
[2023-09-15 11:36:43,536: INFO: 1091492594: (538, 9)]
[2023-09-15 11:36:43,546: INFO: 1091492594: renamed the features name]
[2023-09-15 11:36:43,569: INFO: 1091492594: transformed data]
[2023-09-15 11:36:43,571: INFO: 1091492594: (1800, 7)]
[2023-09-15 11:36:43,572: INFO: 1091492594: (538, 7)]
