In [11]:
import os
# Report the starting directory, switch to the project directory, then confirm
# the move. The config paths logged further down are relative (e.g.
# config\config.yaml), so they resolve against this directory.
project_root = "d:/vscode_machineLearning/BEST_PROJECTS/ForbsBillionaires"
print(os.getcwd())
os.chdir(project_root)
print(os.getcwd())

d:\vscode_machineLearning\BEST_PROJECTS\ForbsBillionaires
d:\vscode_machineLearning\BEST_PROJECTS\ForbsBillionaires


## Entity

In [12]:
from dataclasses import dataclass
from pathlib import Path

In [13]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Output directory of the stage; ``ConfigurationManager``
            passes it to ``create_dirs`` before building this object.
        data_path: CSV file that ``TransformData`` reads as its input.
        Status_file: Value of the ``STATUS_FILE`` key in the
            ``data_transformation`` config section.
        train_df_path: Destination CSV for the transformed training features.
        test_df_path: Destination CSV for the transformed test features.
    """

    root_dir: Path
    data_path: Path
    Status_file: str
    train_df_path: Path
    test_df_path: Path

## Configuration

In [14]:
from predictor.constatns import *
from predictor.utils import *
from pprint import pprint

In [15]:
class ConfigurationManager:
    """Reads the project's config and schema YAML files and builds stage configs."""

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH):
        """Load both YAML files and hand the artifacts root to ``create_dirs``.

        Args:
            config_file_path: Path of the main config YAML; defaults to
                ``CONFIG_FILE_PATH``.
            schema_file_path: Path of the schema YAML; defaults to
                ``SCHEMA_FILE_PATH``.
        """
        self.config = read_yaml(config_file_path)
        self.schema = read_yaml(schema_file_path)

        # The logged output ("Created artifacts") suggests create_dirs makes
        # each listed directory; its exact semantics live in predictor.utils.
        artifacts_root = self.config.artifacts_root
        create_dirs([artifacts_root])

    def get_data_transformation_config(self):
        """Build the ``DataTransformationConfig`` from the ``data_transformation`` section.

        The section's ``root_dir`` is passed to ``create_dirs`` first, so the
        stage's output directory is prepared before the config is returned.

        Returns:
            DataTransformationConfig populated with the section's values.
        """
        section = self.config.data_transformation

        create_dirs([section.root_dir])

        # Map config keys to dataclass fields; note the STATUS_FILE -> Status_file rename.
        settings = {
            "root_dir": section.root_dir,
            "data_path": section.data_path,
            "Status_file": section.STATUS_FILE,
            "train_df_path": section.train_df_path,
            "test_df_path": section.test_df_path,
        }
        return DataTransformationConfig(**settings)
        

In [16]:
# Pretty-print the parsed config so the data_transformation section can be inspected.
config_preview = read_yaml(CONFIG_FILE_PATH)
pprint(config_preview)

[2023-09-15 10:36:46,213: INFO: utils: yaml file: config\config.yaml loaded successfully]
ConfigBox({'artifacts_root': 'artifacts', 'data_validation': {'root_dir': 'artifacts\\validation', 'data_dir': 'artifacts\\cleand_data\\billionaires.csv', 'STATUS_FILE': 'artifacts\\cleand_data\\data_validation_status.txt'}, 'data_transformation': {'root_dir': 'artifacts\\transformed_data', 'data_path': 'artifacts\\cleand_data\\encoded_data.csv', 'STATUS_FILE': 'artifacts\\cleand_data\\data_validation_status.txt', 'train_df_path': 'artifacts\\transformed_data\\train.csv', 'test_df_path': 'artifacts\\transformed_data\\test.csv'}})


## Components

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

In [19]:
class TransformData:
    """Split the input dataset into train/test sets and min-max scale its features.

    Input and output locations come from the ``DataTransformationConfig``
    passed to the constructor.
    """

    # Columns removed before scaling.
    # NOTE(review): net_worth looks like the prediction target; it is dropped
    # here and is not written to the output CSVs, so downstream steps must get
    # it from elsewhere — confirm this is intended.
    COLUMNS_TO_DROP = ['net_worth', 'full_name']

    # Columns sent through MinMaxScaler; all other remaining columns are
    # passed through unchanged (remainder='passthrough').
    COLUMNS_TO_SCALE = ['rank', 'age', 'country_of_citizenship', 'business_category', 'wealth_status']

    def __init__(self,config_transform_data:DataTransformationConfig) -> None:
        """Keep the transformation settings (input CSV path and output CSV paths)."""
        self.trans_config = config_transform_data

    def train_test_split_func(self, test_size=0.23, random_state=42):
        """Read the input CSV and split it into a training and a test frame.

        Args:
            test_size: Share of rows assigned to the test set, forwarded to
                ``train_test_split``. Defaults to 0.23.
            random_state: Seed forwarded to ``train_test_split``. Defaults to
                42 so repeated runs produce the same split; pass ``None`` for
                an unseeded shuffle.

        Returns:
            tuple: ``(train_df, test_df)`` as pandas DataFrames.
        """
        df = pd.read_csv(self.trans_config.data_path)
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        logger.info("Splited data into training and test sets")
        logger.info(train_df.shape)
        logger.info(test_df.shape)

        return train_df, test_df

    def scale_data(self, test_size=0.23, random_state=42):
        """Split the data, scale the feature columns and write both sets to CSV.

        The scaler is fitted on the training features only and then applied to
        the test features, so both sets share the same scale and no test-set
        statistics influence the fit. Results are written to
        ``train_df_path`` and ``test_df_path`` from the config.

        Args:
            test_size: Forwarded to ``train_test_split_func``.
            random_state: Forwarded to ``train_test_split_func``.

        Returns:
            None. The transformed frames are only written to disk.
        """
        train_df, test_df = self.train_test_split_func(
            test_size=test_size, random_state=random_state
        )
        X_train = train_df.drop(columns=self.COLUMNS_TO_DROP)
        X_test = test_df.drop(columns=self.COLUMNS_TO_DROP)

        transformer = ColumnTransformer(
            transformers=[('scale_values', MinMaxScaler(), self.COLUMNS_TO_SCALE)],
            remainder='passthrough',
        )

        # Fit on the training data only, then reuse the fitted scaler for the
        # test data. Re-fitting on the test set would scale it with its own
        # min/max. Test values outside the training range can therefore fall
        # outside [0, 1].
        X_train_transformed = transformer.fit_transform(X_train)
        X_test_transformed = transformer.transform(X_test)

        feature_names = transformer.get_feature_names_out()
        train_transformed = pd.DataFrame(data=X_train_transformed, columns=feature_names)
        test_transformed = pd.DataFrame(data=X_test_transformed, columns=feature_names)

        train_transformed.to_csv(self.trans_config.train_df_path, index=False)
        test_transformed.to_csv(self.trans_config.test_df_path, index=False)

        logger.info("transformed data")
        logger.info(train_transformed.shape)
        logger.info(test_transformed.shape)
    
        

## Pipeline

In [20]:
# Run the data-transformation stage end to end: load the configuration, build
# the stage config, then split, scale and write the data.
# The former `except Exception as e: raise e` wrapper was removed: it re-raised
# the same exception without handling it, so errors now propagate directly.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = TransformData(config_transform_data=data_transformation_config)
data_transformation.scale_data()

[2023-09-15 10:37:34,545: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2023-09-15 10:37:34,562: INFO: utils: yaml file: schema.yaml loaded successfully]
[2023-09-15 10:37:34,564: INFO: utils: Created artifacts]
[2023-09-15 10:37:34,565: INFO: utils: Created artifacts\transformed_data]
[2023-09-15 10:37:34,588: INFO: 3388847474: Splited data into training and test sets]
[2023-09-15 10:37:34,589: INFO: 3388847474: (1800, 9)]
[2023-09-15 10:37:34,590: INFO: 3388847474: (538, 9)]
[2023-09-15 10:37:34,637: INFO: 3388847474: transformed data]
[2023-09-15 10:37:34,640: INFO: 3388847474: (1800, 7)]
[2023-09-15 10:37:34,641: INFO: 3388847474: (538, 7)]
