In [1]:
import os

In [2]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection\\Exp'

In [3]:
os.chdir('../')

In [4]:
pwd%%

'f:\\Files\\DS&ML\\E2E-Credit-Fraud-Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    target_column: str
    preprocessor_path: Path
    label_encoder: Path
    categorical_columns: list
    numeric_columns: list
    columns_to_drop: list
    test_size: float
    random_state: int

In [6]:
from project.constants import *
from project.utils.common import *

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_PATH,
        params_filepath = PARAMS_PATH,
        schema_filepath = SCHEMA_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        schema = self.schema
        params = self.params.train_test_split

        create_directories([config.root_dir])
        
        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            target_column=schema.target_column.name,
            preprocessor_path=config.preprocessor_path,
            label_encoder=config.label_encoder,
            categorical_columns=schema.categorical_columns,
            numeric_columns=schema.numeric_columns,
            columns_to_drop=schema.data_cleaning.columns_to_drop,
            test_size=params.test_size,
            random_state=params.random_state
        )
        
        return data_transformation_config

In [8]:
import pandas as pd
import numpy as np
import joblib

In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
import joblib
from project import logger

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.columns_to_drop = config.columns_to_drop
        self.target_column = config.target_column
        self.label_encoders = {}
        self.categorical_columns = config.categorical_columns
        self.numerical_columns = config.numeric_columns
        self.test_size = config.test_size
        self.random_state = config.random_state
        

    def preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            data = data.copy()

            data.drop(columns=self.columns_to_drop, inplace=True, errors='ignore')
            
            for column in self.categorical_columns:
                if column in data.columns:
                    le = LabelEncoder()
                    data[column] = le.fit_transform(data[column].astype(str))
                    self.label_encoders[column] = le            
            
            
            
            os.makedirs(os.path.dirname(self.config.label_encoder), exist_ok=True)
            joblib.dump(self.label_encoders, self.config.label_encoder)
            logger.info(f"Saved label encoders to {self.config.label_encoder}")
            
            return data
            
        except Exception as e:
            logger.error(f"Error in preprocess_data: {str(e)}")
            raise e

    def train_test_splitting(self):
        try:
            logger.info(f"Loading data from {self.config.data_path}")
            data = pd.read_csv(self.config.data_path)
            
            data = self.preprocess_data(data)
            data = data.dropna()
            
            
            X = data.drop(columns=[self.target_column])
            y = data[self.target_column]
            
            smote = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
            X_resampled, y_resampled = smote.fit_resample(X, y)
            
            X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
            resampled_data = X_resampled.copy()
            resampled_data[self.target_column] = y_resampled
            
            train, test = train_test_split(resampled_data, test_size= self.test_size, random_state=self.random_state)
            
            train_path = os.path.join(self.config.root_dir, "train.csv")
            test_path = os.path.join(self.config.root_dir, "test.csv")
            train.to_csv(train_path, index=False)
            test.to_csv(test_path, index=False)

            logger.info("Split data into training and test sets")
            logger.info(f"Training data shape: {train.shape}")
            logger.info(f"Test data shape: {test.shape}")

            return train, test
            
        except Exception as e:
            logger.error(f"Error in train_test_splitting: {e}")
            raise e
    
    def preprocess_features(self, train, test):
        try:
            numerical_columns = self.numerical_columns
            categorical_columns = self.categorical_columns

            if self.target_column in categorical_columns:
                categorical_columns.remove(self.target_column)

            logger.info(f"Numerical columns: {list(numerical_columns)}")
            logger.info(f"Categorical columns: {list(categorical_columns)}")

            num_pipeline = Pipeline(steps=[
                ("scaler", StandardScaler())
            ])
            
            preprocessor = ColumnTransformer(
                transformers=[
                    ("num", num_pipeline, numerical_columns)
                ],
                remainder="passthrough"
            )

            train_x = train.drop(columns=[self.target_column])
            test_x = test.drop(columns=[self.target_column])
            train_y = train[self.target_column]
            test_y = test[self.target_column]

            train_processed = preprocessor.fit_transform(train_x)
            test_processed = preprocessor.transform(test_x)

            train_y = train_y.values.reshape(-1, 1)
            test_y = test_y.values.reshape(-1, 1)

            train_combined = np.hstack((train_processed, train_y))
            test_combined = np.hstack((test_processed, test_y))

            joblib.dump(preprocessor, self.config.preprocessor_path)
            logger.info(f"Preprocessor saved at {self.config.preprocessor_path}")

            train_processed_path = os.path.join(self.config.root_dir, "train_processed.npy")
            test_processed_path = os.path.join(self.config.root_dir, "test_processed.npy")
            
            np.save(train_processed_path, train_combined)
            np.save(test_processed_path, test_combined)

            logger.info("Preprocessed train and test data saved successfully.")
            return train_processed, test_processed

        except Exception as e:
            logger.error(f"Error in preprocess_features: {e}")
            raise e

In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    train, test = data_transformation.train_test_splitting()
    train_processed, test_processed = data_transformation.preprocess_features(train, test)
except FileNotFoundError as e:
    logger.error(f"File not found: {e}")
except KeyError as e:
    logger.error(f"Missing key in configuration: {str(e)}")
except Exception as e:
    logger.error(f"Unexpected error: {str(e)}") 

[2025-04-21 17:44:12,850: INFO: common: yaml file: yaml file\config.yaml loaded successfully]
[2025-04-21 17:44:12,870: INFO: common: yaml file: yaml file\params.yaml loaded successfully]
[2025-04-21 17:44:12,874: INFO: common: yaml file: yaml file\schema.yaml loaded successfully]
[2025-04-21 17:44:12,877: INFO: common: created directory at: artifacts]
[2025-04-21 17:44:12,878: INFO: common: created directory at: artifacts/data_transformation]
[2025-04-21 17:44:12,878: INFO: 2182098855: Loading data from artifacts/data_ingestion/Fraud-data.csv]
[2025-04-21 17:44:13,017: INFO: 2182098855: Saved label encoders to artifacts/data_transformation/label_encoders.pkl]
[2025-04-21 17:44:14,100: INFO: 2182098855: Split data into training and test sets]
[2025-04-21 17:44:14,100: INFO: 2182098855: Training data shape: (69840, 10)]
[2025-04-21 17:44:14,100: INFO: 2182098855: Test data shape: (17460, 10)]
[2025-04-21 17:44:14,100: INFO: 2182098855: Numerical columns: ['Transaction_Amount', 'Time_of_