In [1]:
import os

In [2]:
os.chdir('../')

In [3]:
%pwd

'c:\\Chandu\\WorkSpace\\Learnings\\Repos\\End-to-End-CreditCardFraudDetection-Project'

In [7]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path



In [15]:
from creditcard_fraud_detection.constants import *
from creditcard_fraud_detection.utils.common import read_yaml, create_directories

In [35]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        
        #self.config = read_yaml(config_filepath)
        #self.params = read_yaml(params_filepath)
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        #create_directories([self.config.artifacts_root])
        create_directories([self.config.artifacts_root])
        
    
    def get_data_transformation(self)->DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir= config.root_dir,
            data_path=config.data_path
        )

        return data_transformation_config

In [13]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from creditcard_fraud_detection.logging import logger
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

In [47]:
class DataTransformation:
    def __init__(self,config:DataTransformationConfig):
        self.config = config
    
    def create_features(self):
        df = pd.read_csv(os.path.join(self.config.data_path, 'creditcard.csv'))
        ## rescaling Amount and time
        df['Amount'] = RobustScaler().fit_transform(df['Amount'].to_numpy().reshape(-1,1))
        time = df['Time']
        df['Time'] = (time-time.min())/(time.max()-time.min())
        df = df.sample(frac=1, random_state=1)

        df_np = df.to_numpy()
        y=df_np[:,-1]
        X = df_np[:,:-1]
        X_train, X_temp, y_train, y_temp = train_test_split(X,y, test_size=0.3, random_state=42)
        X_test,X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
        logger.info(f"Train Size: {len(y_train)}, Test Size: {len(y_test)}, Val Size: {len(y_val)}")

        train_df = pd.DataFrame(X_train)
        train_df['Class'] = y_train
        
        test_df = pd.DataFrame(X_test)
        test_df['Class'] = y_test

        val_df = pd.DataFrame(X_val)
        val_df['Class'] = y_val

        fraud = df.query('Class == 1')
        not_fraud = df.query('Class == 0')

        balanced_df = pd.concat([fraud, not_fraud.sample(len(fraud), random_state=100)])
        logger.info(f"Created a balanced df of size: {len(balanced_df)}")
        return balanced_df, test_df, val_df
    
    def convert(self):
        train, test, val = self.create_features()
        train.to_csv(os.path.join(self.config.root_dir,'train.csv'))
        test.to_csv(os.path.join(self.config.root_dir,'test.csv'))
        val.to_csv(os.path.join(self.config.root_dir,'val.csv'))
        

In [48]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception as e:
    raise e

[2025-03-27 06:59:34,815: INFO: common: yaml file: config\config.yaml loaded succesfully]
[2025-03-27 06:59:34,820: INFO: common: yaml file: params.yaml loaded succesfully]
[2025-03-27 06:59:34,824: INFO: common: created directory at : artifacts]
[2025-03-27 06:59:34,828: INFO: common: created directory at : artifacts/data_transformation]
[2025-03-27 06:59:41,308: INFO: 238895040: Train Size: 199364, Test Size: 42721, Val Size: 42722]
[2025-03-27 06:59:41,457: INFO: 238895040: Created a balanced df of size: 984]
