In [1]:
import os

In [2]:
%pwd

'c:\\Users\\pcx\\Desktop\\Projects\\credit\\FraudApp\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\pcx\\Desktop\\Projects\\credit\\FraudApp'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataCleaningConfig:
    root_dir: Path
    cleaned_data: Path
    unzip_data_dir: Path

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        ):

        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_cleaned_data_config(self) -> DataCleaningConfig:
        config = self.config.data_cleaning

        create_directories([config.root_dir])

        data_cleaning_config = DataCleaningConfig(
            root_dir=config.root_dir,
            cleaned_data=config.cleaned_data,
            unzip_data_dir = config.unzip_data_dir,
        )

        return data_cleaning_config

In [8]:
import os
from mlProject import logger
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

In [25]:
class DataCleaning:
    def __init__(self, config: DataCleaningConfig):
        self.config = config


    def cleaned_data(self) -> pd.DataFrame:
      
        df = pd.read_csv(self.config.unzip_data_dir)

        rob_scaler = RobustScaler()

        df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
        df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

        df.drop(['Time','Amount'], axis=1, inplace=True)

        scaled_amount = df['scaled_amount']
        scaled_time = df['scaled_time']

        df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
        df.insert(0, 'scaled_amount', scaled_amount)
        df.insert(1, 'scaled_time', scaled_time)

        df = df.sample(frac=1)

        fraud_df = df.loc[df['Class'] == 1]
        non_fraud_df = df.loc[df['Class'] == 0][:492]

        normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

        new_df = normal_distributed_df.sample(frac=1, random_state=42)


        v14_fraud = new_df['V14'].loc[new_df['Class'] == 1].values
        q25, q75 = np.percentile(v14_fraud, 25), np.percentile(v14_fraud, 75)
        print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
        v14_iqr = q75 - q25
        print('iqr: {}'.format(v14_iqr))

        v14_cut_off = v14_iqr * 1.5
        v14_lower, v14_upper = q25 - v14_cut_off, q75 + v14_cut_off
        outliers = [x for x in v14_fraud if x < v14_lower or x > v14_upper]
        print('Feature V14 Outliers for Fraud Cases: {}'.format(len(outliers)))
        print('V10 outliers:{}'.format(outliers))

        new_df = new_df.drop(new_df[(new_df['V14'] > v14_upper) | (new_df['V14'] < v14_lower)].index)
        print('----' * 44)

        # -----> V12 removing outliers from fraud transactions
        v12_fraud = new_df['V12'].loc[new_df['Class'] == 1].values
        q25, q75 = np.percentile(v12_fraud, 25), np.percentile(v12_fraud, 75)
        v12_iqr = q75 - q25

        v12_cut_off = v12_iqr * 1.5
        v12_lower, v12_upper = q25 - v12_cut_off, q75 + v12_cut_off
        outliers = [x for x in v12_fraud if x < v12_lower or x > v12_upper]
        new_df = new_df.drop(new_df[(new_df['V12'] > v12_upper) | (new_df['V12'] < v12_lower)].index)

        # Removing outliers V10 Feature
        v10_fraud = new_df['V10'].loc[new_df['Class'] == 1].values
        q25, q75 = np.percentile(v10_fraud, 25), np.percentile(v10_fraud, 75)
        v10_iqr = q75 - q25

        v10_cut_off = v10_iqr * 1.5
        v10_lower, v10_upper = q25 - v10_cut_off, q75 + v10_cut_off
        outliers = [x for x in v10_fraud if x < v10_lower or x > v10_upper]
        new_df = new_df.drop(new_df[(new_df['V10'] > v10_upper) | (new_df['V10'] < v10_lower)].index)


        save_path = os.path.join(self.config.root_dir, "new_df.csv")
        new_df.to_csv(save_path, index=False)

        return new_df

In [26]:
try:
    config = ConfigurationManager()
    data_cleaning_config = config.get_cleaned_data_config()
    data_cleaned = DataCleaning(config=data_cleaning_config)
    data_cleaned.cleaned_data()
except Exception as e:
    raise e

[2023-08-08 22:46:25,393: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-08 22:46:25,398: INFO: common: created directory at: artifacts]
[2023-08-08 22:46:25,404: INFO: common: created directory at: artifacts/data_cleaning]
Quartile 25: -9.692722964972386 | Quartile 75: -4.282820849486865
iqr: 5.409902115485521
Feature V14 Outliers for Fraud Cases: 4
V10 outliers:[-18.4937733551053, -18.8220867423816, -18.0499976898594, -19.2143254902614]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [28]:
pd.read_csv("artifacts/data_cleaning/new_df.csv")

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,0.752603,0.393919,1.731591,0.101507,0.191458,3.891298,-0.401686,0.094574,-0.330199,0.104331,...,-0.188015,-0.164991,-0.663019,0.387081,-0.063736,-0.621421,-0.388404,-0.008604,-0.014052,0
1,-0.296793,0.749257,0.218810,2.715855,-5.111658,6.310661,-0.848345,-0.882446,-2.902079,0.939162,...,0.381682,1.083640,1.037324,0.062325,0.532490,-0.149145,0.639580,0.351568,-0.001817,1
2,0.670719,0.748470,-0.696826,0.117511,1.507643,-0.645674,0.091681,-0.761740,0.522479,-0.012734,...,0.121977,-0.119652,-0.544992,0.245942,0.026449,-0.642922,0.198944,0.061806,0.164818,0
3,2.868721,-0.171771,-0.440095,1.137239,-3.227080,3.242293,-2.033998,-1.618415,-3.028013,0.764555,...,0.895841,0.764187,-0.275578,-0.343572,0.233085,0.606434,-0.315433,0.768291,0.459623,1
4,-0.031719,-0.420682,-2.139051,1.394368,-0.612035,1.049327,-1.162102,-0.768219,-1.997237,0.574997,...,0.025427,0.696955,0.740003,-0.155115,-0.050607,0.268368,-0.469433,-0.405814,-0.152171,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,1.713407,0.429352,-2.628922,2.275636,-3.745369,1.226948,-1.132966,-1.256353,-1.752420,0.281736,...,0.133106,0.870730,1.269473,-0.265494,-0.480549,0.169665,0.096081,0.070036,0.063768,1
943,-0.293440,0.141273,-3.705856,4.107873,-3.803656,1.710314,-3.582466,1.469729,-9.621560,-11.913105,...,3.639603,-5.498772,2.941475,0.916236,-0.255504,-0.183835,-0.584539,-0.315484,-0.097223,1
944,-0.269825,0.335084,2.104008,0.198241,-1.918498,0.271973,0.797614,-0.457342,0.222594,-0.227586,...,-0.083658,-0.384035,-0.948381,0.266275,-0.044431,-0.168611,0.207939,-0.054431,-0.034495,0
945,0.319989,0.107626,-6.185857,7.102985,-13.030455,8.010823,-7.885237,-3.974550,-12.229608,4.971232,...,0.483930,2.502772,0.481691,0.480958,0.360319,-0.293354,-0.199193,-0.203917,0.398927,1
