In [5]:
import os
import pandas as pd
import numpy as np

In [6]:
#Set notebook preferences
def _three_decimals(value):
    """Render a number as fixed-point text with three digits after the decimal point."""
    return '%.3f' % value

# Have pandas use the formatter above whenever it prints floating-point values.
pd.options.display.float_format = _three_decimals

In [7]:
#Load modules
# Single source of truth for the project location. The default is the original
# checkout path; set the PAYSIM_PROJECT_ROOT environment variable to run the
# notebook from a different machine or directory without editing this cell.
path= os.environ.get(
    'PAYSIM_PROJECT_ROOT',
    r'/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/paysim_credit_fraud_analysis/')
# Guarantee a trailing separator so the value behaves like the original literal
# (no-op for the default, which already ends with '/').
path= os.path.join(path, '')

# Work from the project root -- presumably so the project-local `src` package
# and any relative paths in the config resolve; confirm if the layout changes.
os.chdir(path)
from src.data.import_export_data import load_config

#Import data
config_name= 'config.yaml'

config= load_config(config_name= config_name, path=path)

# Read 'isFraud' as bool and 'step' as object; the first CSV column becomes the index.
dtypes= {'isFraud':'bool', 'step':'object'}
# os.path.join keeps the file path valid whether or not the configured
# directory ends with a path separator.
rawData= pd.read_csv(os.path.join(config['paths']['cleanedData'], 'processedData.csv'),
                     dtype= dtypes, index_col= [0])

**Data Overview**

In [8]:
# Show the frame's (rows, columns) shape, then a preview of its first rows.
display(rawData.shape, rawData.head())

(6362620, 6)

Unnamed: 0,amount,isFraud,oldbalanceDest,oldbalanceOrg,step,type
0,9839.64,False,0.0,170136.0,1,PAYMENT
1,1864.28,False,0.0,21249.0,1,PAYMENT
2,181.0,True,0.0,181.0,1,TRANSFER
3,181.0,True,21182.0,181.0,1,CASH_OUT
4,11668.14,False,0.0,41554.0,1,PAYMENT


In [9]:
# Print a summary of the frame: index range, per-column dtypes, and memory usage.
rawData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6362620 entries, 0 to 6362619
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   amount          float64
 1   isFraud         bool   
 2   oldbalanceDest  float64
 3   oldbalanceOrg   float64
 4   step            object 
 5   type            object 
dtypes: bool(1), float64(3), object(2)
memory usage: 297.3+ MB


**Preprocess Data**

In [14]:
#Create ColumnTransformer Preprocessor for training data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Derive the feature columns from rawData instead of X: X is only created in a
# later cell, so referencing it here raises NameError on a fresh kernel
# (Restart & Run All). A zero-row slice keeps every column's dtype while
# avoiding a copy of the full dataset; the target column is removed so it is
# never treated as a feature.
feature_schema= rawData.iloc[:0].drop(columns= 'isFraud')

# float64 columns are standardized.
numeric_features= list(feature_schema.select_dtypes(include='float64').columns)
num_transformer= StandardScaler()

# Every non-float64 column is one-hot encoded; handle_unknown='ignore' makes
# categories unseen during fit encode as all zeros instead of raising.
cat_features= list(feature_schema.select_dtypes(exclude='float64').columns)
cat_transformer= OneHotEncoder(handle_unknown= 'ignore')

# Route each group of columns to its transformer.
preprocessor= ColumnTransformer(transformers=[('num', num_transformer, numeric_features),
                                             ('cat', cat_transformer, cat_features)])

In [None]:
#Split data into X and y
# X holds every column except the fraud label; y is the label column itself.
target_column= 'isFraud'
X= rawData.drop(columns= target_column)
y= rawData[target_column]

#Initialize Data object
# Data is a project-local class (src.models.modeling); it is built from the
# feature frame and the label series.
from src.models.modeling import Data
baseData= Data(X, y)

#Sample 

In [16]:
#Init base models for evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Fixed seed for the estimators that draw random numbers during fitting, so
# that model comparisons are reproducible from one notebook run to the next.
RANDOM_STATE= 42

# Candidate classifiers, all otherwise at their library defaults.
baseModels= [LogisticRegression(),
            RandomForestClassifier(random_state= RANDOM_STATE),
            KNeighborsClassifier(),
            GaussianNB(),
            GradientBoostingClassifier(random_state= RANDOM_STATE),
            AdaBoostClassifier(random_state= RANDOM_STATE)]

In [None]:
#Init scoring for ml models
from sklearn.metrics import confusion_matrix, fbeta_score, make_scorer

# Wrap fbeta_score with beta=2 as a scorer object so it can be listed next to
# the built-in 'recall' scorer name.
f2_scorer= make_scorer(fbeta_score, beta= 2)

# Maps a display label to each metric.
scoring= {'Recall': 'recall', 'F2': f2_scorer}

**Test base models on baseline data (no over- or under-sampling applied)**