In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
# Notebook-wide display preferences.
def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Used by pandas whenever it renders floating-point values.
pd.options.display.float_format = _three_decimals

# Plot theme applied to every matplotlib figure created after this cell.
plt.style.use('Solarize_Light2')

In [3]:
import yaml

# Project root that holds the config file. The PAYSIM_PROJECT_DIR environment
# variable overrides it, so the notebook can run on another machine without
# editing this cell; the original location is kept as the fallback.
path= os.environ.get(
    'PAYSIM_PROJECT_DIR',
    r'/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/paysim_credit_fraud_analysis/',
)
config_name= 'config.yaml'

#read yaml file (explicit encoding so the result does not depend on the platform default)
with open(os.path.join(path, config_name), encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Load the cleaned transactions, reading both label columns as integers.
df= pd.read_csv(config['paths']['cleanedData'], dtype={'isFraud':'int', 'isFlaggedFraud':'int'})

# Drop the 'Unnamed: 0' column (presumably the index written out when the CSV was
# saved -- confirm). drop() returns a new frame, so no in-place mutation is needed.
df= df.drop(columns='Unnamed: 0')

**Data Overview**

In [4]:
# Preview the loaded transactions; pandas abbreviates the display of a large frame.
df

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.640,170136.000,160296.360,0.000,0.000,0,0
1,1,PAYMENT,1864.280,21249.000,19384.720,0.000,0.000,0,0
2,1,TRANSFER,181.000,181.000,0.000,0.000,0.000,1,0
3,1,CASH_OUT,181.000,181.000,0.000,21182.000,0.000,1,0
4,1,PAYMENT,11668.140,41554.000,29885.860,0.000,0.000,0,0
...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.130,339682.130,0.000,0.000,339682.130,1,0
6362616,743,TRANSFER,6311409.280,6311409.280,0.000,0.000,0.000,1,0
6362617,743,CASH_OUT,6311409.280,6311409.280,0.000,68488.840,6379898.110,1,0
6362618,743,TRANSFER,850002.520,850002.520,0.000,0.000,0.000,1,0


**Preprocessing**

In [5]:
#Drop the flag column; drop() already returns a new frame, so df itself is untouched
#and no extra copy of the full table is needed
temp= df.drop(columns='isFlaggedFraud')

#Create dummies
#The target is excluded by name rather than by position, so the encoding does not
#depend on 'isFraud' being the last column. dtype is pinned to uint8 so the indicator
#columns stay 0/1 integers regardless of the pandas version's default.
dummies= pd.get_dummies(temp.drop(columns='isFraud'), drop_first=False, dtype='uint8')

#Re-attach the target as the last column; a vectorized cast replaces the per-row lambda
dummies= dummies.assign(isFraud=temp['isFraud'].astype('int64'))

dummies.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,1,0,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,1,0,0
2,1,181.0,181.0,0.0,0.0,0.0,0,0,0,0,1,1
3,1,181.0,181.0,0.0,21182.0,0.0,0,1,0,0,0,1
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,1,0,0


In [6]:
# NOTE: this whole cell is commented out, so no train/test split is created by this notebook.
# The disabled code would take every column of `dummies` except the last as features and the
# last column as the target, then hold out 10% of the rows as test data.
# NOTE(review): fraud cases appear to be rare in this data; if the split is re-enabled,
# consider passing stratify=y to train_test_split -- confirm against the class counts.
# #Split data
# X= dummies.iloc[:,:-1].values
# y= dummies.iloc[:,-1].values

# #Create training and test data
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 24, test_size=.1)


# #Sanity check
# for item in [X_train, X_test, y_train, y_test]:
#     print(item.shape)

In [7]:
# TODO: decide how to handle the very small number of fraud cases (class imbalance)

In [8]:
# Show the 'pyScripts' entry from the 'paths' section of the loaded config.
config['paths']['pyScripts']

'/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/paysim_credit_fraud_analysis/src'

In [9]:
# Switch the kernel's working directory to the configured scripts directory,
# presumably so that the local `models` package resolves on import.
# Side effect: relative paths used by any later cell now resolve from this directory.
scripts_dir = config['paths']['pyScripts']
os.chdir(scripts_dir)

from models.metrics import classification_metrics

In [10]:
# Summarize the loaded frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   oldbalanceOrg   float64
 4   newbalanceOrig  float64
 5   oldbalanceDest  float64
 6   newbalanceDest  float64
 7   isFraud         int64  
 8   isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 436.9+ MB


In [11]:
# Score the dataset's own 'isFlaggedFraud' column as if it were a classifier's
# predictions, using 'isFraud' as the ground truth.
flagged_values = df['isFlaggedFraud'].values
fraud_values = df['isFraud'].values

classification_metrics(predictions=flagged_values, actual=fraud_values)

{'accuracy': 0.999, 'precision': 1.0, 'recall': 0.002, 'f1_score': 0.004}

In [12]:
#Evaluate previous model predictions

Save trained model for Deployment

In [None]:
# TODO: fill in the destination for the saved model -- the path is still an empty placeholder.
# NOTE(review): this rebinds `path`, which the config-loading cell uses for the project root;
# consider a distinct name to avoid confusion on re-runs.
path= r''