In [1]:
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler



In [2]:
# Read csv file into dataframe
# The path is relative, so it is resolved against the notebook's working directory.
filepath = Path('Resources/subset_0.02.csv')
# index_col=0 turns the first CSV column into the DataFrame index instead of a feature
# (presumably the row ids of the original, un-sampled dataset -- TODO confirm).
sample_dataframe = pd.read_csv(filepath, index_col=0)

# Preview the first five rows
sample_dataframe.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6119259,538,CASH_OUT,29832.89,C2129049627,20115.0,0.0,C1186246851,3601065.12,3630898.01,0,0
2880376,228,PAYMENT,12253.36,C189226541,0.0,0.0,M851931979,0.0,0.0,0,0
209982,13,CASH_OUT,363422.81,C1840683382,489494.01,126071.19,C2059327253,891476.53,1390155.98,0,0
4682313,331,PAYMENT,5602.16,C1044679446,506459.64,500857.48,M909938358,0.0,0.0,0,0
1553083,154,CASH_OUT,11648.69,C1026355309,0.0,0.0,C492088409,29198.44,40847.13,0,0


In [3]:
sample_dataframe.shape

(127252, 11)

In [None]:
# NOTE: the sampling step (previously `sample_dataframe = df.sample(n=10000)` followed by
# `sample_dataframe.head()`) has been moved to a separate module, so this cell runs no code.

In [4]:
# Drop the columns that are not used as model features: the two account-name
# columns and the `isFlaggedFraud` flag.
# A single non-mutating drop with errors='ignore' keeps this cell safe to re-run;
# the previous three `inplace=True` drops raised KeyError on a second execution
# because the columns were already gone.
sample_dataframe = sample_dataframe.drop(
    columns=['nameOrig', 'nameDest', 'isFlaggedFraud'],
    errors='ignore',
)

# Review the updated DataFrame
sample_dataframe.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
6119259,538,CASH_OUT,29832.89,20115.0,0.0,3601065.12,3630898.01,0
2880376,228,PAYMENT,12253.36,0.0,0.0,0.0,0.0,0
209982,13,CASH_OUT,363422.81,489494.01,126071.19,891476.53,1390155.98,0
4682313,331,PAYMENT,5602.16,506459.64,500857.48,0.0,0.0,0
1553083,154,CASH_OUT,11648.69,0.0,0.0,29198.44,40847.13,0


## Begin Exploratory Data Analysis

In [5]:
# Use the describe function to learn more about each feature
# (only the numeric columns are summarised; the categorical `type` column is left out by default)
sample_dataframe.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
count,127252.0,127252.0,127252.0,127252.0,127252.0,127252.0,127252.0
mean,242.93,181806.36,833982.54,855418.01,1108539.79,1235061.4,0.0
std,142.43,627305.2,2871910.4,2908634.42,3390705.68,3686276.51,0.04
min,1.0,0.23,0.0,0.0,0.0,0.0,0.0
25%,155.0,13495.92,0.0,0.0,0.0,0.0,0.0
50%,238.0,75509.98,13973.12,0.0,135539.56,219937.53,0.0
75%,335.0,209079.05,108697.0,146450.19,940910.24,1109575.78,0.0
max,738.0,51729490.03,38441831.6,38563401.41,327998074.22,328431698.23,1.0


In [6]:
# Use the info function to learn more about the nature of the DataFrame and its columns
# (index type, per-column non-null counts and dtypes, and approximate memory usage)
sample_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127252 entries, 6119259 to 4801485
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            127252 non-null  int64  
 1   type            127252 non-null  object 
 2   amount          127252 non-null  float64
 3   oldbalanceOrg   127252 non-null  float64
 4   newbalanceOrig  127252 non-null  float64
 5   oldbalanceDest  127252 non-null  float64
 6   newbalanceDest  127252 non-null  float64
 7   isFraud         127252 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 8.7+ MB


In [7]:
# Encode the feature(s) that are categorical (one-hot / dummy encoding).
# dtype=int pins the indicator columns to 0/1 integers on every pandas version;
# without it pandas >= 2.0 emits booleans, so the encoded frame would differ
# depending on the installed release.
# Re-running the cell is harmless: once no object columns remain, get_dummies
# returns the frame unchanged.
sample_dataframe = pd.get_dummies(sample_dataframe, dtype=int)
# Review the encoded DataFrame
sample_dataframe

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
6119259,538,29832.89,20115.00,0.00,3601065.12,3630898.01,0,0,1,0,0,0
2880376,228,12253.36,0.00,0.00,0.00,0.00,0,0,0,0,1,0
209982,13,363422.81,489494.01,126071.19,891476.53,1390155.98,0,0,1,0,0,0
4682313,331,5602.16,506459.64,500857.48,0.00,0.00,0,0,0,0,1,0
1553083,154,11648.69,0.00,0.00,29198.44,40847.13,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5969772,406,10786.77,1295.00,0.00,0.00,0.00,0,0,0,0,1,0
1693937,159,1797.71,5869.53,4071.82,0.00,0.00,0,0,0,0,1,0
3234972,250,244908.46,10145615.13,10390523.59,323072.24,78163.78,0,1,0,0,0,0
2331095,188,2509.44,14664.00,12154.56,0.00,0.00,0,0,0,0,1,0


In [8]:
# Define the X features and the y target
X = sample_dataframe.drop(columns=['isFraud'])
y = sample_dataframe['isFraud']
# Determine how many of the target values are 'isFraud'
print(sample_dataframe.isFraud.value_counts())

0    127070
1       182
Name: isFraud, dtype: int64


In [9]:
# Split the DataFrame into train and test groups
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# Review the X_train set
X_train

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
77483,10,249028.01,0.00,0.00,424595.72,678009.49,0,1,0,0,0
146039,12,224070.93,2660.00,0.00,0.00,224070.93,0,1,0,0,0
1207375,133,483654.16,1515457.11,1999111.27,1007878.32,524224.16,1,0,0,0,0
2707226,211,41935.17,0.00,0.00,390559.96,432495.14,0,1,0,0,0
5835285,402,188621.35,0.00,0.00,373473.22,562094.57,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2933902,229,131141.35,143878.90,12737.56,1766666.79,1897808.14,0,1,0,0,0
2146191,184,217837.99,191.00,218028.99,0.00,0.00,1,0,0,0,0
638146,35,272003.38,0.00,0.00,958252.92,1230256.30,0,1,0,0,0
279956,15,10616.34,210169.52,199553.18,0.00,0.00,0,0,0,1,0


In [10]:
# Instantiate the StandardScaler and fit it to the X_train dataset
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [11]:
# Apply the scaler to the train and test features
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Instantiate the RandomOverSampler class
# random_state=1 fixes the seed so the resampled rows are reproducible between runs
random_oversampler = RandomOverSampler(random_state=1)

In [13]:
# Apply the oversampler to the scaled training data only (the test split is not resampled)
X_resampled, y_resampled = random_oversampler.fit_resample(X_train_scaled, y_train)

In [14]:
# Verify that the number of 'isFraud' rows equals the number of non-'isFraud' rows after resampling
pd.Series(y_resampled).value_counts()

1    88942
0    88942
Name: isFraud, dtype: int64

In [15]:
# Instantiate Lazy Predict Classification and benchmark its suite of models:
# train on the oversampled, scaled training data and score on the scaled test split.
# NOTE: this is by far the slowest cell in the notebook (about 40 minutes in the recorded run).
clf = LazyClassifier(verbose=0, custom_metric=None)
models, predictions = clf.fit(X_resampled, X_test_scaled, y_resampled, y_test)

# Leave the scoreboard as the last expression so it renders as a rich table;
# print() wrapped the wide frame across several plain-text blocks.
models

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [39:24<00:00, 81.54s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.99               0.99     0.99      0.99   
LinearSVC                          0.95               0.97     0.97      0.97   
LogisticRegression                 0.94               0.96     0.96      0.97   
SGDClassifier                      0.94               0.96     0.96      0.97   
SVC                                0.95               0.95     0.95      0.97   
PassiveAggressiveClassifier        0.89               0.95     0.95      0.94   
CalibratedClassifierCV             0.89               0.94     0.94      0.94   
LGBMClassifier                     1.00               0.92     0.92      1.00   
Perceptron                         0.99               0.91     0.91      0.99   
XGBClassifier                      1.00               0.91     0.91      1.00   
ExtraTreesClassifier        


