In [78]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [79]:
df = pd.read_csv("../data/processed/financial_fraud_data.csv")

In [80]:
df.head(5)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,278,CASH_IN,330218.42,20866.0,351084.42,452419.57,122201.15,0,0
1,15,PAYMENT,11647.08,30370.0,18722.92,0.0,0.0,0,0
2,10,CASH_IN,152264.21,106589.0,258853.21,201303.01,49038.8,0,0
3,403,TRANSFER,1551760.63,0.0,0.0,3198359.45,4750120.08,0,0
4,206,CASH_IN,78172.3,2921331.58,2999503.88,415821.9,337649.6,0,0


In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            50000 non-null  int64  
 1   type            50000 non-null  object 
 2   amount          50000 non-null  float64
 3   oldbalanceOrg   50000 non-null  float64
 4   newbalanceOrig  50000 non-null  float64
 5   oldbalanceDest  50000 non-null  float64
 6   newbalanceDest  50000 non-null  float64
 7   isFraud         50000 non-null  int64  
 8   isFlaggedFraud  50000 non-null  int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 3.4+ MB


From our data lets first identify categorical object data types with less than 50 unique values, then after that we want to convert the object type columns back into numeric identifiers using the LabelEncoder and in the end we want to use it to train our model.

In [82]:
# We are making a list of categorical columns where unique samples are less than 50.

categorical_columns = []
for column in df.columns:
    if df[column].dtype == object and len(df[column].unique()) <= 50:
        categorical_columns.append(column)

In [83]:
# We only appended the "type" column in this case because its the only column which type is "object" and the number of unique values is less than 50.
 
categorical_columns

['type']

In [84]:
# Next we want to use label encoders to help us transform categorical values into numerical labels.

label = LabelEncoder()
for column in categorical_columns:
    df[column] = label.fit_transform(df[column])

In [85]:
df.head(5)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,278,0,330218.42,20866.0,351084.42,452419.57,122201.15,0,0
1,15,3,11647.08,30370.0,18722.92,0.0,0.0,0,0
2,10,0,152264.21,106589.0,258853.21,201303.01,49038.8,0,0
3,403,4,1551760.63,0.0,0.0,3198359.45,4750120.08,0,0
4,206,0,78172.3,2921331.58,2999503.88,415821.9,337649.6,0,0


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            50000 non-null  int64  
 1   type            50000 non-null  int64  
 2   amount          50000 non-null  float64
 3   oldbalanceOrg   50000 non-null  float64
 4   newbalanceOrig  50000 non-null  float64
 5   oldbalanceDest  50000 non-null  float64
 6   newbalanceDest  50000 non-null  float64
 7   isFraud         50000 non-null  int64  
 8   isFlaggedFraud  50000 non-null  int64  
dtypes: float64(5), int64(4)
memory usage: 3.4 MB


In [87]:
# Now lets create a "X" and "y" training set. We want to define our feature Matrix and our Target variable
# Then we will use the train test split on our newly created X and y variable

X = df.drop("isFraud", axis=1)
y = df["isFraud"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [88]:
# Lets use the "print score" method to generate a report for us

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [89]:
# Now we are ready to create our simple DecisionTreeClassifier object.

tree_clf = DecisionTreeClassifier(random_state=42)

#Lets fit it on our newly created "X" and "y" dataset. 

tree_clf.fit(X_train, y_train)

In [90]:
# We will generate two reports. One for X_train which would be when train = True and one for X_test for when train = False

print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.91%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999399   0.800000  0.999133      0.899700      0.999067
recall         0.999733   0.640000  0.999133      0.819866      0.999133
f1-score       0.999566   0.711111  0.999133      0.855339      0.999085
support    14975.000000  25.000000  0.999133  15000.000000  15000.000000
___________________

In [91]:
# Now lets find the best possible hyperparameters, using GridSearchCV and print our results.

params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}

tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(
    tree_clf, 
    params, 
    scoring="f1", 
    n_jobs=-1, 
    verbose=1, 
    cv=5
)

tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(X_train, y_train)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Fitting 5 folds for each of 4332 candidates, totalling 21660 fits
Best paramters: {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'best'})
Train Result:
Accuracy Score: 99.96%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999743   0.900000  0.999629      0.949871      0.999614
recall         0.999886   0.800000  0.999629      0.899943      0.999629
f1-score       0.999814   0.847059  0.999629      0.923436      0.999618
support    34955.000000  45.000000  0.999629  35000.000000  35000.000000
_______________________________________________
Confusion Matrix: 
 [[34951     4]
 [    9    36]]

Test Result:
Accuracy Score: 99.92%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999399   0.842105    0.9992      0.9207

In [92]:
# Now lets create a 'RandomForestClassifier' with 100 trees and fit our random forest with our X and Y trained data.

rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)

In [93]:
# Print a report of how well our model performed.

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999466   1.000000  0.999467      0.999733      0.999467
recall         1.000000   0.680000  0.999467      0.840000      0.999467
f1-score       0.999733   0.809524  0.999467      0.904628      0.999416
support    14975.000000  25.000000  0.999467  15000.000000  15000.000000
___________________

In [95]:
# From our RandomForest model we will be searching for the best hyperparameters using RandomSearchCV

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf, 
    'bootstrap': bootstrap
}

rf_clf = RandomForestClassifier(random_state=42)
rf_cv = RandomizedSearchCV(
    estimator=rf_clf, 
    scoring='f1',
    param_distributions=random_grid, 
    n_iter=200, 
    cv=5, 
    verbose=1, 
    random_state=42,
    n_jobs=-1
)

rf_cv.fit(X_train, y_train)
rf_best_params = rf_cv.best_params_
print(f"Best paramters: {rf_best_params})")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


460 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
317 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jacob/anaconda3/envs/phase1/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jacob/anaconda3/envs/phase1/lib/python3.9/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/jacob/anaconda3/envs/phase1/lib/python3.9/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/jacob/anaconda3/envs/phase1/lib/python3.9/site-packages/sklearn/utils/_param_v

Best paramters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True})


In [96]:
rf_clf = RandomForestClassifier(**rf_best_params)
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999466   1.000000  0.999467      0.999733      0.999467
recall         1.000000   0.680000  0.999467      0.840000      0.999467
f1-score       0.999733   0.809524  0.999467      0.904628      0.999416
support    14975.000000  25.000000  0.999467  15000.000000  15000.000000
___________________