In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
file_path = pd.read_csv('new_cleaned_dataset.csv')

In [3]:
categorical_col = []
for column in file_path.columns:
    if file_path[column].dtype == object and len(file_path[column].unique()) <= 50:
        categorical_col.append(column)

In [4]:
label = LabelEncoder()
for column in categorical_col:
    file_path[column] = label.fit_transform(file_path[column])

In [5]:
file_path.head(10)

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,330218.42,20866.0,351084.42,452419.57,122201.15,0,False,False,False,0
1,11647.08,30370.0,18722.92,0.0,0.0,0,False,False,True,0
2,152264.21,106589.0,258853.21,201303.01,49038.8,0,False,False,False,0
3,1551760.63,0.0,0.0,3198359.45,4750120.08,0,False,False,False,1
4,78172.3,2921331.58,2999503.88,415821.9,337649.6,0,False,False,False,0
5,915.13,0.0,0.0,0.0,0.0,0,False,False,True,0
6,20603.87,0.0,0.0,558068.66,578672.53,0,True,False,False,0
7,58605.72,0.0,0.0,585494.94,644100.66,0,True,False,False,0
8,4865.11,0.0,0.0,0.0,0.0,0,False,False,True,0
9,118131.63,0.0,0.0,8131691.35,8476246.86,0,True,False,False,0


In [6]:
X = file_path.drop('isFraud', axis=1) #'isFraud'is the target variable that I am trying to predict. 
y = file_path['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [8]:
tree_clf = DecisionTreeClassifier(random_state=42)

tree_clf.fit(X_train, y_train)

In [9]:
X_train_encoded = pd.get_dummies(X_train)

tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_encoded, y_train)

In [10]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.94%
_______________________________________________
CLASSIFICATION REPORT:
                      0      1  accuracy     macro avg  weighted avg
precision      0.999533   0.90    0.9994      0.949766      0.999367
recall         0.999866   0.72    0.9994      0.859933      0.999400
f1-score       0.999700   0.80    0.9994      0.899850      0.999367
support    14975.000000  25.00    0.9994  15000.000000  15000.000000
_______________________________________

In [11]:
params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}

tree_clf = DecisionTreeClassifier(random_state=42)

tree_cv = GridSearchCV(
    tree_clf, 
    params, 
    scoring="f1", 
    n_jobs=-1, 
    verbose=1, 
    cv=5
)

tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(X_train, y_train)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Fitting 5 folds for each of 4332 candidates, totalling 21660 fits
Best paramters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'splitter': 'best'})
Train Result:
Accuracy Score: 99.96%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999657   0.942857    0.9996      0.971257      0.999584
recall         0.999943   0.733333    0.9996      0.866638      0.999600
f1-score       0.999800   0.825000    0.9996      0.912400      0.999575
support    34955.000000  45.000000    0.9996  35000.000000  35000.000000
_______________________________________________
Confusion Matrix: 
 [[34953     2]
 [   12    33]]

Test Result:
Accuracy Score: 99.93%
_______________________________________________
CLASSIFICATION REPORT:
                      0      1  accuracy     macro avg  weighted avg
precision      0.999333   1.00  0.999333      0.999666      0.9

In [12]:
rf_clf = RandomForestClassifier(n_estimators=100)

rf_clf.fit(X_train, y_train)

In [13]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999533   1.000000  0.999533      0.999766      0.999534
recall         1.000000   0.720000  0.999533      0.860000      0.999533
f1-score       0.999766   0.837209  0.999533      0.918488      0.999495
support    14975.000000  25.000000  0.999533  15000.000000  15000.000000
___________________

In [16]:
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt', None]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf, 
    'bootstrap': bootstrap
}

rf_clf = RandomForestClassifier(random_state=42)
rf_cv = RandomizedSearchCV(
    estimator=rf_clf, 
    scoring='f1',
    param_distributions=random_grid, 
    n_iter=200, 
    cv=5, 
    verbose=1, 
    random_state=42,
    n_jobs=-1
)

rf_cv.fit(X_train, y_train)
rf_best_params = rf_cv.best_params_
print(f"Best paramters: {rf_best_params})")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [15]:
rf_clf = RandomForestClassifier(**rf_best_params)
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 99.97%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999657   1.000000  0.999657      0.999828      0.999657
recall         1.000000   0.733333  0.999657      0.866667      0.999657
f1-score       0.999828   0.846154  0.999657      0.922991      0.999631
support    34955.000000  45.000000  0.999657  35000.000000  35000.000000
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [   12    33]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999466   1.000000  0.999467      0.999733      0.999467
recall         1.000000   0.680000  0.999467      0.840000      0.999467
f1-score       0.999733   0.809524  0.999467      0.904628      0.999416
support    14975.000000  25.