In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
file_path = pd.read_csv('new_cleaned_dataset.csv')

In [5]:
categorical_col = []
for column in file_path.columns:
    if file_path[column].dtype == object and len(file_path[column].unique()) <= 50:
        categorical_col.append(column)

In [6]:
label = LabelEncoder()
for column in categorical_col:
    file_path[column] = label.fit_transform(file_path[column])

In [7]:
file_path.head(10)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,0,330218.42,20866.0,351084.42,452419.57,122201.15,0
1,3,11647.08,30370.0,18722.92,0.0,0.0,0
2,0,152264.21,106589.0,258853.21,201303.01,49038.8,0
3,4,1551760.63,0.0,0.0,3198359.45,4750120.08,0
4,0,78172.3,2921331.58,2999503.88,415821.9,337649.6,0
5,3,915.13,0.0,0.0,0.0,0.0,0
6,1,20603.87,0.0,0.0,558068.66,578672.53,0
7,1,58605.72,0.0,0.0,585494.94,644100.66,0
8,3,4865.11,0.0,0.0,0.0,0.0,0
9,1,118131.63,0.0,0.0,8131691.35,8476246.86,0


In [8]:
X = file_path.drop('isFraud', axis=1) #'isFraud'is the target variable that I am trying to predict. 
y = file_path['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [11]:
tree_clf = DecisionTreeClassifier(random_state=42)

tree_clf.fit(X_train, y_train)

In [12]:
X_train_encoded = pd.get_dummies(X_train)

tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_encoded, y_train)

In [13]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.93%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999466   0.850000  0.999267      0.924733      0.999217
recall         0.999800   0.680000  0.999267      0.839900      0.999267
f1-score       0.999633   0.755556  0.999267      0.877594      0.999226
support    14975.000000  25.000000  0.999267  15000.000000  15000.000000
___________________

In [14]:
params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}

tree_clf = DecisionTreeClassifier(random_state=42)

tree_cv = GridSearchCV(
    tree_clf, 
    params, 
    scoring="f1", 
    n_jobs=-1, 
    verbose=1, 
    cv=5
)

tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(X_train, y_train)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Fitting 5 folds for each of 4332 candidates, totalling 21660 fits
Best paramters: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'})
Train Result:
Accuracy Score: 99.97%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999743   0.972973  0.999714      0.986358      0.999708
recall         0.999971   0.800000  0.999714      0.899986      0.999714
f1-score       0.999857   0.878049  0.999714      0.938953      0.999700
support    34955.000000  45.000000  0.999714  35000.000000  35000.000000
_______________________________________________
Confusion Matrix: 
 [[34954     1]
 [    9    36]]

Test Result:
Accuracy Score: 99.91%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999399   0.800000  0.999133      0.8

In [15]:
rf_clf = RandomForestClassifier(n_estimators=100)

rf_clf.fit(X_train, y_train)

In [16]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0     1  accuracy  macro avg  weighted avg
precision      1.0   1.0       1.0        1.0           1.0
recall         1.0   1.0       1.0        1.0           1.0
f1-score       1.0   1.0       1.0        1.0           1.0
support    34955.0  45.0       1.0    35000.0       35000.0
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [    0    45]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999533   1.000000  0.999533      0.999766      0.999534
recall         1.000000   0.720000  0.999533      0.860000      0.999533
f1-score       0.999766   0.837209  0.999533      0.918488      0.999495
support    14975.000000  25.000000  0.999533  15000.000000  15000.000000
___________________

In [17]:
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf, 
    'bootstrap': bootstrap
}

rf_clf = RandomForestClassifier(random_state=42)
rf_cv = RandomizedSearchCV(
    estimator=rf_clf, 
    scoring='f1',
    param_distributions=random_grid, 
    n_iter=200, 
    cv=5, 
    verbose=1, 
    random_state=42,
    n_jobs=-1
)

rf_cv.fit(X_train, y_train)
rf_best_params = rf_cv.best_params_
print(f"Best paramters: {rf_best_params})")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


460 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
259 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\xiaoj\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\xiaoj\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\xiaoj\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\xiaoj\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

Best paramters: {'n_estimators': 1200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False})


In [18]:
rf_clf = RandomForestClassifier(**rf_best_params)
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 99.97%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999714   1.000000  0.999714      0.999857      0.999714
recall         1.000000   0.777778  0.999714      0.888889      0.999714
f1-score       0.999857   0.875000  0.999714      0.937428      0.999696
support    34955.000000  45.000000  0.999714  35000.000000  35000.000000
_______________________________________________
Confusion Matrix: 
 [[34955     0]
 [   10    35]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999466   1.000000  0.999467      0.999733      0.999467
recall         1.000000   0.680000  0.999467      0.840000      0.999467
f1-score       0.999733   0.809524  0.999467      0.904628      0.999416
support    14975.000000  25.