## Modeling

After the wrangling and preprocessing we can now begin our modeling. I will be utilizing Random Forest Classifier due to its robustness and accuracy that can handle classification problems. We will first import all whole libraries, packages and processed data

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import class_weight
from sklearn.metrics import make_scorer, f1_score

df = pd.read_csv("Wrangled_Payments.csv")



In [3]:
y = df["isFraud"]

X = df.drop("isFraud", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [21]:

clf = RandomForestClassifier(n_estimators=100)

clf.fit(X_train, y_train)

In [22]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [23]:
print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0          1.0  accuracy     macro avg  weighted avg
precision  9.999995e-01     1.000000  0.999999  9.999997e-01  9.999995e-01
recall     1.000000e+00     0.999287  0.999999  9.996435e-01  9.999995e-01
f1-score   9.999997e-01     0.999643  0.999999  9.998215e-01  9.999995e-01
support    3.682617e+06  2805.000000  0.999999  3.685422e+06  3.685422e+06
_______________________________________________
Confusion Matrix: 
 [[3682617       0]
 [      2    2803]]

Test Result:
Accuracy Score: 99.98%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0          1.0  accuracy     macro avg  weighted avg
precision  9.997988e-01     0.984142   0.99979  9.919703e-01  9.997866e-01
recall     9.999906e-01     0.742958   0.99979  8.714742e-01  9.997896e-01
f1-score   9.998947e-01     0.846709   0.99979  9.233021e-01  9.997749e-01
s

In [4]:

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf, 
    'bootstrap': bootstrap
}

rf_clf = RandomForestClassifier(random_state=42)
# TODO: Implement `RandomizedSearchCV` object
rf_cv = RandomizedSearchCV(
    estimator=rf_clf, 
    scoring='f1',
    param_distributions=random_grid, 
    n_iter=200, 
    cv=5, 
    verbose=1, 
    random_state=42,
    n_jobs=-1
)

rf_cv.fit(X_train, y_train)
rf_best_params = rf_cv.best_params_
print(f"Best paramters: {rf_best_params})")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
