# Libraries

In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Mahine Learning
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Dataset

In [18]:
data = pd.read_csv('creditcard.csv')

# Analysis

In [35]:
n_transactions = data['Class'].count()
n_frauds = dados['Class'].sum()
n_ok_trans = n_transactions - n_frauds

print(f'Total number of transactions: {n_transactions}')
print(f'Frauds: {n_frauds} or {(n_frauds / n_transactions) * 100:.2f}%')
print(f'Normal transactions: {n_ok_trans} or {(n_ok_trans / n_transactions) * 100:.2f}%')

Total number of transactions: 284807
Frauds: 492 or 0.17%
Normal transactions: 284315 or 99.83%


## Def's

In [90]:
def execute_validator(X,y):
    validator = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    for train_id, test_id in validator.split(X, y):
        X_train, X_test = X[train_id], X[test_id]
        y_train, y_test = y[train_id], y[test_id]
    return X_train, X_test, y_train, y_test

In [91]:
%%time
def execute_tree_classifier(classifier, X_train, X_test, y_train):
    tree_classifier = classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    return y_pred

Wall time: 0 ns


In [92]:
def save_tree(classifier, name):
    plt.figure(figsize=(200,100))
    tree.plot_tree(classifier, filled=True, fontsize=14)
    plt.savefig(name)
    plt.close()

In [93]:
def validate_tree(y_test, y_pred):
    print(f'Accuracy Score \n{accuracy_score(y_test, y_pred)}\n')
    print(f'Confusion Matrix \n{confusion_matrix(y_test, y_pred)}\n')
    print(f'Precision Score \n{precision_score(y_test, y_pred)}\n')
    print(f'Recall Score \n{recall_score(y_test, y_pred)}\n')

## Model

In [94]:
# executing the split

X = dados.drop('Class', axis=1).values
y = dados['Class'].values
X_train, X_test, y_train, y_test = execute_validator(X, y)

In [95]:
# executing the tree classifier

tree_decision_classifier = tree.DecisionTreeClassifier()
y_pred_decision_tree = execute_tree_classifier(tree_decision_classifier, X_train, X_test, y_train)

In [114]:
# plotting decision tree

save_tree(tree_decision_classifier, 'decision_tree_1.png')

In [97]:
# validation

validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.9990871107053826

Confusion Matrix 
[[28420    14]
 [   12    35]]

Precision Score 
0.7142857142857143

Recall Score 
0.7446808510638298



## Parameters

In [102]:
print(tree_decision_classifier.get_depth())

21


In [105]:
# executing the tree classifier
## min_samples_leaf

tree_decision_classifier = tree.DecisionTreeClassifier(max_depth=10, random_state=0, min_samples_leaf=10)
y_pred_decision_tree = execute_tree_classifier(tree_decision_classifier, X_train, X_test, y_train)
validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.9993679997191109

Confusion Matrix 
[[28426    12]
 [    6    37]]

Precision Score 
0.7551020408163265

Recall Score 
0.8604651162790697



In [106]:
# executing the tree classifier
## max_depth

tree_decision_classifier = tree.DecisionTreeClassifier(max_depth=10, random_state=0)
y_pred_decision_tree = execute_tree_classifier(tree_decision_classifier, X_train, X_test, y_train)
validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.9994733330992591

Confusion Matrix 
[[28430    13]
 [    2    36]]

Precision Score 
0.7346938775510204

Recall Score 
0.9473684210526315



In [107]:
# executing the tree classifier
## max_depth

tree_decision_classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
y_pred_decision_tree = execute_tree_classifier(tree_decision_classifier, X_train, X_test, y_train)
validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.999403110845827

Confusion Matrix 
[[28429    14]
 [    3    35]]

Precision Score 
0.7142857142857143

Recall Score 
0.9210526315789473



## Ensemble - Random Forest

In [110]:
%%time
random_forest_classifier = RandomForestClassifier(n_estimators=50, random_state=0, max_depth=10)
y_pred_decision_tree = execute_tree_classifier(random_forest_classifier, X_train, X_test, y_train)
validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.9995435553526912

Confusion Matrix 
[[28431    12]
 [    1    37]]

Precision Score 
0.7551020408163265

Recall Score 
0.9736842105263158



In [112]:
%%time
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=10)
y_pred_decision_tree = execute_tree_classifier(random_forest_classifier, X_train, X_test, y_train)
validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.9995084442259752

Confusion Matrix 
[[28430    12]
 [    2    37]]

Precision Score 
0.7551020408163265

Recall Score 
0.9487179487179487

Wall time: 2min 17s


In [116]:
save_tree(random_forest_classifier.estimators_[0], 'random_forest_1.png')
save_tree(random_forest_classifier.estimators_[1], 'random_forest_2.png')

In [119]:
%%time
random_forest_classifier = RandomForestClassifier(n_estimators=20, random_state=0, max_depth=5)
y_pred_decision_tree = execute_tree_classifier(random_forest_classifier, X_train, X_test, y_train)
validate_tree(y_pred_decision_tree, y_test)

Accuracy Score 
0.9994733330992591

Confusion Matrix 
[[28430    13]
 [    2    36]]

Precision Score 
0.7346938775510204

Recall Score 
0.9473684210526315

Wall time: 15.5 s


## adaboost

In [121]:
%%time
adaboost = AdaBoostClassifier(random_state=0)
y_pred_adaboost = execute_tree_classifier(adaboost, X_train, X_test, y_train)
validate_tree(y_pred_adaboost, y_test)

Accuracy Score 
0.9992626663389628

Confusion Matrix 
[[28428    17]
 [    4    32]]

Precision Score 
0.6530612244897959

Recall Score 
0.8888888888888888



In [122]:
save_tree(adaboost.estimators_[0], 'adaboost_1.png')

In [123]:
%%time
adaboost = AdaBoostClassifier(random_state=0, n_estimators=100)
y_pred_adaboost = execute_tree_classifier(adaboost, X_train, X_test, y_train)
validate_tree(y_pred_adaboost, y_test)

Accuracy Score 
0.999403110845827

Confusion Matrix 
[[28426    11]
 [    6    38]]

Precision Score 
0.7755102040816326

Recall Score 
0.8636363636363636

Wall time: 2min 17s


In [124]:
%%time
adaboost = AdaBoostClassifier(random_state=0, n_estimators=200)
y_pred_adaboost = execute_tree_classifier(adaboost, X_train, X_test, y_train)
validate_tree(y_pred_adaboost, y_test)

Accuracy Score 
0.9995435553526912

Confusion Matrix 
[[28429    10]
 [    3    39]]

Precision Score 
0.7959183673469388

Recall Score 
0.9285714285714286

Wall time: 4min 25s
