# Title

### 1. Introduction



### 2. Dataset

In [None]:
import os
import time
import pprint
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the raw CSV that sits next to this notebook into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='./dataset.csv')

### 3. Data preprocessing

In [None]:
# Preview 10 randomly sampled rows (no seed is set, so the sample differs per run).
dataset.sample(10)

In [None]:
# Print a concise summary of the DataFrame: columns, dtypes and non-null counts.
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
dataset.describe()

In [None]:
# dataset.dropna(inplace=True)

In [None]:
# d = dataset.convert_dtypes()

### 4. Exploratory Data Analysis (EDA)

In [None]:
# Count rows per class label, ordered by label, to feed the balance chart.
value_counts = (
    dataset['Class']
    .value_counts()
    .sort_index()
)
pie_labels, pie_data = value_counts.index, value_counts.values

In [None]:
# Donut chart of the class distribution (wedge width 0.5 leaves the centre hollow).
plt.figure(figsize=(5, 5))
plt.title('Dataset Balance {Positive: 1, Negative: 0}')
plt.pie(
    pie_data,
    labels=pie_labels,
    autopct='%0.0f%%',
    wedgeprops={'width': 0.5},
)
plt.show()

In [None]:
# Show the raw per-class row counts that the chart above is built from.
print(value_counts)

### Feature Engineering

### Data Modeling


#### train_test_split

In [None]:
# Target: the Class column as a plain Python list.
Y = dataset['Class'].tolist()
# Features: every column except Time, Amount and the target itself.
X = dataset.drop(columns=['Time', 'Amount', 'Class'])

In [None]:
# Hold out 30% of the rows for testing.
# - stratify=Y keeps the class proportions the same in the train and test
#   splits, which matters when one class is rare (requires at least two
#   samples of every class).
# - random_state pins the shuffle so metrics are reproducible across runs.
training_x, testing_x, training_y, testing_y = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42
)

#### initialize model

In [None]:
# Logistic regression with scikit-learn's default hyperparameters.
classifier = LogisticRegression()

In [None]:
def start_training(classifier, training_x, training_y):
    """Fit ``classifier`` on the training data and time the fit.

    The wall-clock start and end timestamps (seconds since the epoch) are
    printed for logging, but the returned duration is measured with
    ``time.perf_counter()``: unlike ``time.time()`` it is monotonic, so the
    result cannot be skewed or turn negative if the system clock is adjusted
    while training runs.

    Args:
        classifier: Any object exposing ``fit(X, y)``; it is fitted in place.
        training_x: Training features passed to ``fit``.
        training_y: Training labels passed to ``fit``.

    Returns:
        A ``(classifier, training_time)`` tuple: the same classifier object
        that was passed in, and the elapsed fit time in seconds.
    """
    print(f'Training is started at Epoch Time in Second: {time.time()}')

    # Time only the fit call itself, using the monotonic high-resolution clock.
    fit_started = time.perf_counter()
    classifier.fit(training_x, training_y)
    training_time = time.perf_counter() - fit_started

    print(f'Training is ended at Epoch Time in Second: {time.time()}')

    return classifier, training_time

#### model training

In [None]:
# Fit the classifier on the training split and report how long it took.
# NOTE: the `.3` format spec (no `f`) means 3 significant digits, not 3 decimals.
clf, t = start_training(
    classifier=classifier,
    training_x=training_x,
    training_y=training_y,
)
print(f'Training is elapsed for {t:.3} seconds')

Training is started at Epoch Time in Second: 1653418214.1240196
Training is ended at Epoch Time in Second: 1653418217.2400277
Training is elapsed for 3.12 seconds


#### model testing

In [None]:
# Predict class labels for the held-out test features with the fitted model.
predicted_y = clf.predict(testing_x)

In [None]:
# confusion matrix: rows index the true label, columns the predicted label
cm = confusion_matrix(testing_y, predicted_y)

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        cm: 2-D array of counts; rows are drawn on the "True label" axis and
            columns on the "Predicted label" axis.
        classes: Tick labels, one per class, in matrix order.
        normalize: When true, every row is divided by its row total so each
            cell shows the fraction of that true class; rows whose total is
            zero are left as zeros. Defaults to raw values.
        title: Figure title.
        cmap: Matplotlib colormap used for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide empty rows by 1 instead of 0 so the division stays defined.
        cm = cm / np.where(row_totals == 0, 1, row_totals)

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum are dark, so annotate them in white.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        plt.text(
            j, i, f'{value:.2f}' if normalize else value,
            horizontalalignment='center',
            color='white' if value > thresh else 'black',
        )

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Fit the layout only after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.show()

In [None]:
# Render the confusion matrix, using the two class ids as tick labels.
label = [0, 1]
plot_confusion_matrix(
    cm,
    classes=label,
    title='Logistic Regression Confusion Matrix',
)

### Evaluation Results

In [None]:
# Unpack the binary confusion matrix (row = true class, column = predicted class).
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

# If the model predicts no positives (or the test set contains none) the
# denominator is 0; report 0.0 in that case instead of dividing 0 by 0, which
# would yield NaN and a RuntimeWarning.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')

# F1 is the harmonic mean of precision and recall; it is 0 when both are 0.
f1 = (precision * recall) / (precision + recall) * 2 if (precision + recall) > 0 else 0.0
print(f'F1 score: {f1:.3f}')

In [None]:
# Print scikit-learn's text report of the main classification metrics per class.
print(classification_report(testing_y, predicted_y))

In [None]:
# ROC curve
# Score every test row with the predicted probability of class 1 (column 1).
# `clf` is the fitted estimator returned by start_training, the same object
# the prediction cell uses, so all evaluation cells read from one model handle.
y_pred_proba = clf.predict_proba(testing_x)[:, 1]
fpr, tpr, _ = roc_curve(testing_y, y_pred_proba)

# create ROC curve
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Area under the ROC curve, computed from the same class-1 probabilities.
auc = roc_auc_score(y_true=testing_y, y_score=y_pred_proba)
print(f'AUC is {auc:.3f}')