In [None]:
!curl -L -o creditcardfraud.zip https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the current directory.
with zipfile.ZipFile("creditcardfraud.zip", mode="r") as archive:
    archive.extractall(path=".")

In [None]:
import numpy as np
import pandas as pd
# import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [None]:
# Load ./creditcard.csv into a DataFrame and preview its first rows.
df = pd.read_csv('./creditcard.csv')
df.head()

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = df.isna().any()
df.columns[has_missing]

In [None]:
# Plot how many rows carry each Class value, then show the exact counts.
sns.countplot(x='Class', data=df)
df['Class'].value_counts()

In [None]:

# RobustScaler is less sensitive to outliers.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# Rescale each column on its own. The scaler is re-fitted for every column,
# so after the loop it holds the parameters learned from 'Time'.
for column in ('Amount', 'Time'):
    df[column] = rob_scaler.fit_transform(df[column].to_numpy().reshape(-1, 1))

In [None]:
# Preview the frame again to inspect the current 'Amount' and 'Time' values.
df.head()

In [None]:
# Shuffle first so the non-fraud rows kept below are a random subset.
df = df.sample(frac=1, random_state=42)

# Undersample: keep every fraud row and an equal number of non-fraud rows.
# The count is taken from the data rather than hard-coded.
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0].iloc[:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows so the two classes are interleaved.
df = normal_distributed_df.sample(frac=1, random_state=42)

df.head()

In [None]:
# Class counts of the current (undersampled) frame, as a plot and as numbers.
sns.countplot(x='Class', data=df)
df['Class'].value_counts()

In [None]:
# Heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(), ax=ax)

In [None]:
# Correlation of every column with 'Class', largest first, drawn as a bar chart.
class_corr = df.corr()['Class'].sort_values(ascending=False)
class_corr.plot.bar(figsize=(12, 8))

In [None]:
# One box plot per feature, split by class, drawn side by side.
# A loop replaces the five copy-pasted calls; adding a feature only needs a list entry.
boxplot_features = ['V14', 'V12', 'V10', 'V4', 'V11']
f, axes = plt.subplots(ncols=len(boxplot_features), figsize=(30, 6))
for axis, feature in zip(axes, boxplot_features):
    sns.boxplot(x='Class', y=feature, data=df, ax=axis)

In [None]:
# Scatter of each feature against the class label, one panel per feature.
# A loop replaces the duplicated calls.
scatter_features = ['V10', 'V4']
f, axes = plt.subplots(ncols=len(scatter_features), figsize=(15, 6))
for axis, feature in zip(axes, scatter_features):
    sns.scatterplot(x='Class', y=feature, data=df, ax=axis)

In [None]:
# Non-fraud rows with extreme values: V10 above 5 or V4 above 6.
is_non_fraud = df['Class'] == 0
is_extreme = (df['V10'] > 5) | (df['V4'] > 6)
ind_drop = df[is_non_fraud & is_extreme]
ind_drop

In [None]:
# Remove the rows selected in ind_drop from the working frame.
df = df.drop(index=ind_drop.index)

In [None]:
# Separate the target column from the feature columns.
y = df['Class']
X = df.drop(columns='Class')

In [None]:
# Sanity check: shapes of the feature matrix and of the target vector.
X.shape,y.shape

In [None]:
# Display the feature matrix.
X

In [None]:
# Project the features to 2-D with t-SNE.
X_reduced_tsne = TSNE(n_components=2, random_state=42).fit_transform(X)

# Draw each class as its own series. Selecting the rows per class (instead of
# plotting every point twice with a boolean colour) gives each class a single
# colour and a legend entry that matches the points it labels.
no_fraud_mask = (y == 0).to_numpy()
fraud_mask = (y == 1).to_numpy()

plt.grid(True)
plt.scatter(X_reduced_tsne[no_fraud_mask, 0], X_reduced_tsne[no_fraud_mask, 1], s=10, label='No Fraud')
plt.scatter(X_reduced_tsne[fraud_mask, 0], X_reduced_tsne[fraud_mask, 1], s=10, label='Fraud')
plt.title('t-SNE')
plt.legend()
plt.show()


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train/Test logistic regression model

In [None]:
# 'saga' handles both the l1 and l2 penalties searched below. The solver
# shuffles the data, so the seed is fixed to make the fit reproducible.
log_model = LogisticRegression(solver='saga', max_iter=5000, random_state=42)
# Penalty Type
penalty = ['l1', 'l2']

# Use logarithmically spaced C values (recommended in official docs)
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
grid_model = GridSearchCV(log_model,param_grid={'C':C,'penalty':penalty})
grid_model.fit(X_train,y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
grid_model.best_params_

In [None]:
# Predict the test-set labels and report the accuracy against the true labels.
y_pred = grid_model.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
# Confusion matrix of the test-set predictions (raw counts).
ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred)).plot()

In [None]:
# Confusion matrix computed from the fitted search object, normalized over the true labels.
ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test, normalize='true')

In [None]:
# Text report of the classification metrics for the test-set predictions.
print(classification_report(y_test,y_pred))

In [None]:
# Predicted probability of class 1 (second predict_proba column) for each test row.
# It is kept in its own variable so that y_pred keeps holding the hard class
# predictions used by the confusion-matrix and report cells.
y_prob = grid_model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.grid()
plt.show()

## Train/Test SVM model

In [None]:
def cal_metrics(model, X_test, y_test):
    """Print and plot evaluation metrics for a fitted search object.

    Prints ``best_params_`` and ``best_score_``, then the accuracy and the
    classification report on the given test data. Draws two confusion
    matrices (raw counts, and normalized over the true labels) and a ROC
    curve built from the predicted probability of class 1.

    Parameters
    ----------
    model
        Fitted object exposing ``best_params_``, ``best_score_``, ``predict``
        and ``predict_proba`` (the notebook passes fitted ``GridSearchCV``
        instances).
    X_test
        Test features passed to ``predict`` and ``predict_proba``.
    y_test
        True labels matching ``X_test``.

    Returns
    -------
    None
        All results are printed or drawn.
    """
    print("best_params_ = {}".format(model.best_params_))
    print("best_score_ = {}".format(model.best_score_))

    y_pred = model.predict(X_test)
    print("accuracy_score = {}".format(accuracy_score(y_test,y_pred)))
    print(classification_report(y_test,y_pred))
    ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred)).plot()
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='true')

    # calculate ROC from the class-1 probabilities; y_pred is left untouched so
    # it always means "hard class predictions" inside this function.
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()

In [None]:
from sklearn.svm import SVC

In [None]:
# probability=True enables predict_proba, which cal_metrics uses for the ROC curve.
# The probability estimates rely on shuffling the data, so the seed is fixed to
# keep the results reproducible.
svc_model = SVC(probability=True, random_state=42)

param_grid = {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_model = GridSearchCV(svc_model,param_grid)
grid_model.fit(X_train,y_train)

In [None]:
# Report metrics for the search object fitted in the previous cell on the test split.
cal_metrics(grid_model, X_test, y_test)

## Train/Test decision tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier,plot_tree

In [None]:
# Search over the tree size (2 to 20 leaf nodes) and the split criterion.
param_grid = {
    'max_leaf_nodes': range(2, 21),
    'criterion': ['gini', 'entropy', 'log_loss'],
}

decision_tree_model = DecisionTreeClassifier(random_state=42)
grid_model = GridSearchCV(estimator=decision_tree_model, param_grid=param_grid)
grid_model.fit(X=X_train, y=y_train)

In [None]:
# Report metrics for the search object fitted in the previous cell on the test split.
cal_metrics(grid_model, X_test, y_test)

In [None]:
# Draw the best estimator found by the grid search.
plt.figure(figsize=(12,8), dpi=200)
# Feature names are passed as a plain list: a list is accepted by every
# scikit-learn version, while some releases reject a pandas Index here.
plot_tree(grid_model.best_estimator_, filled=True, feature_names=list(X.columns));