In [1]:
import os
import pandas as pd

from dotenv import load_dotenv
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns

%load_ext autoreload
%autoreload 2

In [2]:
load_dotenv(override=True)

True

In [3]:
DIR_DATA_PROCESSED: str = os.getenv("DIR_DATA_PROCESSED")

In [4]:
df = pd.read_csv(f"{DIR_DATA_PROCESSED}/data.csv")

In [5]:
X = df.drop(columns="isFraud")
y = df["isFraud"]

In [6]:
def plot_strip(X: pd.Series, y: pd.Series, hue: pd.Series, figsize: tuple = (14, 9)):
    fig = plt.figure(figsize=figsize)
    colours = plt.cm.tab10(np.linspace(0, 1, 9))
    with sns.axes_style("ticks"):
        ax = sns.stripplot(
            X, y, hue=hue, jitter=0.4, marker=".", size=4, palette=colours
        )
        ax.set_xlabel("")
        ax.set_xticklabels(["genuine", "fraudulent"], size=16)
        for axis in ["top", "bottom", "left", "right"]:
            ax.spines[axis].set_linewidth(2)

        handles, labels = ax.get_legend_handles_labels()
        plt.legend(
            handles,
            ["Transfer", "Cash out"],
            bbox_to_anchor=(1, 1),
            loc=2,
            borderaxespad=0,
            fontsize=16,
        )
    return ax

In [7]:
ax = plot_strip(y, X["step"], X["type"])
ax.set_ylabel("time [hour]", size=16)
ax.set_title(
    "Striped vs. homogenous fingerprints of genuine and fraudulent \
transactions over time",
    size=20,
);

NameError: name 'np' is not defined

<Figure size 1008x648 with 0 Axes>

In [None]:
ax = plot_strip(y, X["amount"], X["type"], figsize=(14, 9))
ax.set_ylabel("amount", size=16)
ax.set_title(
    "Same-signed fingerprints of genuine \
and fraudulent transactions over amount",
    size=18,
);

In [None]:
ax = plot_strip(y, -X["errorBalanceDest"], X["type"], figsize=(14, 9))
ax.set_ylabel("- errorBalanceDest", size=16)
ax.set_title(
    "Opposite polarity fingerprints over the error in \
destination account balances",
    size=18,
);

In [None]:
# Long computation in this cell (~2.5 minutes)
x_ = "errorBalanceDest"
y_ = "step"
z_ = "errorBalanceOrig"
zOffset = 0.02

sns.reset_orig()  # prevent seaborn from over-riding mplot3d defaults

fig = plt.figure(figsize=(10, 12))
ax = fig.add_subplot(111, projection="3d")

ax.scatter(
    X.loc[y == 0, x_],
    X.loc[y == 0, y_],
    -np.log10(X.loc[y == 0, z_] + zOffset),
    c="g",
    marker=".",
    s=1,
    label="genuine",
)

ax.scatter(
    X.loc[y == 1, x_],
    X.loc[y == 1, y_],
    -np.log10(X.loc[y == 1, z_] + zOffset),
    c="r",
    marker=".",
    s=1,
    label="fraudulent",
)

ax.set_xlabel(x_, size=16)
ax.set_ylabel(y_ + " [hour]", size=16)
ax.set_zlabel("- log$_{10}$ (" + z_ + ")", size=16)
ax.set_title(
    "Error-based features separate out genuine and fraudulent \
transactions",
    size=20,
)

plt.axis("tight")
ax.grid(1)

not_fraud_marker = mlines.Line2D(
    [], [], linewidth=0, color="g", marker=".", markersize=10, label="genuine"
)
fraud_marker = mlines.Line2D(
    [], [], linewidth=0, color="r", marker=".", markersize=10, label="fraudulent"
)

plt.legend(
    handles=[not_fraud_marker, fraud_marker],
    bbox_to_anchor=(1.20, 0.38),
    frameon=False,
    prop={"size": 16},
);

In [None]:
non_fraud_correlation = X.loc[y == 0, X.columns != "step"].corr()
mask = np.zeros_like(non_fraud_correlation)
indices = np.triu_indices_from(non_fraud_correlation)
mask[indices] = True

grid_kws = {"width_ratios": (0.9, 0.9, 0.05), "wspace": 0.2}
f, (ax1, ax2, cbar_ax) = plt.subplots(1, 3, gridspec_kw=grid_kws, figsize=(14, 9))

cmap = sns.diverging_palette(220, 8, as_cmap=True)
ax1 = sns.heatmap(
    non_fraud_correlation,
    ax=ax1,
    vmin=-1,
    vmax=1,
    cmap=cmap,
    square=False,
    linewidths=0.5,
    mask=mask,
    cbar=False,
)
ax1.set_xticklabels(ax1.get_xticklabels(), size=16)
ax1.set_yticklabels(ax1.get_yticklabels(), size=16)
ax1.set_title("Genuine \n transactions", size=20)

fraud_correlation = X.loc[y == 1, X.columns != "step"].corr()
ax2 = sns.heatmap(
    fraud_correlation,
    vmin=-1,
    vmax=1,
    cmap=cmap,
    ax=ax2,
    square=False,
    linewidths=0.5,
    mask=mask,
    yticklabels=False,
    cbar_ax=cbar_ax,
    cbar_kws={"orientation": "vertical", "ticks": [-1, -0.5, 0, 0.5, 1]},
)
ax2.set_xticklabels(ax2.get_xticklabels(), size=16)
ax2.set_title("Fraudulent \n transactions", size=20)

cbar_ax.set_yticklabels(cbar_ax.get_yticklabels(), size=14);