In [1]:
# Imports
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project folders, resolved relative to the notebook's working directory.
data_path = os.path.abspath("../02_data")
images_path = os.path.abspath("../03_images")

# The data folder is an input to this notebook; stop immediately if it is
# missing (or is not a directory) instead of failing later on a CSV read.
if not os.path.isdir(data_path):
    raise FileNotFoundError("Run preprocessing first")

# Create the output folder for figures. `exist_ok=True` makes the cell safe to
# re-run and removes the check-then-create race of exists() + mkdir().
os.makedirs(images_path, exist_ok=True)

In [3]:
# Read the full transaction table, indexed by the "txId" column.
df = pd.read_csv(
    os.path.join(data_path, "transactions.csv"),
    index_col="txId",
)

# Keep the time step and the (categorical) class label as separate Series,
# then strip both columns from the frame so only the other columns remain.
time_step = df["timeStep"]
classes = df["class"].astype("category")
df = df.drop(columns=["class", "timeStep"])

In [4]:
# Class label -> plot colour. The key order also fixes the stacking order of
# the bars and the order of the legend entries.
color_map = {"unknown": "silver", "licit": "green", "illicit": "red"}

# Stacked bar chart (same from paper): rows are time steps, columns are
# classes, values are the number of transactions in each (time step, class).
class_counts = pd.crosstab(time_step, classes)
class_counts = class_counts[list(color_map)]

fig, ax = plt.subplots(figsize=(12, 5))
class_counts.plot(
    kind="bar",
    stacked=True,
    width=0.7,
    color=list(color_map.values()),
    ax=ax,
)

ax.set_title("Class Counts per Time Step")
ax.set_xlabel("Time Step")
ax.set_ylabel("Count")
ax.legend(title="Class")

# Write the figure to disk and close it so it is not also rendered inline.
fig.tight_layout()
fig.savefig(os.path.join(images_path, "class_counts_per_time_step.png"))
plt.close(fig)

# Release the full table and its labels. Note that re-running this cell
# afterwards requires re-running the loading cell first.
del df, classes

In [5]:
# Only labelled transactions are embedded; every other class is dropped below.
target_classes = ["licit", "illicit"]

# UMAP is stochastic. A fixed seed makes the saved figure identical on every
# Restart & Run All.
RANDOM_STATE = 42

train_df = pd.read_csv(os.path.join(data_path, "transactions_train.csv"), index_col="txId")
test_df = pd.read_csv(os.path.join(data_path, "transactions_test.csv"), index_col="txId")
# Filter for only the target classes
train_df = train_df[train_df["class"].isin(target_classes)]
test_df = test_df[test_df["class"].isin(target_classes)]

# Separate the labels from the features; "timeStep" is not used as a feature.
train_classes = train_df["class"].astype("category")
test_classes = test_df["class"].astype("category")
train_df = train_df.drop(["class", "timeStep"], axis=1)
test_df = test_df.drop(["class", "timeStep"], axis=1)


# We standardize since some of the models are highly sensitive to variance.
# The scaler is fitted on the train split only and then applied to both
# splits, so no test-set statistics leak into the fit.
# NOTE: after this point train_df / test_df are NumPy arrays, not DataFrames.
scaler = StandardScaler().fit(train_df)
train_df = scaler.transform(train_df)
test_df = scaler.transform(test_df)

models = {
    # All components are kept; only the first two are drawn in the scatter plot.
    "PCA": PCA(),
    "UMAP": UMAP(n_components=2, random_state=RANDOM_STATE),
}

# Encode the class labels as plain integers (position in target_classes) for
# the supervised fit. Mapping via str avoids getting a categorical back, and
# astype(int) fails loudly if any label was left unmapped (NaN).
label_codes = {c: i for i, c in enumerate(target_classes)}
train_classes_encoded = train_classes.astype(str).map(label_codes).astype(int)

for name, model in models.items():
    print(name)
    print("\tFitting train set")
    # Both models receive the encoded labels through the same fit(X, y) call.
    model.fit(train_df, train_classes_encoded)
    print("\tTransforming test set")
    transformed_df = model.transform(test_df)
    transformed_classes = test_classes

    print("\tPlotting embedding")
    plt.figure(figsize=(8, 5))
    for c in target_classes:
        # Positional boolean mask: transformed_df is a NumPy array, so index it
        # with a plain array rather than an index-carrying pandas Series.
        mask = (transformed_classes == c).to_numpy()
        plt.scatter(
            transformed_df[mask, 0],
            transformed_df[mask, 1],
            color=color_map[c],
            s=0.5,
            label=c,
        )

    plt.title(f"{name} Projection")
    plt.xlabel(f"{name} Axis 1")
    plt.ylabel(f"{name} Axis 2")
    plt.grid(True)
    # The points are tiny (s=0.5); enlarge the legend handles so they are visible.
    plt.legend(title="Class", markerscale=10)

    plt.tight_layout()
    plt.savefig(os.path.join(images_path, f"{name.lower()}_projection_2d.png"))
    plt.close()

PCA
	Fitting train set
	Transforming test set
	Plotting embedding
UMAP
	Fitting train set




	Transforming test set
	Plotting embedding


In [6]:
# Running total of the explained-variance ratio: entry k-1 is the share of
# variance captured by the first k principal components of the fitted PCA.
cumulative_var = models["PCA"].explained_variance_ratio_.cumsum()
component_counts = np.arange(1, len(cumulative_var) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_var, marker=".", color="blue")

marker_color = "red"
for prop in (0.5, 0.8, 0.9, 0.95, 0.99):
    # Smallest number of components whose cumulative explained variance
    # reaches `prop`: argmax returns the first True position (0-based).
    n_needed = np.argmax(cumulative_var >= prop) + 1
    # Vertical marker up to the threshold, labelled with the percentage above
    # it and the component count below the axis.
    plt.vlines(x=n_needed, ymin=0, ymax=prop, color=marker_color)
    plt.text(n_needed, prop + 0.02, f"{prop * 100:.0f}%", ha="right", color=marker_color)
    plt.text(n_needed, -0.01, f"n={n_needed}", ha="center", va="top", color=marker_color)

plt.title("Number of Principal Components vs Cumulative Explained Variance")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid(True)

# Save to the images folder and close the figure so it is not rendered inline.
plt.tight_layout()
plt.savefig(os.path.join(images_path, "pca_cumulative_explained_variance.png"))
plt.close()