In [None]:
import joblib
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

In [2]:
base_path = "../02_data"
class_col = "class"
class_encoding = {"licit": 0, "illicit": 1}


def _load_labelled_split(filename):
    """Read one transactions CSV and return ``(features, encoded_labels)``.

    The file is indexed by ``txId``. Rows whose class is ``"unknown"`` are
    discarded, the ``timeStep`` column is dropped, and the class column is
    removed from the features and mapped through ``class_encoding``.
    """
    frame = pd.read_csv(os.path.join(base_path, filename), index_col="txId")
    labelled = frame[frame[class_col] != "unknown"].drop(columns=["timeStep"])
    target = labelled.pop(class_col).map(class_encoding)
    return labelled, target


X_train, y_train = _load_labelled_split("transactions_train.csv")
X_test, y_test = _load_labelled_split("transactions_test.csv")

# Standardize the features because some of the models are sensitive to feature
# variance. The scaler is fitted on the training split only and then reused,
# unchanged, for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [3]:
output_path = "../04_training"

# Create the output directory (including any missing parents). exist_ok=True
# makes the cell safe to re-run and replaces the separate exists()/mkdir()
# check-then-create sequence with a single call.
os.makedirs(output_path, exist_ok=True)


def tune_and_train_model(clf, params):
    """Return a tuned version of ``clf``, reusing a cached model when one exists.

    The cache key is the estimator's class name: if
    ``<output_path>/<ClassName>.joblib`` is present it is loaded and returned
    without any training. Otherwise a randomized hyper-parameter search
    (20 candidates, 5-fold CV, F1 scoring, all cores, seed 42) is run on the
    module-level ``X_train`` / ``y_train``; the best parameters are written
    as JSON to ``<ClassName>_params.txt`` and the refitted best estimator is
    stored with joblib.

    Args:
        clf: Unfitted estimator to tune.
        params: Mapping of parameter name to candidate values, passed to
            ``RandomizedSearchCV`` as ``param_distributions``.

    Returns:
        The cached estimator, or the best estimator found by the search.
    """
    name = clf.__class__.__name__
    print(f"Training {name}")

    clf_path = os.path.join(output_path, f"{name}.joblib")

    if not os.path.exists(clf_path):
        # No cached model yet: run the (fairly expensive) search.
        search = RandomizedSearchCV(
            estimator=clf,
            param_distributions=params,
            n_iter=20,
            scoring="f1",
            cv=5,
            n_jobs=-1,
            refit=True,
            verbose=True,
            random_state=42,
        )
        search.fit(X_train, y_train)

        # Persist the winning parameters first, then the fitted model itself.
        params_path = os.path.join(output_path, f"{name}_params.txt")
        with open(params_path, "w") as params_file:
            json.dump(search.best_params_, params_file, indent=2)
        joblib.dump(search.best_estimator_, clf_path)

        return search.best_estimator_

    # Cache hit. joblib.load unpickles the file, so the cache directory must
    # only ever contain files written by this notebook.
    return joblib.load(clf_path)


def evaluate_model(clf):
    """Score a fitted classifier on the test split and write a metrics report.

    Predictions are made on the module-level ``X_test`` and compared with
    ``y_test``. Accuracy, precision, recall, F1 and ROC AUC (each formatted to
    four decimals) followed by the full classification report are written to
    ``<output_path>/<ClassName>_metrics.txt``.

    Args:
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns:
        Tuple ``(clf, y_pred)``: the classifier as passed in and its
        predicted labels for ``X_test``.
    """
    name = clf.__class__.__name__

    print(f"Evaluating {name}")
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is the score passed to ROC AUC.
    y_prob = clf.predict_proba(X_test)[:, 1]

    metrics_path = os.path.join(output_path, f"{name}_metrics.txt")
    with open(metrics_path, "w") as report:
        # (label, scoring function, scores compared against y_test)
        scalar_metrics = (
            ("Accuracy", accuracy_score, y_pred),
            ("Precision", precision_score, y_pred),
            ("Recall", recall_score, y_pred),
            ("F1 Score", f1_score, y_pred),
            ("ROC AUC", roc_auc_score, y_prob),
        )
        for label, scorer, scores in scalar_metrics:
            report.write(f"{label}: {scorer(y_test, scores):.4f}\n")
        report.write(
            f"Classification Report:\n {classification_report(y_test, y_pred)}\n"
        )

    return clf, y_pred

In [4]:
# Random Forest: tune (or load the cached model), then score on the test split.
rf_search_space = {
    "n_estimators": [20, 50, 100, 250],
    "max_depth": [10, 25, 40, None],
    "min_samples_split": [2, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
}
rf_tuned = tune_and_train_model(
    RandomForestClassifier(random_state=42), rf_search_space
)
rf, rf_pred = evaluate_model(rf_tuned)

Training RandomForestClassifier
Evaluating RandomForestClassifier


In [5]:
# Logistic Regression
# The solver is set to liblinear because the default lbfgs solver does not
# support penalty="l1"; with the default solver every sampled "l1" candidate
# fails to fit and is scored as NaN. max_iter must be an integer, so None is
# not offered as a candidate. The space now has 5 * 2 * 2 = 20 valid
# combinations, which the 20-iteration search covers completely.
# NOTE: tune_and_train_model returns the cached LogisticRegression.joblib from
# the output directory when it exists; delete that file to re-run the search.
lr, lr_pred = evaluate_model(
    tune_and_train_model(
        LogisticRegression(solver="liblinear", random_state=42),
        {
            "C": [0.001, 0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"],
            "max_iter": [100, 1000],
        },
    )
)

Training LogisticRegression
Evaluating LogisticRegression


In [6]:
# Directory from which the test-set embedding CSVs (pca_test_embedding.csv and
# umap_test_embedding.csv) are read at the end of this cell.
embedding_path = "../03_images"


def plot_embedded_prediction(name, df: pd.DataFrame, preds):
    """Save a figure showing where each classifier is right or wrong in 2-D.

    One subplot is drawn per entry of ``preds`` (stacked vertically, sharing
    the x axis). Points are coloured silver when the prediction matches the
    true class, blue for false positives (true class 0, predicted otherwise)
    and red for false negatives (true class 1, predicted otherwise). The
    figure is written to ``<output_path>/<name lower-cased>_projection_2d.png``
    and then closed.

    Args:
        name: Projection name used in titles, axis labels and the file name.
        df: Frame with ``axis1``, ``axis2`` and ``class`` columns, where
            ``class`` holds ``"licit"`` / ``"illicit"``. Any other value maps
            to NaN and is drawn in none of the three groups. Rows are assumed
            to be in the same order as the predictions -- TODO confirm
            against how the embedding CSVs are produced.
        preds: Sequence of ``(classifier display name, predicted labels)``
            pairs; each label array must have one entry per row of ``df``.
    """
    axis1 = df["axis1"]
    axis2 = df["axis2"]
    classes = df["class"].map({"licit": 0, "illicit": 1})

    # squeeze=False always returns a 2-D array of Axes, so a single-entry
    # ``preds`` works the same way as several entries.
    fig, axes = plt.subplots(
        len(preds), 1, figsize=(8, 5 * len(preds)), sharex=True, squeeze=False
    )

    for ax, (clf_name, pred) in zip(axes[:, 0], preds):
        correct = classes == pred
        wrong = classes != pred
        # Each comparison is parenthesised before combining: `&` binds tighter
        # than `==` / `!=`, so an unparenthesised `a != b & c` would be read
        # as `a != (b & c)`.
        groups = (
            (correct, "silver", "Correct"),
            (wrong & (classes == 0), "blue", "False Positive"),
            (wrong & (classes == 1), "red", "False Negative"),
        )
        for mask, colour, label in groups:
            ax.scatter(axis1[mask], axis2[mask], c=colour, s=1, label=label)

        ax.set_title(f"{clf_name} Predictions: {name} Projection")
        ax.set_xlabel(f"{name} Axis 1")
        ax.set_ylabel(f"{name} Axis 2")
        ax.legend()
        ax.grid(True)

    # Address the figure explicitly instead of relying on pyplot's notion of
    # the "current" figure.
    fig.tight_layout()
    fig.savefig(os.path.join(output_path, f"{name.lower()}_projection_2d.png"))
    plt.close(fig)


# Draw the same right/wrong overlay for each stored 2-D embedding of the test set.
preds = [("Random Forest", rf_pred), ("Logistic Regression", lr_pred)]
projection_files = (
    ("PCA", "pca_test_embedding.csv"),
    ("UMAP", "umap_test_embedding.csv"),
)
for projection_name, embedding_file in projection_files:
    embedding = pd.read_csv(os.path.join(embedding_path, embedding_file))
    plot_embedded_prediction(projection_name, embedding, preds)