In [7]:
import numpy as np
import torch
import graphlearning as gl

# Load the MNIST images and labels through graphlearning's dataset loader.
mnist_digits, mnist_labels = gl.datasets.load("mnist")

# Reshape into (N, 1, 28, 28) float32 image tensors and divide by 255.
# NOTE(review): the division assumes the loader returns raw 0-255 pixel values — confirm;
# if it already returns values in [0, 1], the inputs end up scaled down by a factor of 255.
mnist_X = torch.tensor(mnist_digits, dtype=torch.float32).reshape(-1, 1, 28, 28) / 255.0
# Labels as int64 class indices (compared against argmax predictions later on).
mnist_y = torch.tensor(mnist_labels, dtype=torch.long)

# TODO: CIFAR-10 is not loaded yet.
# cifar, cifar_labels = gl.datasets.load("cifar10")


# Goal

Datasets:
1. CIFAR-10
2. MNIST

Models:
1. Convolutional Features
2. ReLU Features
3. Fourier Features

Each model transforms the data into a feature matrix $[M_{TM} \mid M_{TU}]$, where the columns of $M_{TM}$ are the basis functions we model, evaluated on the training set, and the columns of $M_{TU}$ are the basis functions that we have not yet modeled. We compute two sets of least-squares coefficients: $c$, the best coefficients of all basis functions for modeling the labels on the whole training set, and $\tilde{c}$, the best coefficients of the modeled basis functions for modeling the labels on the sampled training points. We then compute the error $c_{err} = \tilde{c}-c^*$, where $c^*$ is $c$ truncated to match the size of $\tilde{c}$.

For each dataset, we will:
- Sample rows of the feature matrix (i.e. training points) either uniformly at random or by leverage score.
- Plot $||A||_2$, $||M_{TM}^+||_2$, and $||\tilde{c}-c^*||_2$ for the sampled features as a function of the number of sampled points.

We expect to see that leverage score sampling leads to a smaller error $||\tilde{c}-c^*||_2$ for the same number of sampled points.


## MNIST

The MNIST dataset consists of 70,000 images of handwritten digits (0-9) in grayscale with a resolution of 28x28 pixels. This gives us a $70,000 \times 784$ data matrix.
- A Convolutional Neural network will transform the data to a $70,000 \times 200$ matrix (by removing the last layer).
- A Random ReLU fully-connected network ($y({\textbf{t}}) = \sum_{k=1}^{200} w_k \sigma(\left<\textbf{t}, {\textbf{v}}_k\right>)$ with $\sigma(x) = \max(0,x)$ and $\textbf{v}_k$ being randomly initialized weights and $w_k$ being the learned coefficients) will transform the data to a $70,000 \times 200$ matrix.
- A Fourier fully-connected network ($y({\textbf{t}}) = \operatorname{Re}\left(\sum_{k=1}^{200} w_k \exp(i\pi\left<\textbf{t}, {\textbf{v}}_k\right>)\right) = \sum_{k=1}^{200} w_k \cos(\pi\left<\textbf{t}, {\textbf{v}}_k\right>)$, with $\textbf{v}_k$ being randomly initialized weights and $w_k$ being the learned coefficients) will transform the data to a $70,000 \times 200$ matrix.

In [8]:
# Random Fourier Features
def rff_features(X, features=200, generator=None):
    """Map inputs to random Fourier (cosine) features.

    Each sample is flattened to a vector ``t`` and mapped to
    ``cos(pi * <t, v_k>) / sqrt(features)`` for ``k = 1..features``, where the
    projection vectors ``v_k`` are the columns of a standard-normal matrix.

    Args:
        X: Tensor whose first dimension indexes samples; all remaining
            dimensions are flattened into a single axis.
        features: Number of random features (output columns).
        generator: Optional ``torch.Generator`` used to draw the projection
            matrix. Pass a seeded generator to make the features reproducible;
            ``None`` (the default) draws from torch's global RNG.

    Returns:
        Tensor of shape ``(X.shape[0], features)``.
    """
    N, *_ = X.shape
    X = X.reshape(N, -1)

    # One random projection direction per output feature.
    W = torch.randn(X.shape[1], features, generator=generator)

    return torch.cos(torch.pi * X @ W) / np.sqrt(features)  # Normalize

# Build the MNIST random-Fourier feature matrix with 2000 columns.
# NOTE(review): the markdown above describes 200 basis functions per model — confirm which is intended.
mnist_rff_features = rff_features(mnist_X, features=2000)

# Random ReLU Features
def relu_features(X, features=200, generator=None):
    """Map inputs to random ReLU features.

    Each sample is flattened to a vector ``t`` and mapped to
    ``max(0, <t, v_k>) / sqrt(features)`` for ``k = 1..features``, where the
    projection vectors ``v_k`` are the columns of a standard-normal matrix.

    Args:
        X: Tensor whose first dimension indexes samples; all remaining
            dimensions are flattened into a single axis.
        features: Number of random features (output columns).
        generator: Optional ``torch.Generator`` used to draw the projection
            matrix. Pass a seeded generator to make the features reproducible;
            ``None`` (the default) draws from torch's global RNG.

    Returns:
        Non-negative tensor of shape ``(X.shape[0], features)``.
    """
    N, *_ = X.shape
    X = X.reshape(N, -1)
    # One random projection direction per output feature.
    W = torch.randn(X.shape[1], features, generator=generator)
    return torch.relu(X @ W)  / np.sqrt(features)

# Build the MNIST random-ReLU feature matrix with 2000 columns.
# NOTE(review): the markdown above describes 200 basis functions per model — confirm which is intended.
mnist_relu_features = relu_features(mnist_X, features=2000)


In [9]:
# Verify CNN accuracy on MNIST

from models.mnist_cnn import ConvNet, BASIS_FUNCTIONS
from torch.utils.data import TensorDataset

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the CNN and restore its saved weights from disk.
# NOTE(review): torch.load unpickles the checkpoint — only load files from a trusted source.
network = ConvNet()
network.load_state_dict(torch.load("models/mnist_cnn.pth", map_location=device))
# Switch to inference mode before evaluating / embedding.
network.eval()

def verify_mnist_cnn(model: ConvNet, device, batch_size: int = 512):
    """Print the classification accuracy of ``model`` over all of ``mnist_X``.

    Reads the module-level tensors ``mnist_X`` (images) and ``mnist_y``
    (labels). The printed label says "Test accuracy", but the evaluation runs
    over every sample in ``mnist_X``, not a held-out split.

    Args:
        model: Network whose forward pass returns per-class scores; the
            predicted class is the argmax over dimension 1.
        device: Device the model and each batch are moved to.
        batch_size: Samples per forward pass. Previously the loader used the
            DataLoader default of one sample per batch, i.e. one forward pass
            per image.

    Returns:
        None. The accuracy is printed.
    """
    model.to(device)
    # Inference mode, so batching cannot change the predictions through
    # train-time behaviour of layers such as dropout or batch norm.
    model.eval()

    correct = 0
    total = 0
    # Accuracy does not depend on sample order, so the data is used as-is
    # instead of being copied through a random permutation.
    test_loader = torch.utils.data.DataLoader(
        TensorDataset(mnist_X, mnist_y),
        batch_size=batch_size,
    )
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds: torch.Tensor = model(xb)
            correct += (preds.argmax(1) == yb).sum().item()
            total += yb.size(0)

    # Guard against an empty dataset instead of raising ZeroDivisionError.
    accuracy = correct / total if total else float("nan")
    print(f"Test accuracy: {accuracy:.4f}")
# Sanity check: report the loaded network's accuracy over all MNIST samples.
verify_mnist_cnn(network, device)

Test accuracy: 0.9213


We get an accuracy on the whole dataset of `0.9213`. Pretty good. Now we can embed the data using the convolutional layers of the network and use that as our feature matrix for sampling.

In [10]:
# Embed the data using the convolutional layers of the network.
# Put the network on `device` and in inference mode here, rather than relying
# on verify_mnist_cnn() having moved it as a side effect in an earlier cell.
network.to(device)
network.eval()

embed_batch_size = 256  # samples per forward pass
mnist_cnn_embedding = np.empty((mnist_X.shape[0], BASIS_FUNCTIONS))

with torch.no_grad():
    for batch_start in range(0, mnist_X.shape[0], embed_batch_size):
        batch_end = min(batch_start + embed_batch_size, mnist_X.shape[0])
        batch = mnist_X[batch_start:batch_end].to(device)
        # Copy back to the CPU so the result can be stored in the NumPy array.
        embeddings = network.embed(batch).cpu().numpy()
        mnist_cnn_embedding[batch_start:batch_end] = embeddings

mnist_cnn_embedding.shape

(70000, 200)

In [None]:
import numpy as np
from matplotlib import pyplot as plt


def simulation(
        M: np.ndarray,
        embedding_label: str,
        modeled_basis_functions: int = 100,
        sample_points: int = 5000,
        trials: int = 10,
        y: "np.ndarray | None" = None,
        rng=None,
    ):
    """Compare least-squares fits on uniformly sampled rows vs. high-leverage rows.

    The first ``modeled_basis_functions`` columns of ``M`` form the modeled
    matrix ``M_TM``. For each sample size ``n`` in
    ``range(10, sample_points, 100)`` the coefficients are fitted on ``n`` rows
    of ``M_TM`` and two relative errors are recorded:

    * prediction error ``||y - M_TM c|| / ||y||`` over all rows, and
    * parameter error ``||c_true[:k] - c|| / ||c_true[:k]||``, where ``c_true``
      is the least-squares solution using every row and column of ``M``.

    Rows are chosen either uniformly at random without replacement (averaged
    over ``trials`` draws) or as the ``n`` rows with the largest leverage
    scores of ``M_TM``.

    NOTE(review): the leverage strategy is a deterministic top-``n`` selection,
    not randomized sampling proportional to the scores, so it does not vary
    between trials — confirm this is the intended comparison.

    Args:
        M: Feature matrix, one row per sample and one column per basis function.
        embedding_label: Name of the embedding; printed and returned unchanged.
        modeled_basis_functions: Number of leading columns of ``M`` to model.
        sample_points: Exclusive upper bound on the number of sampled rows.
        trials: Number of random draws averaged for each sample size.
        y: Target vector with one entry per row of ``M``. Defaults to the
            module-level ``mnist_y`` labels.
        rng: Optional source of randomness exposing
            ``choice(a, size, replace=False)``, e.g. a seeded
            ``np.random.Generator``. Defaults to the global ``np.random`` state.

    Returns:
        Tuple ``(x_axis, errors_random_avg, errors_leverage_avg,
        parameter_errors_random_avg, parameter_errors_leverage_avg,
        embedding_label)``.
    """
    print("Running simulation for", embedding_label)
    if y is None:
        y = mnist_y.numpy()
    y = np.asarray(y)
    sampler = np.random if rng is None else rng

    c_true = np.linalg.lstsq(M, y, rcond=None)[0]
    M_TM = M[:, :modeled_basis_functions]
    c_reference = c_true[:modeled_basis_functions]
    y_norm = np.linalg.norm(y)
    c_reference_norm = np.linalg.norm(c_reference)

    # Leverage scores are the squared row norms of Q from the thin QR of M_TM.
    # They depend only on M_TM, so compute and rank them once, up front,
    # instead of once per trial.
    leverage_scores = np.linalg.norm(np.linalg.qr(M_TM, mode='reduced')[0], axis=1) ** 2
    leverage_order = np.argsort(-leverage_scores)

    def _fit_and_score(row_indices):
        """Fit on the given rows; return (prediction error, parameter error)."""
        c_fit = np.linalg.lstsq(M_TM[row_indices, :], y[row_indices], rcond=None)[0]
        prediction_error = np.linalg.norm(y - M_TM @ c_fit) / y_norm
        parameter_error = np.linalg.norm(c_reference - c_fit) / c_reference_norm
        return prediction_error, parameter_error

    x_axis = range(10, sample_points, 100)
    errors_random_avg = []
    errors_leverage_avg = []
    parameter_errors_random_avg = []
    parameter_errors_leverage_avg = []

    for n in x_axis:
        errors_random = []
        parameter_errors_random = []

        print(f"Sampling {n} points...")
        for _ in range(trials):
            random_indices = sampler.choice(M.shape[0], n, replace=False)
            error_random, parameter_error_random = _fit_and_score(random_indices)
            errors_random.append(error_random)
            parameter_errors_random.append(parameter_error_random)

        # The top-n leverage rows are the same in every trial, so a single fit
        # equals the average over trials.
        error_leverage, parameter_error_leverage = _fit_and_score(leverage_order[:n])

        errors_random_avg.append(np.mean(errors_random))
        errors_leverage_avg.append(error_leverage)
        parameter_errors_random_avg.append(np.mean(parameter_errors_random))
        parameter_errors_leverage_avg.append(parameter_error_leverage)

    return (x_axis, errors_random_avg, errors_leverage_avg, parameter_errors_random_avg, parameter_errors_leverage_avg, embedding_label)

In [None]:
# Run the same experiment on each embedding: model the first 100 columns and
# sweep the number of sampled rows up to 5000, averaging 10 trials per size.
# The calls stay as separate statements so an earlier result is still assigned
# if a later run fails.
data1 = simulation(mnist_cnn_embedding, "CNN", modeled_basis_functions=100, sample_points=5000, trials=10)
data2 = simulation(mnist_rff_features.numpy(), "RFF", modeled_basis_functions=100, sample_points=5000, trials=10)
data3 = simulation(mnist_relu_features.numpy(), "ReLU", modeled_basis_functions=100, sample_points=5000, trials=10)

In [None]:
def plot_data(
        x_axis,
        errors_random_avg,
        errors_leverage_avg,
        parameter_errors_random_avg,
        parameter_errors_leverage_avg,
        embedding_label: str,
    ):
    """Show two semilog-y figures comparing random and leverage-score sampling.

    The first figure plots the relative prediction error and the second the
    relative parameter error, each against the number of samples. The
    arguments match, in order, the tuple returned by ``simulation``.

    Args:
        x_axis: Sample sizes used on the horizontal axis.
        errors_random_avg: Relative error per sample size, random sampling.
        errors_leverage_avg: Relative error per sample size, leverage selection.
        parameter_errors_random_avg: Relative parameter error, random sampling.
        parameter_errors_leverage_avg: Relative parameter error, leverage selection.
        embedding_label: Embedding name shown in the figure titles.
    """
    # (random curve, leverage curve, y-axis label, title) for each figure, in display order.
    panels = (
        (
            errors_random_avg,
            errors_leverage_avg,
            'Relative Error',
            f'MNIST Active Learning: Random vs Leverage Score Sampling ({embedding_label})',
        ),
        (
            parameter_errors_random_avg,
            parameter_errors_leverage_avg,
            'Relative Parameter Error',
            f'MNIST Active Learning Parameter Error: Random vs Leverage Score Sampling ({embedding_label})',
        ),
    )

    for random_curve, leverage_curve, y_label, figure_title in panels:
        plt.semilogy(x_axis, random_curve, label='Random Sampling')
        plt.semilogy(x_axis, leverage_curve, label='Leverage Score Sampling')
        plt.xlabel('Number of Samples')
        plt.ylabel(y_label)
        plt.title(figure_title)
        plt.legend()
        plt.grid()
        plt.tight_layout()
        plt.show()


# Each data tuple unpacks directly into plot_data's positional parameters.
plot_data(*data1)  # CNN embedding
plot_data(*data2)  # random Fourier features
plot_data(*data3)  # random ReLU features

## CIFAR-10

The CIFAR-10 dataset consists of 60,000 color images with a resolution of 32x32 pixels, divided into 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck). Flattening each $32 \times 32 \times 3$ image gives us a $60,000 \times 3072$ data matrix.