In [1]:
import torch as th
from torch.nn.modules.loss import CrossEntropyLoss
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
import my
import lenet

In [2]:
class Args:
    pass
args = Args()

args.master_gpu = 0
args.n_epochs = 10

In [3]:
cuda = args.master_gpu >= 0
if cuda:
    th.cuda.set_device(args.master_gpu)

# train_x, train_y, test_x, test_y = my.load_mnist(partition=(0, 1, 10), rbg=True)
train_x, train_y, test_x, test_y = my.load_mnist(partition=(0, 1, 10), rbg=False)

train_loader = DataLoader(TensorDataset(train_x, train_y), 64, shuffle=True, drop_last=True)
test_loader = DataLoader(TensorDataset(test_x, test_y), 4096, drop_last=True)

n_classes = int(train_y.max() - train_y.min() + 1)

In [4]:
nd_f_beta = lambda y_bar, y: my.nd_f_beta(y_bar, y, n_classes)
nd_precision = lambda y_bar, y: my.nd_precision(y_bar, y, n_classes)
nd_recall = lambda y_bar, y: my.nd_recall(y_bar, y, n_classes)
stats = (my.accuracy, nd_f_beta, nd_precision, nd_recall)

In [5]:
c = my.MLP((28 * 28, n_classes), None)
# c = lenet.LeNet(F.tanh)
if cuda:
    c.cuda()
optim = Adam(c.parameters(), lr=0.001)
for i in range(args.n_epochs):
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
        ce = CrossEntropyLoss()(c(x), y)
        optim.zero_grad()
        ce.backward()
        optim.step()
    accuracy, precision, recall, f1 = my.global_stats(c, test_loader, stats)
    print('[epoch %d]accuracy: %f; precision: %f; recall: %f; f1: %f' % (i + 1, accuracy, precision, recall, f1))

[epoch 1]accuracy: 0.970215; precision: 0.927909; recall: 0.885567; f1: 0.974515
[epoch 2]accuracy: 0.986206; precision: 0.962371; recall: 0.946954; f1: 0.978309
[epoch 3]accuracy: 0.986572; precision: 0.963283; recall: 0.948531; f1: 0.978512
[epoch 4]accuracy: 0.989014; precision: 0.968925; recall: 0.964778; f1: 0.973118
[epoch 5]accuracy: 0.988525; precision: 0.967221; recall: 0.967227; f1: 0.967227
[epoch 6]accuracy: 0.990723; precision: 0.973763; recall: 0.969573; f1: 0.977999
[epoch 7]accuracy: 0.989868; precision: 0.971212; recall: 0.968854; f1: 0.973591
[epoch 8]accuracy: 0.989868; precision: 0.971364; recall: 0.966938; f1: 0.975840
[epoch 9]accuracy: 0.989868; precision: 0.971661; recall: 0.963690; f1: 0.979775
[epoch 10]accuracy: 0.990479; precision: 0.972999; recall: 0.969850; f1: 0.976178


# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat 1, 2, and 3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

5. Randomly sample $K'$ subsets, $D_1, ..., D_K'$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^K \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate