In [1]:
from __future__ import print_function
import numpy as np
import numpy.random as npr
import torch as th
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.modules.loss import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from torch.optim import SGD, Adam
import my

In [2]:
# Dataset sizes forwarded to my.unbalanced_dataset.
# NOTE(review): presumably 0 selects the full split -- confirm against the helper.
N_TRAIN, N_TEST = 0, 0
train_data, train_labels, test_data, test_labels = my.unbalanced_dataset(
    'MNIST', N_TRAIN, N_TEST, pca=False, p=[0, 1, 10])

# Keep handles on the NumPy arrays: sample() draws its random subsets from
# these rather than from the torch tensors created below.
train_data_np, train_labels_np, test_data_np, test_labels_np = \
    train_data, train_labels, test_data, test_labels

# Features become float tensors, labels become long (integer class) tensors.
train_data = th.from_numpy(train_data).float()
train_labels = th.from_numpy(train_labels).long()
test_data = th.from_numpy(test_data).float()
test_labels = th.from_numpy(test_labels).long()

# Preferred GPU index. Only use CUDA when it is actually present, and clamp the
# index to the devices that exist, so the notebook also runs on CPU-only hosts
# and on machines with fewer than four GPUs instead of raising at set_device().
CUDA_DEVICE = 3
cuda = th.cuda.is_available()
if cuda:
    th.cuda.set_device(min(CUDA_DEVICE, th.cuda.device_count() - 1))
    train_data, train_labels = train_data.cuda(), train_labels.cuda()
    test_data, test_labels = test_data.cuda(), test_labels.cuda()

train_data, train_labels, test_data, test_labels = \
    map(Variable, (train_data, train_labels, test_data, test_labels))

# Feature dimension is the second axis of the training matrix; the class count
# is taken from the range of label values seen in the training set.
N_FEATURES = train_data.size()[1]
N_CLASSES = int(train_labels.max() - train_labels.min() + 1)

In [3]:
# Baseline: a linear classifier trained with plain cross-entropy, taking one
# gradient step per pass over the whole training set.
c = nn.Linear(N_FEATURES, N_CLASSES)
if cuda:
    c = c.cuda()
optim = SGD(c.parameters(), lr=0.001)
criterion = CrossEntropyLoss()
N_ITERATIONS = 10000
for step in range(1, N_ITERATIONS + 1):
    loss = criterion(c(train_data), train_labels)
    # Report progress every 1000 steps, using the parameters *before* this
    # step's update (the same parameters that produced `loss`).
    if step % 1000 == 0:
        accuracy = my.accuracy(my.predict(c, train_data), train_labels)
        print('[iteration %d]cross-entropy loss: %f, accuracy: %f' % (step, float(loss), float(accuracy)))
    optim.zero_grad()
    loss.backward()
    optim.step()

[iteration 1000]cross-entropy loss: 0.344358, accuracy: 0.908700
[iteration 2000]cross-entropy loss: 0.247047, accuracy: 0.951150
[iteration 3000]cross-entropy loss: 0.193739, accuracy: 0.968400
[iteration 4000]cross-entropy loss: 0.160498, accuracy: 0.976417
[iteration 5000]cross-entropy loss: 0.138098, accuracy: 0.980583
[iteration 6000]cross-entropy loss: 0.122109, accuracy: 0.983300
[iteration 7000]cross-entropy loss: 0.110164, accuracy: 0.984867
[iteration 8000]cross-entropy loss: 0.100913, accuracy: 0.986000
[iteration 9000]cross-entropy loss: 0.093537, accuracy: 0.986817
[iteration 10000]cross-entropy loss: 0.087518, accuracy: 0.987383


In [4]:
# Score the cross-entropy baseline on the test split.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
# The three class-wise metrics share one call signature, so compute them in
# the same order as they are reported.
precision, recall, f1 = (
    metric(y_bar, test_labels, N_CLASSES)
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)
print('accuracy: %f, precision: %f, recall: %f, f1: %f'
      % (float(accuracy), float(precision), float(recall), float(f1)))

accuracy: 0.986600, precision: 0.951637, recall: 0.975291, f1: 0.963068


# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat 1, 2, and 3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

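5. Randomly sample $K'$ subsets, $D_1, ..., D_{K'}$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^{K'} \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate. (When $L$ is a score to be maximized, as with the F-score used in the code below, the update ascends $L_\theta$ instead, i.e. the sign of the step is flipped.)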

In [5]:
# Number of examples in each random subset drawn by sample(); it also fixes the
# input width of the `approx` network, which is built from SAMPLE_SIZE later on.
SAMPLE_SIZE = 64

def L(classifier, X, y):
    """Return the true (non-decomposable) objective of `classifier` on (X, y).

    The classifier's predictions, obtained through my.predict, are scored
    against the labels `y` with my.nd_f_beta over N_CLASSES classes.
    """
    return my.nd_f_beta(my.predict(classifier, X), y, N_CLASSES)

def forward(classifier, pair):
    """Encode one sampled subset as a single feature row for the approximator.

    `pair` is an (inputs, labels) tuple. The one-hot labels and the
    classifier's softmax outputs are concatenated column-wise, then the whole
    matrix is flattened into a tensor with exactly one row.
    """
    inputs, targets = pair
    probabilities = F.softmax(classifier(inputs), 1)
    targets_onehot = my.onehot(targets, N_CLASSES)
    paired = th.cat((targets_onehot, probabilities), 1)
    return paired.view(1, -1)
    
def sample(K=10):
    """Draw K random training subsets of SAMPLE_SIZE examples each.

    Returns a list of K (X, y) pairs wrapped in Variable, moved to the GPU
    first when the module-level `cuda` flag is set.
    """
    batches = []
    for _ in range(K):
        X, y = my.sample_subset(train_data_np, train_labels_np, SAMPLE_SIZE)
        if cuda:
            X, y = X.cuda(), y.cuda()
        batches.append((Variable(X), Variable(y)))
    return batches

In [6]:
# Fresh linear classifier, trained below through the learned surrogate instead
# of cross-entropy.
c = nn.Linear(N_FEATURES, N_CLASSES)

# Surrogate network: its input is one flattened forward() row, i.e. the
# (one-hot label, softmax) pair for every example of a SAMPLE_SIZE subset,
# followed by three hidden layers of 1024 units and a scalar output.
approx_layer_sizes = (2 * N_CLASSES * SAMPLE_SIZE, 1024, 1024, 1024, 1)
approx = my.MLP(approx_layer_sizes, F.relu)

if cuda:
    c.cuda()
    approx.cuda()

c_optim = SGD(c.parameters(), lr=0.1, momentum=0.9)
approx_optim = SGD(approx.parameters(), lr=0.1, momentum=0.9)
# Alternative optimizer configuration, kept for reference:
# c_optim = Adam(c.parameters(), 1e-3)
# approx_optim = Adam(approx.parameters(), 1e-3)

# Test-set F-score of the untrained classifier, shown as the cell's output.
initial_predictions = my.predict(c, test_data)
float(my.nd_f_beta(initial_predictions, test_labels, N_CLASSES))

0.3194884955883026

In [7]:
STD = 0.1     # perturbation scale handed to my.perturb
OUTER = 2000  # number of classifier updates
INNER = 10    # surrogate-fitting steps before each classifier update


def stack_features(classifier, batches):
    """Stack forward(classifier, batch) for every batch, one row per subset."""
    return th.cat(tuple(forward(classifier, batch) for batch in batches), 0)


mse_criterion = MSELoss()

for i in range(OUTER):
    # Steps 1-4: fit the surrogate so that its average change between c and a
    # perturbed copy c_bar matches the change of the true objective L.
    for j in range(INNER):
        c_bar = my.perturb(c, STD)
        score = L(c, train_data, train_labels)
        score_bar = L(c_bar, train_data, train_labels)
        delta = score - score_bar

        samples = sample()
        y = stack_features(c, samples)
        y_bar = stack_features(c_bar, samples)
        delta_ = th.mean(approx(y) - approx(y_bar), 0)

        mse = mse_criterion(delta_, delta)
        approx_optim.zero_grad()
        mse.backward()
        approx_optim.step()

    # Step 5: update the classifier through the surrogate. L is an F-score, so
    # the surrogate's mean output is negated and the optimizer's descent step
    # therefore increases the estimated score.
    samples = sample()
    y = stack_features(c, samples)
    objective = -approx(y).mean()
    c_optim.zero_grad()
    objective.backward()
    c_optim.step()

    iteration = i + 1
    if iteration % 100 != 0:
        continue
    # Every 100 outer iterations, report the test-set F-score.
    y_bar = my.predict(c, test_data)
    f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
    print('[iteration %d]mse: %f, objective: %f, f1: %f' % (iteration, float(mse), float(objective), float(f1)))

[iteration 100]mse: 0.009566, objective: -0.493887, f1: 0.594356
[iteration 200]mse: 0.000256, objective: -0.409504, f1: 0.637162
[iteration 300]mse: 0.006273, objective: -0.573395, f1: 0.660234
[iteration 400]mse: 0.000182, objective: -0.588699, f1: 0.591669
[iteration 500]mse: 0.000240, objective: -0.792452, f1: 0.635765
[iteration 600]mse: 0.001925, objective: -2.011851, f1: 0.664736
[iteration 700]mse: 0.000156, objective: -2.190856, f1: 0.637485
[iteration 800]mse: 0.000008, objective: -3.005857, f1: 0.878504
[iteration 900]mse: 0.000014, objective: -4.192799, f1: 0.883663
[iteration 1000]mse: 0.000091, objective: -4.566614, f1: 0.896868
[iteration 1100]mse: 0.000027, objective: -4.760266, f1: 0.922461
[iteration 1200]mse: 0.000024, objective: -5.938343, f1: 0.944596
[iteration 1300]mse: 0.000008, objective: -6.220542, f1: 0.956549
[iteration 1400]mse: 0.000012, objective: -6.416841, f1: 0.954157
[iteration 1500]mse: 0.000362, objective: -6.357320, f1: 0.965065
[iteration 1600]mse

In [8]:
# Score the surrogate-trained classifier on the test split, using the same
# metrics as the baseline evaluation.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
precision = my.nd_precision(y_bar, test_labels, N_CLASSES)
recall = my.nd_recall(y_bar, test_labels, N_CLASSES)
f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
report = [float(value) for value in (accuracy, precision, recall, f1)]
print('accuracy: %f, precision: %f, recall: %f, f1: %f' % tuple(report))

accuracy: 0.989700, precision: 0.967909, recall: 0.974280, f1: 0.971062
