In [1]:
from __future__ import print_function
import numpy as np
import numpy.random as npr
import torch as th
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.modules.loss import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from torch.optim import SGD, Adam
import my

In [2]:
# Sizes forwarded to my.unbalanced_dataset.
# NOTE(review): presumably 0 means "use the full split" -- confirm against my.unbalanced_dataset.
N_TRAIN, N_TEST = 0, 0
train_data, train_labels, test_data, test_labels = my.unbalanced_dataset(
    'MNIST', N_TRAIN, N_TEST, pca=False, p=[])

# Keep references to the numpy arrays: sample() draws subsets from these,
# while the names without the _np suffix are rebound to tensors below.
train_data_np, train_labels_np, test_data_np, test_labels_np = \
    train_data, train_labels, test_data, test_labels

train_data = th.from_numpy(train_data).float()
train_labels = th.from_numpy(train_labels).long()
test_data = th.from_numpy(test_data).float()
test_labels = th.from_numpy(test_labels).long()

# Preferred GPU index. Only selected when the machine actually has that many
# devices; otherwise the current default CUDA device is used instead of
# crashing in set_device.
CUDA_DEVICE = 3
# Fall back to the CPU when CUDA is unavailable rather than failing on .cuda().
cuda = th.cuda.is_available()
if cuda:
    if CUDA_DEVICE < th.cuda.device_count():
        th.cuda.set_device(CUDA_DEVICE)
    train_data, train_labels = train_data.cuda(), train_labels.cuda()
    test_data, test_labels = test_data.cuda(), test_labels.cuda()

train_data, train_labels, test_data, test_labels = \
    map(Variable, (train_data, train_labels, test_data, test_labels))

# Width of the second dimension of the training data.
N_FEATURES = train_data.size()[1]
# Number of classes derived from the label range (max - min + 1).
N_CLASSES = int(train_labels.max() - train_labels.min() + 1)

In [3]:
# Baseline: a linear classifier trained full-batch on the cross-entropy loss.
c = nn.Linear(N_FEATURES, N_CLASSES)
if cuda:
    c.cuda()
optim = Adam(c.parameters(), lr=0.001)
N_ITERATIONS = 10000
criterion = CrossEntropyLoss()
for step in range(1, N_ITERATIONS + 1):
    optim.zero_grad()
    loss = criterion(c(train_data), train_labels)
    if step % 1000 == 0:
        # Reported before the parameter update of this step is applied.
        accuracy = my.accuracy(my.predict(c, train_data), train_labels)
        print('[iteration %d]cross-entropy loss: %f, accuracy: %f' % (step, float(loss), float(accuracy)))
    loss.backward()
    optim.step()

[iteration 1000]cross-entropy loss: 0.236208, accuracy: 0.934467
[iteration 2000]cross-entropy loss: 0.219137, accuracy: 0.939167
[iteration 3000]cross-entropy loss: 0.211611, accuracy: 0.941383
[iteration 4000]cross-entropy loss: 0.207340, accuracy: 0.942767
[iteration 5000]cross-entropy loss: 0.204689, accuracy: 0.943767
[iteration 6000]cross-entropy loss: 0.203005, accuracy: 0.943933
[iteration 7000]cross-entropy loss: 0.201929, accuracy: 0.944083
[iteration 8000]cross-entropy loss: 0.201244, accuracy: 0.944250
[iteration 9000]cross-entropy loss: 0.200819, accuracy: 0.944433
[iteration 10000]cross-entropy loss: 0.200560, accuracy: 0.944600


In [4]:
# Test-set metrics of the cross-entropy baseline.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
# The three class-wise metrics share one call signature.
precision, recall, f1 = (
    metric(y_bar, test_labels, N_CLASSES)
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)
print('accuracy: %f, precision: %f, recall: %f, f1: %f'
      % (float(accuracy), float(precision), float(recall), float(f1)))

accuracy: 0.918700, precision: 0.917962, recall: 0.917628, f1: 0.917718


# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat 1, 2, and 3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

5. Randomly sample $K'$ subsets, $D_1, ..., D_{K'}$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^{K'} \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate

In [5]:
# Size argument passed to my.sample_subset for every subset drawn by sample();
# also handed to the critic constructor in a later cell.
SAMPLE_SIZE = 64
# Number of subsets drawn per call to sample().
BATCH_SIZE = 16

def L(classifier, X, y):
    """Score `classifier` on (X, y) with my.nd_f_beta.

    The classifier's predictions (via my.predict) are compared against the
    labels `y` over N_CLASSES classes; the result of my.nd_f_beta is returned
    as-is.
    """
    return my.nd_f_beta(my.predict(classifier, X), y, N_CLASSES)

def forward(classifier, pair):
    """Build one critic input row from a sampled subset.

    `pair` is an (X, y) tuple. The one-hot labels and the classifier's softmax
    outputs are concatenated along dimension 1 and the result is flattened
    into a single row of shape (1, -1).
    """
    inputs, labels = pair
    targets = my.onehot(labels, N_CLASSES)
    probabilities = F.softmax(classifier(inputs), 1)
    joined = th.cat((targets, probabilities), 1)
    return joined.view(1, -1)
    
def sample():
    """Draw BATCH_SIZE random subsets of the training data.

    Each subset comes from my.sample_subset with SAMPLE_SIZE as its size
    argument, is moved to the GPU when `cuda` is set, and is returned as a
    (Variable, Variable) pair of inputs and labels.
    """
    subsets = []
    for _ in range(BATCH_SIZE):
        X, y = my.sample_subset(train_data_np, train_labels_np, SAMPLE_SIZE)
        if cuda:
            X, y = X.cuda(), y.cuda()
        subsets.append((Variable(X), Variable(y)))
    return subsets

In [6]:
# Fresh classifier (rebinds `c`, replacing the cross-entropy baseline) and the
# critic that approximates the non-decomposable objective.
# Alternatives previously tried here and left out: a my.MLP critic over the
# flattened (2 * N_CLASSES * SAMPLE_SIZE) input, and SGD(lr=0.1, momentum=0.9)
# for both optimizers.
CRITIC_LAYERS = (1024, 1024, 1024, 1)
LEARNING_RATE = 1e-3

c = nn.Linear(N_FEATURES, N_CLASSES)
critic = my.RN(SAMPLE_SIZE, 2 * N_CLASSES, CRITIC_LAYERS, F.relu)
if cuda:
    c.cuda()
    critic.cuda()

c_optim = Adam(c.parameters(), lr=LEARNING_RATE)
critic_optim = Adam(critic.parameters(), lr=LEARNING_RATE)

# Test-set score of the untrained classifier, shown as the cell output.
float(L(c, test_data, test_labels))

0.10084491968154907

In [7]:
STD = 0.05   # scale argument passed to my.perturb
OUTER = 500  # number of classifier updates
INNER = 10   # critic updates performed before each classifier update

# Built once instead of on every inner iteration.
critic_criterion = MSELoss()

for i in range(OUTER):
    # `c` is only updated after the inner loop, so its score on the full
    # training set is the same for all INNER iterations: evaluate it once.
    # NOTE(review): assumes my.perturb returns a perturbed copy and leaves `c`
    # itself unmodified -- confirm against my.perturb.
    score = L(c, train_data, train_labels)

    for j in range(INNER):
        # Step 1: true change of the objective under a random perturbation.
        c_bar = my.perturb(c, STD)
        delta = score - L(c_bar, train_data, train_labels)

        # Steps 2-3: fit the critic so that its mean difference over the
        # sampled subsets matches `delta`. Only the critic is optimised here,
        # so the inputs are detached to avoid back-propagating into the
        # classifiers (those gradients were discarded anyway).
        samples = sample()
        y = th.cat(tuple(forward(c, pair) for pair in samples), 0).detach()
        y_bar = th.cat(tuple(forward(c_bar, pair) for pair in samples), 0).detach()
        delta_ = th.mean(critic(y) - critic(y_bar), 0)

        mse = critic_criterion(delta_, delta)
        critic_optim.zero_grad()
        mse.backward()
        critic_optim.step()

    # Step 5: move the classifier along the critic's gradient. The critic
    # output is negated, so the optimiser increases the approximated score.
    samples = sample()
    y = th.cat(tuple(forward(c, pair) for pair in samples), 0)
    objective = -th.mean(critic(y))
    c_optim.zero_grad()
    objective.backward()
    c_optim.step()

    if (i + 1) % 10 == 0:
        # `mse` is the value from the last inner iteration of this outer step.
        y_bar = my.predict(c, test_data)
        f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
        print('[iteration %d]mse: %f, objective: %f, f1: %f' % ((i + 1), float(mse), float(objective), float(f1)))

[iteration 10]mse: 0.000616, objective: -0.668115, f1: 0.089080
[iteration 20]mse: 0.000251, objective: -0.445720, f1: 0.093592
[iteration 30]mse: 0.000299, objective: -0.063844, f1: 0.126089
[iteration 40]mse: 0.000223, objective: -0.326191, f1: 0.154079
[iteration 50]mse: 0.000038, objective: 0.502596, f1: 0.235320
[iteration 60]mse: 0.000053, objective: 0.263235, f1: 0.298056
[iteration 70]mse: 0.001560, objective: -0.245873, f1: 0.418190
[iteration 80]mse: 0.001454, objective: -0.030150, f1: 0.531991
[iteration 90]mse: 0.000121, objective: 0.055552, f1: 0.573501
[iteration 100]mse: 0.000225, objective: -0.239900, f1: 0.597914
[iteration 110]mse: 0.000001, objective: -0.043828, f1: 0.617740
[iteration 120]mse: 0.000167, objective: 0.564130, f1: 0.632668
[iteration 130]mse: 0.000074, objective: -0.621435, f1: 0.649655
[iteration 140]mse: 0.000299, objective: -0.881000, f1: 0.751355
[iteration 150]mse: 0.000030, objective: 0.126243, f1: 0.765008
[iteration 160]mse: 0.000001, objective

In [8]:
# Test-set metrics of the classifier trained through the critic.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
# The three class-wise metrics share one call signature.
precision, recall, f1 = (
    metric(y_bar, test_labels, N_CLASSES)
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)
print('accuracy: %f, precision: %f, recall: %f, f1: %f'
      % (float(accuracy), float(precision), float(recall), float(f1)))

accuracy: 0.913600, precision: 0.912556, recall: 0.912634, f1: 0.912259
