In [1]:
from __future__ import print_function
import numpy as np
import numpy.random as npr
import torch as th
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.modules.loss import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from torch.optim import SGD, Adam
import my

In [2]:
# Seed the global RNGs first so that Restart & Run All is repeatable.
# NOTE(review): this only pins down the data split / sampling if `my` draws
# from the global numpy / torch generators -- confirm against the module.
SEED = 0
npr.seed(SEED)
th.manual_seed(SEED)

# Sizes forwarded to my.unbalanced_cifar10.
# NOTE(review): 0 presumably means "use every available example" -- confirm.
N_TRAIN, N_TEST = 0, 0
# NOTE(review): `p` looks like per-class sampling weights -- confirm in `my`.
train_data, train_labels, test_data, test_labels = my.unbalanced_cifar10(N_TRAIN, N_TEST, p=[0, 1, 10])

# Keep the numpy arrays around: sample() draws its subsets from these,
# not from the torch Variables created below.
train_data_np, train_labels_np, test_data_np, test_labels_np = \
    train_data, train_labels, test_data, test_labels

train_data = th.from_numpy(train_data).float()
train_labels = th.from_numpy(train_labels).long()
test_data = th.from_numpy(test_data).float()
test_labels = th.from_numpy(test_labels).long()

cuda = True
GPU_ID = 3  # index of the CUDA device used when `cuda` is True
if cuda:
    th.cuda.set_device(GPU_ID)
    train_data, train_labels = train_data.cuda(), train_labels.cuda()
    test_data, test_labels = test_data.cuda(), test_labels.cuda()

train_data, train_labels, test_data, test_labels = \
    map(Variable, (train_data, train_labels, test_data, test_labels))

# Input width is the second dimension of the training matrix; the class count
# is derived from the span of label values seen in the training set.
N_FEATURES = train_data.size()[1]
N_CLASSES = int(train_labels.max() - train_labels.min() + 1)

In [3]:
# Baseline: a single linear layer trained full-batch on cross-entropy.
c = nn.Linear(N_FEATURES, N_CLASSES)
if cuda:
    c.cuda()
optim = SGD(c.parameters(), lr=0.001)
N_ITERATIONS = 5000
criterion = CrossEntropyLoss()
for step in range(1, N_ITERATIONS + 1):
    logits = c(train_data)
    loss = criterion(logits, train_labels)
    # Report every 1000th step, using the weights the loss was computed with.
    if step % 1000 == 0:
        accuracy = my.accuracy(my.predict(c, train_data), train_labels)
        print('[iteration %d]cross-entropy loss: %f, accuracy: %f' % (step, float(loss), float(accuracy)))
    optim.zero_grad()
    loss.backward()
    optim.step()

[iteration 1000]cross-entropy loss: 0.460464, accuracy: 0.874680
[iteration 2000]cross-entropy loss: 0.369053, accuracy: 0.907500
[iteration 3000]cross-entropy loss: 0.324189, accuracy: 0.910580
[iteration 4000]cross-entropy loss: 0.299162, accuracy: 0.911000
[iteration 5000]cross-entropy loss: 0.283824, accuracy: 0.911000


In [4]:
# Test-set metrics for the classifier currently bound to `c`.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
precision = my.nd_precision(y_bar, test_labels, N_CLASSES)
recall = my.nd_recall(y_bar, test_labels, N_CLASSES)
f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
metric_values = tuple(float(metric) for metric in (accuracy, precision, recall, f1))
print('accuracy: %f, precision: %f, recall: %f, f1: %f' % metric_values)

accuracy: 0.910200, precision: 0.800068, recall: 0.590111, f1: 0.624683


# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat 1, 2, and 3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

5. Randomly sample $K'$ subsets, $D_1, ..., D_{K'}$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^{K'} \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate

In [5]:
# Size argument passed to my.sample_subset by sample(); it also determines the
# approximator's input width, (N_CLASSES + N_CLASSES) * SAMPLE_SIZE.
SAMPLE_SIZE = 64

def L(classifier, X, y):
    """Evaluate `classifier` on (X, y) with the notebook's target metric.

    Predictions come from my.predict(classifier, X); the returned value is
    my.nd_f_beta of those predictions against `y` over N_CLASSES classes.
    """
    return my.nd_f_beta(my.predict(classifier, X), y, N_CLASSES)

def forward(classifier, pair):
    """Turn one (X, y) subset into a single flat feature row.

    The labels are one-hot encoded via my.onehot, the classifier outputs are
    passed through a softmax over dim 1, the two are concatenated along dim 1
    and the result is reshaped to one row (shape (1, -1)).
    """
    inputs, labels = pair
    targets = my.onehot(labels, N_CLASSES)
    probabilities = F.softmax(classifier(inputs), 1)
    joined = th.cat((targets, probabilities), 1)
    return joined.view(1, -1)
    
def sample(K=10):
    """Draw K subsets of the training data and wrap them as Variables.

    Each subset is taken from the numpy training arrays with
    my.sample_subset(..., SAMPLE_SIZE), moved to the GPU when `cuda` is set,
    and returned as an (X, y) pair of Variables. Returns a list of K pairs.
    """
    batches = []
    for _ in range(K):
        X, y = my.sample_subset(train_data_np, train_labels_np, SAMPLE_SIZE)
        if cuda:
            X, y = X.cuda(), y.cuda()
        batches.append((Variable(X), Variable(y)))
    return batches

In [6]:
# Fresh linear classifier plus the MLP that approximates the metric.
c = nn.Linear(N_FEATURES, N_CLASSES)

# One approximator input row holds, for each of the SAMPLE_SIZE examples,
# N_CLASSES one-hot label entries and N_CLASSES softmax outputs (see forward()).
layer_sizes = (2 * N_CLASSES * SAMPLE_SIZE, 2048, 2048, 2048, 1)
approx = my.MLP(layer_sizes, F.relu)

if cuda:
    for module in (c, approx):
        module.cuda()

# Alternative left disabled: Adam with learning rate 1e-3 for both optimizers.
c_optim = SGD(c.parameters(), lr=0.1, momentum=0.5)
approx_optim = SGD(approx.parameters(), lr=0.1, momentum=0.5)

# Displayed as the cell output: test-set F-score of the untrained classifier.
float(my.nd_f_beta(my.predict(c, test_data), test_labels, N_CLASSES))

0.3951839804649353

In [7]:
STD = 0.05              # initial perturbation scale passed to my.perturb
OUTER = 2000            # number of classifier updates
INNER = 10              # approximator updates per classifier update
STD_DECAY_EVERY = 1000  # shrink the perturbation scale every this many outer steps
STD_DECAY_FACTOR = 5    # ...by dividing it by this factor


def _stack_features(classifier, batches):
    """Concatenate forward(classifier, batch) over all batches along dim 0."""
    return th.cat(tuple(forward(classifier, batch) for batch in batches), 0)


# Decay a working copy rather than STD itself, so that re-running this cell
# starts from the configured value again instead of an already-shrunk one.
std = STD

for i in range(OUTER):
    # Steps 1-4: fit the approximator so that its difference between c and a
    # perturbed copy matches the true difference in L.
    for j in range(INNER):
        c_bar = my.perturb(c, std)
        delta = L(c, train_data, train_labels) - L(c_bar, train_data, train_labels)

        samples = sample()
        y = _stack_features(c, samples)
        y_bar = _stack_features(c_bar, samples)
        delta_ = th.mean(approx(y) - approx(y_bar), 0)

        mse = MSELoss()(delta_, delta)
        approx_optim.zero_grad()
        # This backward pass also leaves gradients on c; they are cleared by
        # c_optim.zero_grad() before the classifier update below.
        mse.backward()
        approx_optim.step()

    # Step 5: update the classifier through the approximator. L is the value of
    # my.nd_f_beta, so the approximated score is negated to make the optimizer
    # step increase it.
    samples = sample()
    y = _stack_features(c, samples)
    objective = -th.mean(approx(y))
    c_optim.zero_grad()
    objective.backward()
    c_optim.step()

    if (i + 1) % 100 == 0:
        y_bar = my.predict(c, test_data)
        f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
        print('[iteration %d]mse: %f, objective: %f, f1: %f' % ((i + 1), float(mse), float(objective), float(f1)))

    if (i + 1) % STD_DECAY_EVERY == 0:
        std /= STD_DECAY_FACTOR

[iteration 100]mse: 0.000003, objective: -0.016467, f1: 0.391504
[iteration 200]mse: 0.000000, objective: -0.024306, f1: 0.504582
[iteration 300]mse: 0.002587, objective: 0.003403, f1: 0.477321
[iteration 400]mse: 0.000620, objective: -0.072644, f1: 0.497943
[iteration 500]mse: 0.003448, objective: -0.080197, f1: 0.494223
[iteration 600]mse: 0.000192, objective: -0.050866, f1: 0.583945
[iteration 700]mse: 0.000013, objective: -0.090536, f1: 0.507501
[iteration 800]mse: 0.003516, objective: -0.067265, f1: 0.497002
[iteration 900]mse: 0.000002, objective: -0.170600, f1: 0.654412
[iteration 1000]mse: 0.000010, objective: -0.121097, f1: 0.542154
[iteration 1100]mse: 0.000033, objective: -0.090863, f1: 0.680587
[iteration 1200]mse: 0.000110, objective: -0.085391, f1: 0.655270
[iteration 1300]mse: 0.000152, objective: -0.096738, f1: 0.662585
[iteration 1400]mse: 0.000002, objective: -0.085892, f1: 0.689449
[iteration 1500]mse: 0.000052, objective: -0.093614, f1: 0.683106
[iteration 1600]mse:

In [8]:
# Test-set metrics for `c` after the training loop in the previous cell.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
precision = my.nd_precision(y_bar, test_labels, N_CLASSES)
recall = my.nd_recall(y_bar, test_labels, N_CLASSES)
f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
report = 'accuracy: %f, precision: %f, recall: %f, f1: %f'
print(report % (float(accuracy), float(precision), float(recall), float(f1)))

accuracy: 0.874900, precision: 0.668193, recall: 0.698056, f1: 0.681171
