In [1]:
from __future__ import print_function
import numpy as np
import numpy.random as npr
import torch as th
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.modules.loss import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from torch.optim import SGD, Adam
import my

In [2]:
class MLP(nn.Module):
    """Multi-layer perceptron built from a sequence of layer widths.

    Args:
        D: sequence of layer sizes; ``D[i] -> D[i + 1]`` defines the i-th
            ``nn.Linear``. A sequence of length 1 yields no layers.
        nonlinear: callable applied to the output of every layer except
            the last one.
    """

    def __init__(self, D, nonlinear):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(D[:-1], D[1:])])
        self.nonlinear = nonlinear
        # Not read anywhere in this class; kept so external readers of the
        # attribute keep working.
        self.expose = False

    def forward(self, x):
        """Flatten ``x`` to 2-D if necessary and run it through the layers.

        Inputs that are not 2-D are reshaped to ``(x.size()[0], -1)``.
        The final layer's output is returned without the nonlinearity.
        """
        # `dim` is a method: comparing the bound method itself to 2 (as the
        # code did before) is always True and reshaped every input.
        if x.dim() != 2:
            x = x.view(x.size()[0], -1)
        last = len(self.linears) - 1
        for i, linear in enumerate(self.linears):
            x = linear(x)
            if i < last:
                x = self.nonlinear(x)
        return x

In [3]:
# Sizes forwarded unchanged to the dataset helper.
# NOTE(review): presumably 0 means "use the whole split" -- confirm in `my`.
N_TRAIN, N_TEST = 0, 0
train_data, train_labels, test_data, test_labels = my.unbalanced_dataset(
    'MNIST', N_TRAIN, N_TEST, pca=True, p=[0, 1, 10])

# Disabled experiment: pad the features with Gaussian noise up to 512 columns.
# N, D = train_data.shape
# train_data = np.hstack((train_data, npr.randn(N, 512 - D)))
# N, D = test_data.shape
# test_data = np.hstack((test_data, npr.randn(N, 512 - D)))

# Keep handles on the NumPy arrays; `sample()` draws subsets from these
# rather than from the tensors created below.
train_data_np, train_labels_np, test_data_np, test_labels_np = \
    train_data, train_labels, test_data, test_labels

train_data = th.from_numpy(train_data).float()
train_labels = th.from_numpy(train_labels).long()
test_data = th.from_numpy(test_data).float()
test_labels = th.from_numpy(test_labels).long()

# Preferred GPU index. The notebook used to force `cuda = True` and device 3,
# which fails on hosts without CUDA or with fewer than four GPUs; now CUDA is
# used only when available and the index is clamped to an existing device.
CUDA_DEVICE = 3
cuda = th.cuda.is_available()
if cuda:
    th.cuda.set_device(min(CUDA_DEVICE, th.cuda.device_count() - 1))
    train_data, train_labels = train_data.cuda(), train_labels.cuda()
    test_data, test_labels = test_data.cuda(), test_labels.cuda()

train_data, train_labels, test_data, test_labels = \
    map(Variable, (train_data, train_labels, test_data, test_labels))

# Feature width is the second dimension of the data; the class count is
# taken as the span of label values seen in the training labels.
N_FEATURES = train_data.size()[1]
N_CLASSES = int(train_labels.max() - train_labels.min() + 1)

In [4]:
# c = nn.Linear(N_FEATURES, N_CLASSES)
# if cuda:
#     c.cuda()
# optim = SGD(c.parameters(), lr=0.001)
# N_ITERATIONS = 10000
# for i in range(N_ITERATIONS):
#     loss = CrossEntropyLoss()(c(train_data), train_labels)
#     if (i + 1) % 1000 == 0:
#         accuracy = my.accuracy(my.predict(c, train_data), train_labels)
#         print('[iteration %d]cross-entropy loss: %f, accuracy: %f' % ((i + 1), float(loss), float(accuracy)))
#     optim.zero_grad()
#     loss.backward()
#     optim.step()

In [5]:
# y_bar = my.predict(c, test_data)
# accuracy = my.accuracy(y_bar, test_labels)
# precision = my.nd_precision(y_bar, test_labels, N_CLASSES)
# recall = my.nd_recall(y_bar, test_labels, N_CLASSES)
# f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
# print('accuracy: %f, precision: %f, recall: %f, f1: %f' % tuple(map(float, (accuracy, precision, recall, f1))))

# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat 1, 2, and 3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

5. Randomly sample $K'$ subsets, $D_1, ..., D_{K'}$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^{K'} \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate

In [6]:
# Size argument passed to `my.sample_subset` in `sample()`. It also fixes the
# input width of the `approx` network, (N_CLASSES + N_CLASSES) * SAMPLE_SIZE,
# so changing it requires rebuilding `approx`.
SAMPLE_SIZE = 64

def L(classifier, X, y):
    """Evaluate `classifier` on `(X, y)` with the non-decomposable metric.

    Predictions come from `my.predict`; the returned value is whatever
    `my.nd_f_beta` yields for those predictions, `y` and `N_CLASSES`.
    """
    predictions = my.predict(classifier, X)
    score = my.nd_f_beta(predictions, y, N_CLASSES)
    return score

def forward(classifier, pair):
    """Encode one `(X, y)` subset as a single row for the approximator.

    The one-hot labels (`my.onehot`) and the classifier's softmax outputs
    are concatenated along dimension 1, then flattened into one row via
    `view(1, -1)`.
    """
    inputs, labels = pair
    targets = my.onehot(labels, N_CLASSES)
    probabilities = F.softmax(classifier(inputs), 1)
    per_example = th.cat((targets, probabilities), 1)
    return per_example.view(1, -1)
    
def sample(K=10):
    """Draw `K` subsets of the training data via `my.sample_subset`.

    Each subset is requested with `SAMPLE_SIZE`, moved to the GPU when
    `cuda` is set, and returned as a `(Variable, Variable)` pair.

    Returns:
        list of `K` `(X, y)` pairs.
    """
    subsets = []
    for _ in range(K):
        X, y = my.sample_subset(train_data_np, train_labels_np, SAMPLE_SIZE)
        if cuda:
            X, y = X.cuda(), y.cuda()
        subsets.append((Variable(X), Variable(y)))
    return subsets

In [7]:
# Classifier under training: a single linear layer from features to classes.
c = nn.Linear(N_FEATURES, N_CLASSES)

# Approximator input is one flattened subset: for each of the SAMPLE_SIZE
# examples, N_CLASSES one-hot entries plus N_CLASSES softmax entries.
approx_input_dim = (N_CLASSES + N_CLASSES) * SAMPLE_SIZE
approx_layer_dims = (approx_input_dim,) + (1024,) * 3 + (1,)
approx = MLP(approx_layer_dims, F.relu)

if cuda:
    c.cuda()
    approx.cuda()

c_optim = SGD(c.parameters(), lr=0.1, momentum=0.9)
approx_optim = SGD(approx.parameters(), lr=0.1, momentum=0.9)
# Alternative optimizers (disabled):
# c_optim = Adam(c.parameters(), 1e-3)
# approx_optim = Adam(approx.parameters(), 1e-3)

# Cell output: metric of the untrained classifier on the test set.
float(my.nd_f_beta(my.predict(c, test_data), test_labels, N_CLASSES))

0.4432142376899719

In [8]:
# TODO: DataLoader

STD = 0.1    # perturbation scale handed to my.perturb
OUTER = 500  # number of classifier updates
INNER = 10   # approximator updates before each classifier update


def stack_features(classifier, subsets):
    """Stack one `forward(classifier, subset)` row per sampled subset."""
    return th.cat(tuple(forward(classifier, subset) for subset in subsets), 0)


for i in range(OUTER):
    # `c` is only changed by c_optim.step() at the end of this iteration, so
    # its score on the full training set is constant across the inner loop;
    # compute it once instead of INNER times.
    score = L(c, train_data, train_labels)

    # Fit the approximator so that its difference between `c` and a perturbed
    # copy matches the true metric difference.
    for j in range(INNER):
        c_bar = my.perturb(c, STD)
        delta = score - L(c_bar, train_data, train_labels)

        samples = sample()
        # Only `approx` is optimised here; detaching keeps backward() from
        # propagating into the classifiers, whose gradients are never used
        # in this step (c's are zeroed before its own update).
        y = stack_features(c, samples).detach()
        y_bar = stack_features(c_bar, samples).detach()
        delta_ = th.mean(approx(y) - approx(y_bar), 0)

        mse = MSELoss()(delta_, delta)
        approx_optim.zero_grad()
        mse.backward()
        approx_optim.step()

    # Update the classifier to increase the approximator's output
    # (hence the negated mean as the minimised objective).
    samples = sample()
    y = stack_features(c, samples)
    objective = -th.mean(approx(y))
    c_optim.zero_grad()
    objective.backward()
    c_optim.step()

    if (i + 1) % 100 == 0:
        y_bar = my.predict(c, test_data)
        f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
        print('[iteration %d]mse: %f, objective: %f, f1: %f' % ((i + 1), float(mse), float(objective), float(f1)))

[iteration 100]mse: 0.001892, objective: -0.156537, f1: 0.595296
[iteration 200]mse: 0.001339, objective: -0.339683, f1: 0.800161
[iteration 300]mse: 0.000741, objective: -0.131378, f1: 0.837810
[iteration 400]mse: 0.000001, objective: -0.061991, f1: 0.884278
[iteration 500]mse: 0.000104, objective: 0.114965, f1: 0.883307


In [9]:
# Final test-set report for the trained classifier.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
# The three class-aware metrics share one call signature.
precision, recall, f1 = (
    metric(y_bar, test_labels, N_CLASSES)
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)
report = tuple(float(value) for value in (accuracy, precision, recall, f1))
print('accuracy: %f, precision: %f, recall: %f, f1: %f' % report)

accuracy: 0.964300, precision: 0.958653, recall: 0.832865, f1: 0.883307
