In [1]:
from __future__ import print_function
from copy import deepcopy
import numpy as np
import torch as th
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.modules.loss import CrossEntropyLoss, MSELoss
import torch.functional as F
from torch.optim import SGD
from torchvision import datasets
import my

In [2]:
N_TRAIN, N_TEST = 50000, 10000
# reduce dimension via a random projection
# D = 10
# train_data, train_labels, test_data, test_labels = my.unbalanced_mnist(N_TRAIN, N_TEST, D=D)
# reduce dimension via PCA
train_data, train_labels, test_data, test_labels = my.unbalanced_mnist(N_TRAIN, N_TEST, pca=True)
D = train_data.size()[1]

In [3]:
def predict(classifier, X):
    return th.max(classifier(X), 1)[1]

def accuracy(y_bar, y):
    return th.sum(((y_bar - y) == 0).float()) / float(y.size()[0])

In [4]:
# a c optimized by cross-entropy loss
c = nn.Linear(D, 2)
optim = SGD(c.parameters(), lr=0.001)
N_ITERATIONS = 10000
for i in range(N_ITERATIONS):
    loss = CrossEntropyLoss(size_average=True)(c(train_data), train_labels)
    if (i + 1) % 1000 == 0:
        a = accuracy(predict(c, train_data), train_labels)
        print('[iteration %d]cross-entropy loss: %f, accuracy: %f' % ((i + 1), float(loss[0]), float(a[0])))
    optim.zero_grad()
    loss.backward()
    optim.step()

[iteration 1000]cross-entropy loss: 0.468184, accuracy: 0.864980
[iteration 2000]cross-entropy loss: 0.315142, accuracy: 0.966840
[iteration 3000]cross-entropy loss: 0.241794, accuracy: 0.978700
[iteration 4000]cross-entropy loss: 0.200385, accuracy: 0.980720
[iteration 5000]cross-entropy loss: 0.174001, accuracy: 0.981340
[iteration 6000]cross-entropy loss: 0.155707, accuracy: 0.981600
[iteration 7000]cross-entropy loss: 0.142242, accuracy: 0.981780
[iteration 8000]cross-entropy loss: 0.131885, accuracy: 0.981900
[iteration 9000]cross-entropy loss: 0.123646, accuracy: 0.982220
[iteration 10000]cross-entropy loss: 0.116921, accuracy: 0.982420


In [5]:
def tp(y_bar, y): # true positive
    return th.sum((y_bar * y).float())

def fp(y_bar, y): # false positive
    return th.sum((y_bar * (1 - y)).float())

def fn(y_bar, y): # false negative
    return th.sum(((1 - y_bar) * y).float())

def precision(y_bar, y):
    tp_, fp_ = tp(y_bar, y), fp(y_bar, y)
    # TODO
    return tp_ / (tp_ + fp_ + 1)

def recall(y_bar, y):
    tp_, fn_ = tp(y_bar, y), fn(y_bar, y)
    return tp_ / (tp_ + fn_ + 1)

def f_beta(y_bar, y, beta=1):
    p, r = precision(y_bar, y), recall(y_bar, y)
    return (1 + beta ** 2) * p * r / (beta ** 2 * p + r + 1)

In [6]:
# baseline performance measures
y_bar = predict(classifier, test_data)
accuracy_ = accuracy(y_bar, test_labels)
precision_ = precision(y_bar, test_labels)
recall_ = recall(y_bar, test_labels)
f1 = f_beta(y_bar, test_labels)
print('accuracy: %f, precision: %f, recall: %f, f1: %f' % tuple(map(float, (accuracy_, precision_, recall_, f1))))

accuracy: 0.983500, precision: 0.951219, recall: 0.874618, f1: 0.588819


In [7]:
SAMPLE_SIZE = 64
STD = 1

def sample(X, y):
    X, y = X.data.numpy(), y.data.numpy()
    idx = np.random.randint(0, len(X) - 1, SAMPLE_SIZE)
    X, y = Variable(th.from_numpy(X[idx])), Variable(th.from_numpy(y[idx]))
    return X, y

def data(classifier, X):
    y = classifier(X)
    Xy = th.cat((X, y), 1)
    Xy = Xy.view(1, Xy.numel())
    W, b = classifier.weight, classifier.bias
    W, b = W.view(1, W.numel()), b.view(1, b.numel())
    return th.cat((Xy, W, b), 1)

def L(classifier, X, y):
    y_bar = predict(classifier, X)
    f1 = f_beta(y_bar, y)
    return f1

def perturb(classifier):
    perturbed = deepcopy(classifier)
    perturbed.weight.data += th.randn(perturbed.weight.data.size()) * STD
    perturbed.bias.data += th.randn(perturbed.bias.data.size()) * STD
    return perturbed

# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat 1, 2, and 3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

5. Randomly sample $K'$ subsets, $D_1, ..., D_K'$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^K \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate

In [8]:
c = nn.Linear(D, 2) # the classifier
approx = nn.Sequential(
    nn.Linear((D + 2) * SAMPLE_SIZE + D * 2 + 2, 256),
    nn.ReLU(),
    nn.Linear(256, 1)
) # L_\theta
c_optim = SGD(c.parameters(), 0.01)
approx_optim = SGD(approx.parameters(), 0.01)

In [9]:
OUTER = 50000
INNER = 10
K = 10

for i in range(OUTER):
    total_mse = 0
    total_delta, total_delta_ = 0, 0
    for j in range(INNER):
        c_bar = perturb(c)
        delta = L(c, train_data, train_labels) - L(c_bar, train_data, train_labels) # \delta = L(c, D)-L(\bar{c},D)

        samples = [sample(train_data, train_labels) for _ in range(K)] # D_1, ..., D_K
        c_d = th.cat(map(lambda X: data(c, X), zip(*samples)[0]), 0) # (c, D_1), ..., (c, D_K)
        c_bar_d = th.cat(map(lambda X: data(c_bar, X), zip(*samples)[0]), 0) # (c_bar, D_1), ..., (c_bar, D_K)
        # \frac1K \sum_{i = 1}^K \delta_i, where \delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)
        delta_ = th.mean(approx(c_d) - approx(c_bar_d), 0)
        
        total_delta += abs(float(delta))
        total_delta_ += abs(float(delta_))

        # \arg \min_\theta (\delta - \frac1K \sum_{i = 1}^K \delta_i)^2
        mse = MSELoss()(delta_, delta)
        approx_optim.zero_grad()
        mse.backward()
        approx_optim.step()
        total_mse += float(mse)
    
#     if (i + 1) % 100 == 0:
#         print('[iteration %d]mse: %f, delta: %f, delta_: %f' % (
#             (i + 1), total_mse / (j + 1), total_delta / (j + 1), total_delta_ / (j + 1)))
        
    samples = [sample(train_data, train_labels) for _ in range(K)] # D_1, ..., D_K
    c_d = th.cat(map(lambda X: data(c, X), zip(*samples)[0]), 0) # (c, D_1), ..., (c, D_K)
    # \arg \min_c \frac1K \sum_{i = 1}^K L_\theta (c, D_i)
    objective = -th.mean(approx(c_d))
    c_optim.zero_grad()
    objective.backward()
    c_optim.step()
    
    if (i + 1) % 1000 == 0:
        y_bar = predict(c, test_data)
        f1 = f_beta(y_bar, test_labels)
        print('[iteration %d]objective: %f, f1: %f' % ((i + 1), float(objective), float(f1)))

[iteration 1000]objective: 0.013376, f1: 0.153235
[iteration 2000]objective: 0.021679, f1: 0.176113
[iteration 3000]objective: 0.046296, f1: 0.196153
[iteration 4000]objective: 0.025103, f1: 0.213899
[iteration 5000]objective: 0.026911, f1: 0.226312
[iteration 6000]objective: 0.026471, f1: 0.243124
[iteration 7000]objective: 0.036381, f1: 0.261363
[iteration 8000]objective: 0.019537, f1: 0.277310
[iteration 9000]objective: 0.058926, f1: 0.295101
[iteration 10000]objective: 0.065837, f1: 0.313041
[iteration 11000]objective: 0.040371, f1: 0.331998
[iteration 12000]objective: 0.083660, f1: 0.354511
[iteration 13000]objective: 0.076736, f1: 0.374691
[iteration 14000]objective: 0.074458, f1: 0.395334
[iteration 15000]objective: 0.107419, f1: 0.409446
[iteration 16000]objective: 0.075362, f1: 0.426654
[iteration 17000]objective: 0.026898, f1: 0.443105
[iteration 18000]objective: 0.088195, f1: 0.459822
[iteration 19000]objective: 0.043067, f1: 0.477762
[iteration 20000]objective: 0.094444, f1

In [10]:
y_bar = predict(c, test_data)
accuracy_ = accuracy(y_bar, test_labels)
precision_ = precision(y_bar, test_labels)
recall_ = recall(y_bar, test_labels)
f1 = f_beta(y_bar, test_labels)
print('accuracy: %f, precision: %f, recall: %f, f1: %f' % tuple(map(float, (accuracy_, precision_, recall_, f1))))

accuracy: 0.984000, precision: 0.917431, recall: 0.917431, f1: 0.593807
