In [1]:
from __future__ import print_function
from copy import deepcopy
from threading import Condition, Thread
import numpy as np
import numpy.random as npr
import torch as th
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.modules.loss import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader, TensorDataset
import my

In [2]:
# Load the CIFAR-10 split produced by the project helper.
# NOTE(review): what N_TRAIN = N_TEST = 0 and p=[] select is decided inside
# `my.unbalanced_cifar10`, which is not visible here -- confirm.
N_TRAIN, N_TEST = 0, 0
train_data, train_labels, test_data, test_labels = my.unbalanced_cifar10(N_TRAIN, N_TEST, p=[])

# Keep handles on the NumPy arrays; `sample` and the scikit-learn cross-check
# read these *_np names.
train_data_np, train_labels_np, test_data_np, test_labels_np = \
    train_data, train_labels, test_data, test_labels

# Tensor versions of the same data: float features, long labels.
train_data = th.from_numpy(train_data).float()
train_labels = th.from_numpy(train_labels).long()
test_data = th.from_numpy(test_data).float()
test_labels = th.from_numpy(test_labels).long()

# Use the GPU only when one is actually present, so the notebook also runs on
# CPU-only machines instead of failing in set_device.
GPU_INDEX = 3
cuda = th.cuda.is_available()
if cuda and th.cuda.device_count() > GPU_INDEX:
    # Only pin the preferred device when it exists; otherwise keep the default.
    th.cuda.set_device(GPU_INDEX)

BATCH_SIZE = 64
train_loader = DataLoader(TensorDataset(train_data, train_labels), BATCH_SIZE)
test_loader = DataLoader(TensorDataset(test_data, test_labels), BATCH_SIZE)

# Size of dimension 1 of the training tensor, and the width of the label range.
N_FEATURES = train_data.size()[1]
N_CLASSES = int(train_labels.max() - train_labels.min() + 1)

KeyboardInterrupt: 

In [None]:
class CNN(nn.Module):
    """Small convolutional classifier for 3-channel images.

    Two 2x2, stride-1 convolutions with 16 feature maps each (ReLU after
    both) are followed by global average pooling and a linear layer that
    emits `n_classes` logits.
    """

    def __init__(self, n_classes):
        """Create the layers.

        Args:
            n_classes: number of logits produced per example.
        """
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 2, 1)
        # conv2 consumes conv1's 16 feature maps, so in_channels must be 16;
        # declaring 3 here makes forward() fail with a channel mismatch.
        self.conv2 = nn.Conv2d(16, 16, 2, 1)
        self.linear = nn.Linear(16, n_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: a 4-D (batch, 3, H, W) tensor, or a tensor of any other rank
                that can be viewed as (-1, 3, 32, 32).

        Returns:
            Tensor of shape (batch, n_classes).
        """
        if x.dim() != 4:
            x = x.view(-1, 3, 32, 32)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        # Global average pooling leaves exactly one 16-vector per image. A
        # fixed 8x8 window on the 30x30 maps produced 3x3 outputs, so the
        # old view(-1, 16) returned 9 rows per image instead of 1.
        x = F.adaptive_avg_pool2d(x, 1)
        return self.linear(x.view(x.size(0), -1))

In [11]:
# Supervised baseline: fit the classifier with cross-entropy and Adam.
# c = nn.Linear(N_FEATURES, N_CLASSES)
# c = my.MLP((N_FEATURES,) + (64,) * 3 + (N_CLASSES,), F.relu)
c = CNN(N_CLASSES)
if cuda:
    c.cuda()
optim = Adam(c.parameters(), lr=0.001)
criterion = CrossEntropyLoss()  # built once instead of once per batch
N_ITERATIONS = 1000
LOG_EVERY = 1000
# Count optimizer steps across epochs. The inner loop used to reuse `i`,
# shadowing the epoch counter, so logging was keyed on the in-epoch batch
# index, which never reaches LOG_EVERY when an epoch has fewer batches.
step = 0
for epoch in range(N_ITERATIONS):
    for X, y in train_loader:
        if cuda:
            X, y = X.cuda(), y.cuda()
        X, y = Variable(X), Variable(y)
        loss = criterion(c(X), y)
        step += 1
        if step % LOG_EVERY == 0:
            # NOTE(review): `test_data` is a CPU tensor while `c` may live on
            # the GPU; confirm `my.predict` moves its input to the model's
            # device (the traceback recorded under this cell is a CPU/CUDA
            # type mismatch).
            accuracy = my.accuracy(my.predict(c, test_data), test_labels)
            print('[iteration %d]cross-entropy loss: %f, accuracy: %f' % (step, float(loss), float(accuracy)))
        optim.zero_grad()
        loss.backward()
        optim.step()

False False


RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 'weight'

In [None]:
# Score the current classifier `c` on `test_data` / `test_labels`.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
# The three class-wise metrics share one call signature, so evaluate them in
# the original order: precision, recall, F-beta.
precision, recall, f1 = (
    metric(y_bar, test_labels, N_CLASSES)
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)
print('accuracy: %f, precision: %f, recall: %f, f1: %f'
      % (float(accuracy), float(precision), float(recall), float(f1)))

# Algorithm

Let $c$ be a classifier and $D=\{(X_1, y_1),...,(X_N, y_N)\}$ be the set of training data. In order to minimize $L(c, D)$, where $L$ is a non-decomposable loss function, we introduce $L_\theta$, a parameterized approximation of $L(c, D)$, and update $c$ as follows:

1. Compute $\delta = L(c, D)-L(\bar{c},D)$, where $\bar{c}$ is obtained by stochastically perturbing the parameters of $c$

2. Randomly sample $K$ subsets, $D_1, ..., D_K$, of $D$ (these subsets may vary in cardinality)

3. Minimize $(\delta - \frac1K \sum_{i = 1}^K \delta_i)^2$ with respect to $\theta$, where $\delta_i = L_\theta(c, D_i) - L_\theta(\bar{c}, D_i)$

4. Repeat steps 1–3 several times until $L_\theta$ becomes a satisfactory approximation of $L$ near $c$

5. Randomly sample $K'$ subsets, $D_1, ..., D_{K'}$, of $D$ and let $c \leftarrow c - \alpha \sum_{i = 1}^{K'} \frac{\partial L_\theta}{\partial c} (c, D_i)$, where $\alpha$ is a positive learning rate

In [None]:
# def L(classifier, X, y):
#     y_bar = my.predict(classifier, X)
#     return my.nd_f_beta(y_bar, y, N_CLASSES)

def L(c, loader):
    """Evaluate classifier `c` over `loader` with the curried F-beta statistic.

    Thin wrapper: passes `c`, `loader` and `my.nd_curry(my.nd_f_beta, N_CLASSES)`
    to `my.global_stats` and returns whatever that helper returns.
    """
    return my.global_stats(c, loader, my.nd_curry(my.nd_f_beta, N_CLASSES))

# gpus = [0, 1, 2, 3]
# train_data_dict = {gpu: train_data.cuda(gpu) for gpu in gpus}
# train_labels_dict = {gpu: train_labels.cuda(gpu) for gpu in gpus}

# def L_batch(c, std, n):
#     c_device = next(c.parameters()).get_device()
#     c = deepcopy(c).cpu()
#     c.train(False)
#     c_dict = {gpu: deepcopy(c).cuda(gpu) for gpu in gpus}
#     results = []
#     available_gpus = [gpu for gpu in gpus]
#     condition = Condition()
    
#     def target(gpu):
#         c, train_data, train_labels = c_dict[gpu], train_data_dict[gpu], train_labels_dict[gpu]
#         c_bar = my.perturb(c, std)
        
#         delta = L(c, train_data, train_labels) - L(c_bar, train_data, train_labels) # TODO
#         results.append((c_bar.cuda(c_device), delta.cuda(c_device).detach()))
#         with condition:
#             available_gpus.append(gpu)
#             condition.notify_all()
    
#     threads = []
#     with condition:
#         for i in range(n):
#             if not available_gpus:
#                 condition.wait()
#             gpu = available_gpus.pop()
#             threads.append(Thread(target=target, args=(gpu,)))
#             threads[-1].start()
#     for t in threads:
#         t.join()
#     return results

In [None]:
def forward(classifier, pair):
    """Build one critic input row from a sampled (X, y) pair.

    The labels are one-hot encoded with `my.onehot`, joined along dim 1 with
    the softmax (over dim 1) of `classifier(X)`, and the result is flattened
    into a single row.

    Args:
        classifier: callable applied to the inputs to obtain logits.
        pair: two-element sequence `(X, y)` of inputs and labels.

    Returns:
        Tensor of shape (1, -1) holding the concatenated labels and predictions.
    """
    inputs, labels = pair
    targets = my.onehot(labels, N_CLASSES)
    probabilities = F.softmax(classifier(inputs), 1)
    joined = th.cat((targets, probabilities), 1)
    return joined.view(1, -1)

def sample(sample_size, batch_size):
    """Draw `batch_size` subsets of the training arrays and wrap them as Variables.

    Each subset comes from `my.sample_subset(train_data_np, train_labels_np,
    sample_size)`; when the global `cuda` flag is set, both members of every
    pair are moved to the GPU before wrapping.

    Args:
        sample_size: forwarded to `my.sample_subset` for every draw.
        batch_size: number of subsets to draw.

    Returns:
        List of `(Variable(X), Variable(y))` pairs, one per subset.
    """
    # Draw every subset first so all sampling happens before any device transfer.
    drawn = [my.sample_subset(train_data_np, train_labels_np, sample_size)
             for _ in range(batch_size)]
    wrapped = []
    for X, y in drawn:
        if cuda:
            X, y = X.cuda(), y.cuda()
        wrapped.append((Variable(X), Variable(y)))
    return wrapped

In [None]:
SAMPLE_SIZE = 64

# Seed the CPU generator and all CUDA generators before any model is built.
th.random.manual_seed(1)
th.cuda.manual_seed_all(1)

# Classifier ("actor"); alternatives kept for reference:
# c = nn.Linear(N_FEATURES, N_CLASSES)
# c = my.MLP((N_FEATURES,) + (512,) * 1 + (N_CLASSES,), F.relu)
c = CNN(N_CLASSES)
# Critic consuming SAMPLE_SIZE rows of 2 * N_CLASSES values each
# (one-hot label plus predicted distribution, as produced by `forward`).
# critic = my.MLP(((N_CLASSES +  N_CLASSES) * SAMPLE_SIZE,) + (2048,) * 3 +(1,), F.relu)
critic = my.RN(SAMPLE_SIZE, 2 * N_CLASSES, (512, 512, 512, 1), F.relu)

if cuda:
    c.cuda()
    critic.cuda()

# One Adam optimizer per network, both with learning rate 1e-3.
# c_optim = SGD(c.parameters(), 0.1, momentum=0.5)
# critic_optim = SGD(critic.parameters(), 0.1, momentum=0.5)
c_optim, critic_optim = (Adam(net.parameters(), lr=1e-3) for net in (c, critic))

# Cell output: (precision, recall, f-beta) of the untrained classifier on the
# test tensors, predicting afresh for each metric.
tuple(
    float(metric(my.predict(c, test_data), test_labels, N_CLASSES))
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)

In [None]:
# TODO
from sklearn.metrics import precision_score, recall_score, f1_score
# Cross-check: macro-averaged scikit-learn scores for the same predictions,
# returned as a (precision, recall, f1) tuple.
y_bar = my.predict(c, test_data).data.cpu().numpy()
tuple(
    score(test_labels_np, y_bar, average='macro')
    for score in (precision_score, recall_score, f1_score)
)

In [None]:
STD = 1e-1             # perturbation scale; also the actor's max allowed parameter drift
OUTER = 100            # number of critic-fit / actor-update rounds
INNER_ACTOR = 5        # max actor steps per round
N_PERTURBATIONS = 25   # perturbed classifiers per round
INNER_CRITIC = 5       # passes over the perturbations per round
BATCH_SIZE = 8         # number of sampled subsets per round

mse_criterion = MSELoss()  # built once instead of once per critic step
# Critic MSE after every critic update; the plotting cell reads this list.
mse_list = []

for i in range(OUTER):
    # Freeze the classifier while the critic is being fitted.
    for p in c.parameters():
        p.requires_grad = False
    # `L` takes (classifier, loader); calling it with (c, data, labels) raised
    # a TypeError.
    # NOTE(review): assumes `my.global_stats` expects a DataLoader such as
    # `train_loader` -- confirm against the helper.
    L_c = L(c, train_loader)
    c_bar_list, target_list = [], []
    for j in range(N_PERTURBATIONS):
        # Step 1: true metric gap between c and a perturbed copy.
        c_bar_list.append(my.perturb(c, STD))
        target = L_c - L(c_bar_list[-1], train_loader)
        target_list.append(target[0])

    # Step 2: sample the subsets once; they are reused for the critic and actor.
    s = sample(SAMPLE_SIZE, BATCH_SIZE)
    y = th.cat([forward(c, x) for x in s], 0).detach()
    for j in range(INNER_CRITIC): # TODO mini-batch
        for c_bar, target in zip(c_bar_list, target_list):
            # Step 3: regress the critic's mean gap onto the true gap.
            y_bar = th.cat([forward(c_bar, x) for x in s], 0).detach()
            delta = th.mean(critic(y) - critic(y_bar), 0)
            mse = mse_criterion(delta, target)
            critic_optim.zero_grad()
            mse.backward()
            critic_optim.step()
            mse_list.append(float(mse))

    # Step 5: unfreeze the classifier and ascend the critic's estimate.
    for p in c.parameters():
        p.requires_grad = True
    c_parameters = deepcopy(tuple(c.parameters()))
    for j in range(INNER_ACTOR):
        y = th.cat([forward(c, x) for x in s], 0)
        objective = -th.mean(critic(y))
        c_optim.zero_grad()
        objective.backward()
        c_optim.step()
        # Stop early once any parameter has moved more than STD (max-abs)
        # from its value at the start of this round.
        if any(float(th.max(th.abs(p - q))) > STD for p, q in zip(c_parameters, c.parameters())):
            break

    if (i + 1) % 1 == 0:
        y_bar = my.predict(c, test_data)
        f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
        print('[iteration %d]mse: %f, objective: %f, f1: %f' % ((i + 1), float(mse), float(objective), float(f1)))

In [None]:
# STD = 0.05
# OUTER = 5000
# INNER_ACTOR = 5
# INNER_CRITIC = 20

# mse_list = []
# for i in range(OUTER):
#     targets = L_batch(c, STD, INNER_CRITIC)
#     for j in range(INNER_CRITIC):
#         c_bar, delta = targets[j]
#         samples = sample()
#         y = th.cat(tuple(map(lambda x: forward(c, x), samples)), 0).detach()
#         y_bar = th.cat(tuple(map(lambda x: forward(c_bar, x), samples)), 0).detach()
        
#         mse = 1
#         while float(mse) > 1e-3:
#             delta_ = th.mean(critic(y) - critic(y_bar), 0)
#             mse = MSELoss()(delta_, delta)
#             mse_list.append(mse)
#             critic_optim.zero_grad()
#             mse.backward()
#             critic_optim.step()

#     c_parameters = deepcopy(tuple(c.parameters()))
#     for j in range(INNER_ACTOR):
#         samples = sample()
#         y = th.cat(tuple(map(lambda x: forward(c, x), samples)), 0)
#         objective = -th.mean(critic(y))
#         c_optim.zero_grad()
#         objective.backward()
#         c_optim.step()
#         if any(float(th.max(th.abs(p - q))) > STD for p, q in zip(c_parameters, c.parameters())):
#             break

#     if (i + 1) % 1 == 0:
#         y_bar = my.predict(c, test_data)
#         f1 = my.nd_f_beta(y_bar, test_labels, N_CLASSES)
#         print('[iteration %d]mse: %f, objective: %f, f1: %f' % ((i + 1), float(mse), float(objective), float(f1)))

In [None]:
import matplotlib.pylab as pl
%matplotlib inline
# Plot every recorded MSE value against its position in `mse_list`.
pl.plot(range(len(mse_list)), list(float(value) for value in mse_list))

In [None]:
# Final report: score the trained classifier `c` on `test_data` / `test_labels`.
y_bar = my.predict(c, test_data)
accuracy = my.accuracy(y_bar, test_labels)
# Precision, recall and F-beta take the same arguments; compute them in order.
precision, recall, f1 = (
    metric(y_bar, test_labels, N_CLASSES)
    for metric in (my.nd_precision, my.nd_recall, my.nd_f_beta)
)
print('accuracy: %f, precision: %f, recall: %f, f1: %f'
      % (float(accuracy), float(precision), float(recall), float(f1)))