In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import tqdm

from scipy.misc import imresize
import sys

In [None]:
import matplotlib.pyplot as plt

In [None]:
from torchvision import transforms

In [None]:
# Make 32-bit float CPU tensors the default type for newly created tensors.
# Models and data are converted to float64 explicitly (`.double()`) where needed.
torch.set_default_tensor_type('torch.FloatTensor')

In [None]:
from mnist_data import load_dataset

In [None]:
def onehot_labels(a, num_classes=None):
    """One-hot encode a vector of integer class labels.

    Args:
        a: 1-D array-like of non-negative integer class indices.
        num_classes: width of the encoding. Defaults to ``max(a) + 1``.
            Pass it explicitly when several splits are encoded separately,
            so that a split missing the highest class still gets a matrix
            of the same width as the others.

    Returns:
        Float ``numpy.ndarray`` of shape ``(len(a), num_classes)`` holding a
        single 1 in every row. Empty input yields an array with zero rows.
    """
    labels = np.asarray(a).ravel()
    if labels.size == 0:
        # ``max()`` of an empty array raises; return an empty encoding instead.
        return np.zeros((0, num_classes or 0))
    if num_classes is None:
        num_classes = int(labels.max()) + 1
    encoded = np.zeros((labels.size, num_classes))
    encoded[np.arange(labels.size), labels] = 1
    return encoded

In [None]:
class myDataSet(Dataset):
    """Map-style dataset that pairs every object with its label.

    Indexing returns the tuple ``(objects[idx], labels[idx])``; when a
    ``transform`` was supplied, the tuple is passed through it first.
    """

    def __init__(self, objects, labels, transform=None):
        # Both containers must describe the same number of samples.
        assert len(objects) == len(labels)
        self.X, self.y = objects, labels
        self.transform = transform
        self.len = len(objects)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        pair = (self.X[idx], self.y[idx])
        return self.transform(pair) if self.transform else pair

In [None]:
class Downsample(object):
    """Transform that resizes the image of an ``(image, label)`` sample.

    ``p_down`` is handed to ``imresize`` as its size argument (the notebook
    uses 0.25). The label is returned untouched.

    NOTE(review): ``scipy.misc.imresize`` was removed in SciPy 1.3, so this
    transform needs an older SciPy -- confirm the pinned environment.
    """

    def __init__(self, p_down):
        self.p_down = p_down

    def __call__(self, sample):
        flat_image, label = sample
        # Samples are stored flattened; restore the 28x28 grid before resizing.
        grid = flat_image.reshape(28, 28)
        resized = imresize(grid, self.p_down, mode='F')
        # Flatten again so the result can feed a fully connected network.
        return resized.ravel(), label

In [None]:
class ToTensor(object):
    """Turn the two ndarrays of an ``(image, label)`` sample into float64 tensors."""

    def __call__(self, sample):
        image, label = sample
        image_tensor = torch.from_numpy(image).double()
        label_tensor = torch.from_numpy(label).double()
        return image_tensor, label_tensor

In [None]:
# Load the splits from the local ``mnist_data`` helper.
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

# Flatten every sample to a 1-D feature vector (one row per sample).
X_train = X_train.reshape((X_train.shape[0], -1))
X_test = X_test.reshape((X_test.shape[0], -1))
X_val = X_val.reshape((X_val.shape[0], -1))

# Integer class labels -> one-hot rows.
y_train = onehot_labels(y_train)
y_test = onehot_labels(y_test)
y_val = onehot_labels(y_val)

# Shuffle the training split once; later cells take the first N rows of it.
shuffled_train_indices = np.arange(X_train.shape[0])
np.random.shuffle(shuffled_train_indices)
X_train = X_train[shuffled_train_indices]
y_train = y_train[shuffled_train_indices]


In [None]:
# Full-resolution test and validation sets; the only transform is the
# conversion of each (image, label) pair to float64 tensors.
data_test = myDataSet(X_test, y_test, transform=transforms.Compose([ToTensor()]))
data_val = myDataSet(X_val, y_val, transform=transforms.Compose([ToTensor()]))

In [None]:
data_val[0][0].shape

In [None]:
def get_predictions_on_dataset(model, dataset, batch_size=None, compute_accuracy=False,
                               num_output=None, num_workers=2):
    """Run ``model`` over ``dataset`` and collect its raw outputs and the true labels.

    Args:
        model: callable mapping a batch tensor to an output tensor, or to a
            sequence of tensors when ``num_output`` is given.
        dataset: map-style dataset yielding ``(features, one_hot_label)`` pairs.
        batch_size: evaluation batch size; ``None`` evaluates the whole
            dataset as a single batch.
        compute_accuracy: when true, also return the fraction of samples whose
            arg-max prediction equals the arg-max of the label.
        num_output: index of the output to use when the model returns several;
            ``None`` uses the model output as is.
        num_workers: number of ``DataLoader`` worker processes (2 as before;
            0 loads in the calling process).

    Returns:
        ``(raw_predictions, labels)`` or, with ``compute_accuracy``,
        ``(raw_predictions, labels, accuracy)``. ``raw_predictions`` has one
        row per sample and as many columns as the one-hot labels; ``labels``
        holds the class index of every sample.
    """
    def _to_numpy(tensor):
        # A tensor on the GPU cannot be handed to NumPy directly: copy it to
        # host memory first (detach is a no-op under no_grad, kept for safety).
        return tensor.detach().cpu().numpy()

    if batch_size is None:
        batch_size = len(dataset)

    # Send batches to the device the model lives on instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Parameter-free callables: keep the historical preference for the GPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)
    raw_predictions = np.zeros((len(dataset), dataset[0][1].shape[0]))
    all_predictions = np.zeros(len(dataset))
    all_labels = np.zeros_like(all_predictions)

    with torch.no_grad():
        cur_start = 0
        for batch, labels in data_loader:
            output = model(batch.to(device))
            if num_output is not None:
                output = output[num_output]
            batch_raw = _to_numpy(output)
            # Use the actual batch length so a short final batch lines up.
            cur_end = cur_start + len(batch_raw)
            raw_predictions[cur_start:cur_end] = batch_raw
            all_predictions[cur_start:cur_end] = np.argmax(batch_raw, axis=1)
            all_labels[cur_start:cur_end] = np.argmax(_to_numpy(labels), axis=1)
            cur_start = cur_end

    if compute_accuracy:
        val_acc = np.mean(all_labels == all_predictions, dtype='float')
        return raw_predictions, all_labels, val_acc
    return raw_predictions, all_labels


In [None]:
def cross_entropy(_input, target, size_average=True):
    """Cross entropy between logits and (possibly soft) target distributions.

    ``_input`` is treated as unnormalised scores: a log-softmax over the last
    dimension is applied here, so callers should pass logits rather than
    probabilities.

    Args:
        _input: tensor of logits with the classes along the last dimension.
        target: tensor of target probabilities with the same shape as
            ``_input``; rows may be one-hot or soft.
        size_average: if true the mean over samples is returned, otherwise
            the sum.

    Returns:
        Scalar tensor holding the loss.

    Examples::

        _input = torch.tensor([[1.1, 2.8, 1.3], [1.1, 2.1, 4.8]], requires_grad=True)
        target = torch.tensor([[0.05, 0.9, 0.05], [0.05, 0.05, 0.9]])
        loss = cross_entropy(_input, target)
        loss.backward()
    """
    log_probs = torch.log_softmax(_input, dim=-1)
    # Reduce over the same (last) dimension the softmax normalised over.
    per_sample = torch.sum(-target * log_probs, dim=-1)
    return torch.mean(per_sample) if size_average else torch.sum(per_sample)

In [None]:
def MLP(d, m, q):
    """Build a two-hidden-layer perceptron with its optimiser and loss.

    Args:
        d: number of input features.
        m: width of both hidden layers.
        q: number of outputs.

    Returns:
        ``(model, opt, loss)``: the ``Sequential`` network, an RMSprop
        optimiser over its parameters (lr 1e-3) and ``cross_entropy``.

    NOTE(review): the network ends in a Softmax while ``cross_entropy``
    applies a log-softmax to its input -- confirm the double normalisation
    is intended.
    """
    model = torch.nn.Sequential()
    # Layers are created in d1, d2, d3 order so random initialisation is unchanged.
    for name, layer in (
        ('d1', torch.nn.Linear(d, m)),
        ('a1', torch.nn.ReLU()),
        ('d2', torch.nn.Linear(m, m)),
        ('a2', torch.nn.ReLU()),
        ('d3', torch.nn.Linear(m, q)),
        ('a3', torch.nn.Softmax(-1)),
    ):
        model.add_module(name, layer)

    opt = torch.optim.RMSprop(model.parameters(), lr=1e-3)
    return (model, opt, cross_entropy)


In [None]:
class TwoOutputsNN(torch.nn.Module):
    """Two-hidden-layer perceptron exposing both probabilities and logits.

    ``forward`` returns ``(softmax(logits), logits)`` so that callers can use
    the probabilities for classification and the raw logits for
    temperature-softened targets.
    """

    def __init__(self, d, m, q):
        super().__init__()
        self._lin1 = torch.nn.Linear(d, m)
        self._act1 = torch.nn.ReLU()
        self._lin2 = torch.nn.Linear(m, m)
        self._act2 = torch.nn.ReLU()
        self._lin3 = torch.nn.Linear(m, q)
        self._out_softmax = torch.nn.Softmax(-1)

        # Order in which the layers are applied to produce the logits.
        self._queue = [self._lin1, self._act1, self._lin2, self._act2, self._lin3]

    def forward(self, x):
        logits = x
        for step in self._queue:
            logits = step(logits)
        return self._out_softmax(logits), logits
    
def get_teacher(d, m, q):
    """Create the teacher network together with its optimiser and loss.

    Returns:
        ``(model, opt, loss)``: a ``TwoOutputsNN(d, m, q)``, an RMSprop
        optimiser over its parameters (lr 1e-3) and ``cross_entropy``.
    """
    teacher = TwoOutputsNN(d, m, q)
    optimizer = torch.optim.RMSprop(teacher.parameters(), lr=1e-3)
    return (teacher, optimizer, cross_entropy)


In [None]:
def weighted_loss(base_loss,l):
    """Return a loss that equals ``base_loss`` scaled by the constant ``l``."""
    def loss_function(y_true, y_pred):
        unweighted = base_loss(y_true, y_pred)
        return l * unweighted
    return loss_function


In [None]:
def Dist(d, m, q, L):
    """Build a student network, its optimiser and the two distillation losses.

    Args:
        d: number of input features.
        m: width of both hidden layers.
        q: number of outputs.
        L: imitation weight. It is not used while building the objects; the
            returned loss functions take the weight as their own argument.

    Returns:
        ``(model, opt, hard_loss, soft_loss)`` where
        ``hard_loss(_input, target, L)`` is ``(1 - L) * cross_entropy`` and
        ``soft_loss(_input, target, L)`` is ``L * cross_entropy``.

    NOTE(review): the network ends in a Softmax while ``cross_entropy``
    applies a log-softmax to its input -- confirm the double normalisation
    is intended.
    """
    def _hard_loss(_input, target, L):
        weight = 1. - L
        return weight * cross_entropy(_input, target)

    def _soft_loss(_input, target, L):
        return L * cross_entropy(_input, target)

    model = torch.nn.Sequential()
    # Layers are created in d1, d2, d3 order so random initialisation is unchanged.
    for name, layer in (
        ('d1', torch.nn.Linear(d, m)),
        ('a1', torch.nn.ReLU()),
        ('d2', torch.nn.Linear(m, m)),
        ('a2', torch.nn.ReLU()),
        ('d3', torch.nn.Linear(m, q)),
        ('a3', torch.nn.Softmax(-1)),
    ):
        model.add_module(name, layer)

    opt = torch.optim.RMSprop(model.parameters(), lr=1e-3)
    return (model, opt, _hard_loss, _soft_loss)


In [None]:
# Input dimensionality: number of features in one flattened training sample.
d = X_train[0].reshape(-1).shape[0]
# Hidden-layer width passed to the model builders.
m = 20
# Number of classes, taken from the width of the one-hot label matrix.
q = y_train.shape[1]

In [None]:
# Build the teacher, switch its parameters to float64 (the datasets yield
# float64 tensors) and move it to the GPU.
model, opt, loss = get_teacher(d, m, q)
model = model.double()
model = model.cuda()

In [None]:
# Number of training samples used (the first N rows of the shuffled training split).
N = 500

In [None]:
# Train the teacher on the first N full-resolution training samples.
train_size = N
data_train = myDataSet(X_train[:N], y_train[:N], transform=transforms.Compose([ToTensor()]))

dataloader_train = DataLoader(data_train,
                              batch_size=4,
                              shuffle=True,
                              num_workers=1)

_t = tqdm.tnrange(200, desc='epoch')
for epoch in _t:
    sum_loss = np.zeros(len(dataloader_train), dtype='float64')
    for idx, (batch, label) in enumerate(tqdm.tqdm_notebook(dataloader_train, leave=False)):
        batch = batch.cuda()
        label = label.cuda()
        # PyTorch accumulates gradients, so clear them before every step.
        model.zero_grad()

        # `loss` (cross_entropy) applies log-softmax itself, so it must be fed
        # the logits (second output), not the already-normalised probabilities.
        predictions = model(batch)[1]
        loss_value = loss(predictions, label)
        loss_value.backward()
        opt.step()
        # .item() extracts a Python float; the tensor itself lives on the GPU
        # and tracks gradients, so it cannot be stored in a NumPy array.
        sum_loss[idx] = loss_value.item()
    # Validation is comparatively slow: refresh it every 10th epoch only and
    # keep showing the latest value in between.
    if not epoch % 10:
        val_acc = get_predictions_on_dataset(model, data_val, compute_accuracy=True, num_output=0)[-1]
    _t.set_postfix(sum_loss=sum_loss.mean(), val_acc=val_acc)

In [None]:
model

In [None]:

# Evaluate the teacher's first output (softmax probabilities) on the test set.
# `a` is the tuple (raw_predictions, labels, accuracy).
a = get_predictions_on_dataset(model, data_test, compute_accuracy=True, num_output=0)

In [None]:
# Keep the teacher's test accuracy (last element of `a`); it is stored in the
# `to_save` dictionary further down.
model_test_acc = a[-1]
a[-1]

In [None]:
# Same evaluation on the training subset, but using the teacher's second
# output (the logits). `b` is (raw_predictions, labels, accuracy).
b = get_predictions_on_dataset(model, data_train, compute_accuracy=True, num_output=1)

In [None]:
b[-1]

In [None]:
def show_probs_hist(probs):
    """Overlay bar charts of several probability vectors in one figure.

    Args:
        probs: sequence of 1-D probability vectors. Every vector is drawn
            with opacity ``1 / len(probs)`` so overlapping bars stay visible.
            An empty sequence produces an empty figure.
    """
    probs = list(probs)
    plt.figure(figsize=(8, 6))
    if probs:
        bar_alpha = 1. / len(probs)
        for prob in probs:
            plt.bar(np.arange(len(prob)), prob, alpha=bar_alpha)
        # One tick per class, sized by the last vector drawn (as before).
        plt.xticks(np.arange(len(probs[-1])))
    plt.ylim((0., 1.))

    plt.show()

In [None]:
# for T in [1,2,5,10,20,50]:
#     _ls = ['dashed', 'dotted']
#     _fc = [(0, 0, 1, 0.5), (1, 0, 0, 0.3)]
#     f, axarr = plt.subplots(3, 3, figsize=(16,10))
#     axarr[0, 1].set_title('T = {}'.format(T))
#     for _i in range(9):
#         labels_soften = F.softmax(torch.Tensor(b[0][_i]/T), -1)
#         probs = [labels_soften, data_train[_i][1]]
#         ix, iy = int(int(_i)/3), (_i % 3)

#         for idx, prob in enumerate(probs):
#             axarr[ix, iy].bar(np.arange(len(prob)),prob, edgecolor='black', lw=1., ls=_ls[idx], fc=_fc[idx])



In [None]:
F.softmax(torch.Tensor(b[:3][0]/10), -1)

In [None]:
# Forget the run counter left over from an earlier run. A plain `del` raises
# NameError on a fresh kernel, which breaks "Restart & Run All".
globals().pop('num_try', None)

In [None]:
batch_size = 50


def _downsampled_dataset(objects, labels):
    """Wrap ``(objects, labels)`` in a dataset that downsamples each image
    with ``Downsample(0.25)`` and converts the sample to tensors."""
    return myDataSet(objects, labels,
                     transform=transforms.Compose([Downsample(0.25), ToTensor()]))


# Student inputs: the same samples as the teacher's, at reduced resolution.
transformed_data_train = _downsampled_dataset(X_train[:N], y_train[:N])
# shuffle=False is required: the student loops slice the teacher's soft labels
# by running offset, which only lines up when batches arrive in dataset order.
transformed_dataloader_train = DataLoader(transformed_data_train,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=1)

transformed_data_val = _downsampled_dataset(X_val, y_val)
transformed_data_test = _downsampled_dataset(X_test, y_test)

# NOTE: `d` is rebound here to the student's (downsampled) input size.
d = transformed_data_train[0][0].shape[0]

# Teacher logits (second output) for the full-resolution training samples,
# in dataset order.
model_predictions = get_predictions_on_dataset(model, data_train, batch_size, num_output=1)[0]
# from_numpy keeps the float64 values as they are; `torch.Tensor(array)` would
# first round them to float32.
model_predictions = torch.from_numpy(model_predictions).double()

In [None]:
# Everything a later session needs to rerun the student experiments without
# retraining the teacher: datasets, the training loader, the teacher itself,
# its logits on the training subset and its test accuracy.
to_save = {
    'transformed_data_train': transformed_data_train,
    'transformed_dataloader_train': transformed_dataloader_train,
    'transformed_data_val': transformed_data_val,
    'transformed_data_test': transformed_data_test,
    'model_predictions': model_predictions,
    'model': model,
    'data_test': data_test,
    'data_val': data_val,
    'model_test_acc': model_test_acc
}

In [None]:
# Serialise the bundle; the file name carries the training-set size N.
# NOTE(review): whole objects (not state dicts) are saved, so loading the file
# presumably needs the notebook's class definitions available -- confirm.
torch.save(to_save, 'saved_objects_mnist_{}_2.pcl'.format(N))

In [None]:
# Grid search over temperature T and imitation weight L, repeated 10 times.
# Every repetition writes one line "[N, T, L, test_accuracy]" per configuration.
for num_try in tqdm.tnrange(0, 10):
    # `with` closes the log even when training is interrupted or fails midway.
    with open('torch_version_logs/new_log_mnist_{}.txt'.format(num_try), 'w') as iofile:
        for T in tqdm.tqdm_notebook([1,2,5,10,20,50], desc='t loop', leave=False):
            # Teacher logits softened with temperature T. They depend on T
            # only, so they are computed once per temperature.
            labels_soften = F.softmax(model_predictions/T, -1)

            for L in tqdm.tqdm_notebook([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], desc='l loop', leave=False):
                student, student_opt, hard_loss, soft_loss = Dist(d, m, q, L)
                student.double()
                student.cuda()

                _t = tqdm.tnrange(151, leave=False)
                for epoch in _t:
                    # Offset of the current batch inside `labels_soften`; valid
                    # because the training loader does not shuffle.
                    cur_start = 0
                    loss_hist = []
                    for batch, label in tqdm.tqdm_notebook(transformed_dataloader_train, leave=False):
                        batch = batch.cuda()
                        label = label.cuda()
                        # PyTorch accumulates gradients, so clear them before every step.
                        student.zero_grad()

                        predictions = student(batch)
                        hard_loss_value = cross_entropy(predictions, label)
                        soft_loss_value = cross_entropy(predictions, labels_soften[cur_start:cur_start+batch_size].cuda())

                        total_loss = (1.-L)*hard_loss_value + L*soft_loss_value
                        total_loss.backward()
                        # .item() copies the scalar to the host; np.array() on
                        # a CUDA tensor raises.
                        loss_hist.append(total_loss.item())

                        student_opt.step()
                        cur_start += batch_size

                    # No validation pass runs in this loop, so only the loss is
                    # shown (the old postfix displayed a stale `val_acc` left
                    # over from another cell).
                    _t.set_postfix(mean_loss=np.mean(loss_hist[-50:]))

                acc_student = get_predictions_on_dataset(student, transformed_data_test, compute_accuracy=True)[-1]
                # float() keeps the line a plain list of numbers whatever
                # repr NumPy uses for its scalars.
                iofile.write(str([N, T, L, float(acc_student)])+'\n')

In [None]:
# Build 24 space-separated parameter strings "<1|2> <N> <try> <suffix>":
# twelve tries for N=500 followed by twelve tries for N=300, with the first
# field alternating between 1 and 2.
suffix = '200'
N_grid = ['500'] * 12 + ['300'] * 12
num_try_grid = list(map(str, range(12))) * 2
parameters = [
    ' '.join([str(position % 2 + 1), n_images, attempt, suffix])
    for position, (n_images, attempt) in enumerate(zip(N_grid, num_try_grid))
]
parameters

In [None]:
# Compare two configurations head to head, 10 students each:
# (T=1, L=0) uses hard labels only, (T=5, L=0.5) mixes hard and soft labels.
result_dict = dict()
for T, L in [(1., 0.), (5., 0.5)]:
    result_dict[(T, L)] = []
    # Softened teacher targets depend on T only: compute them once per setting.
    labels_soften = F.softmax(model_predictions/T, -1)
    for _i in tqdm.tnrange(10):
        student, student_opt, hard_loss, soft_loss = Dist(d, m, q, L)
        student.double()
        student.cuda()

        _t = tqdm.tnrange(151, leave=False)
        for epoch in _t:
            # Offset of the current batch inside `labels_soften`; valid
            # because the training loader does not shuffle.
            cur_start = 0
            loss_hist = []
            for batch, label in tqdm.tqdm_notebook(transformed_dataloader_train, leave=False):
                batch = batch.cuda()
                label = label.cuda()
                # PyTorch accumulates gradients, so clear them before every step.
                student.zero_grad()

                predictions = student(batch)
                hard_loss_value = cross_entropy(predictions, label)
                soft_loss_value = cross_entropy(predictions, labels_soften[cur_start:cur_start+batch_size].cuda())

                total_loss = (1.-L)*hard_loss_value + L*soft_loss_value
                total_loss.backward()
                # .item() copies the scalar to the host; np.array() on a CUDA
                # tensor raises.
                loss_hist.append(total_loss.item())

                student_opt.step()
                cur_start += batch_size

            # Validation is refreshed every 25th epoch; the latest value is
            # shown in between.
            if epoch % 25 == 0:
                val_acc = get_predictions_on_dataset(student, transformed_data_val, compute_accuracy=True)[-1]
            _t.set_postfix(val_acc=val_acc, mean_loss = np.mean(loss_hist[-50:]))

        acc_student = get_predictions_on_dataset(student, transformed_data_test, compute_accuracy=True)[-1]
        result_dict[(T, L)].append(acc_student)

In [None]:
1+1

In [None]:
list(map(np.mean, result_dict.values()))

In [None]:
# Mean test accuracy per (T, L) configuration.
for key, val in result_dict.items():
    print(key, np.mean(val))

In [None]:
for key, val in result_dict.items():
    print(key, np.mean(val))

In [None]:
result_dict

In [None]:
acc_student

In [None]:
acc_student

In [None]:
acc_student

In [None]:
get_predictions_on_dataset(model, data_test, compute_accuracy=True, num_output=0)[-1]

In [None]:
# Baseline: one student trained on the hard labels only (no teacher signal).
# The imitation weight is irrelevant here, so 0 is passed explicitly instead
# of whatever `L` an earlier cell left behind.
student, student_opt, hard_loss, soft_loss = Dist(d, m, q, 0.)
student.double()
student.cuda()


_t = tqdm.tnrange(151, leave=False)
for epoch in _t:
    loss_hist = []
    for batch, label in tqdm.tqdm_notebook(transformed_dataloader_train, leave=False):
        batch = batch.cuda()
        label = label.cuda()
        # PyTorch accumulates gradients, so clear them before every step.
        student.zero_grad()

        predictions = student(batch)
        hard_loss_value = cross_entropy(predictions, label)
        hard_loss_value.backward()
        # Record the loss so the progress bar shows a number instead of NaN.
        loss_hist.append(hard_loss_value.item())

        student_opt.step()

    # Validation is refreshed every 25th epoch; the latest value is shown in between.
    if epoch % 25 == 0:
        val_acc = get_predictions_on_dataset(student, transformed_data_val, compute_accuracy=True)[-1]
    _t.set_postfix(val_acc=val_acc, mean_loss = np.mean(loss_hist[-50:]))

acc_student = get_predictions_on_dataset(student, transformed_data_test, compute_accuracy=True)[-1]
# File the run under its own key. Appending to the leftover (T, L) entry would
# mix a hard-label-only run into a distillation configuration.
result_dict.setdefault((None, 0.), []).append(acc_student)

In [None]:
# NOTE(review): this reads `logs` before any visible cell defines it, so it
# raises NameError on a fresh kernel; the value is also overwritten by the
# `logs = np.array(...)` cell further down. Looks like a leftover -- confirm.
logs = np.zeros((len(logs), 4), dtype=float)

In [None]:
# Raw string: `\d` and `\.` are not valid string escapes, so the plain literal
# triggers an invalid-escape warning. The resulting text is unchanged.
# NOTE(review): no visible cell uses `pattern` -- presumably an abandoned
# attempt at parsing log lines; confirm before removing.
pattern = r'[\d+, \d+\.?\d+, \d+. \d ]'

In [None]:
def _parse_logs(log_str):
    log_str = log_str.replace('[', '').replace(']', '').replace('\n', '').replace(' ', '')
    return np.array([float(x) for x in log_str.split(',')])

In [None]:
# Parse the first raw log line as a quick check.
# NOTE(review): `raw_logs` is not defined in any visible cell -- presumably the
# lines read from the log files; confirm where it is loaded.
a = _parse_logs(raw_logs[0])

In [None]:
# One row per log line, one column per logged field.
# NOTE(review): `raw_logs` is not defined in any visible cell -- confirm where
# it is loaded.
logs = np.array(list(map(_parse_logs, raw_logs)))

In [None]:
# Group the log rows by their second column (written as T by the grid-search
# cell) and keep the remaining columns from index 2 on: (L, test accuracy).
line_coords = dict()
for T in sorted(set(logs[:, 1])):
    line_coords[T] = logs[logs[:, 1] == T][:, 2:]

plt.figure(figsize=(16,10))
# Iterate in sorted order so the legend lists the temperatures ascending; the
# loop variable has its own name so it no longer overwrites the dictionary.
for T in sorted(line_coords):
    coords = line_coords[T]
    plt.plot(coords[:, 0], coords[:, 1], label=T)

plt.grid()
plt.legend()
plt.xlabel('imitation parameter')
plt.ylabel('test accuracy')
plt.title('MNIST 500 images')



In [None]:
# Inspect the softened teacher targets.
# NOTE(review): `T` is whatever value an earlier cell left behind -- confirm
# which temperature this is meant to show.
labels_soften = F.softmax(model_predictions/T, -1)
print(labels_soften)

In [None]:
cross_entropy(predictions, labels_soften[cur_start:cur_start+batch_size])

In [None]:
predictions.shape

In [None]:
labels_soften[cur_start:cur_start+batch_size]

In [None]:
labels_soften.shape

In [None]:
cur_start

In [None]:
# Rebuild the downsampled test set (same construction as in the cell that
# first creates `transformed_data_test`).
transformed_data_test = myDataSet(
    X_test,
    y_test,
    transform=transforms.Compose([Downsample(0.25), ToTensor()]),
)


In [None]:
get_predictions_on_dataset(student, transformed_data_test, compute_accuracy=True)[-1]


In [None]:
# Short (5-epoch) grid search over temperature T and imitation weight L.
# The run counter picks a fresh log file name every time the cell is executed.
try:
    num_try += 1
except NameError:
    num_try = 0

# `with` closes the log even when training is interrupted or fails midway.
with open('torch_version_logs/log_mnist_{}.txt'.format(num_try), 'w') as iofile:
    for T in tqdm.tqdm_notebook([1,2,5,10,20,50], desc='t loop'):
        # Teacher logits softened with the current temperature (previously a
        # stale `labels_soften` from another cell was used).
        labels_soften = F.softmax(model_predictions/T, -1)

        for L in tqdm.tqdm_notebook([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], desc='l loop', leave=False):
            # Dist takes (d, m, q, L) and returns one optimiser and two loss
            # functions with signature (_input, target, L).
            student, student_opt, hard_loss, soft_loss = Dist(d, m, q, L)
            student.double()
            student.cuda()

            _t = tqdm.tnrange(5, leave=False)
            for epoch in _t:
                # Offset of the current batch inside `labels_soften`; valid
                # because the training loader does not shuffle.
                cur_start = 0
                for batch, label in tqdm.tqdm_notebook(transformed_dataloader_train, leave=False):
                    batch = batch.cuda()
                    label = label.cuda()
                    # PyTorch accumulates gradients, so clear them before every step.
                    student.zero_grad()

                    predictions = student(batch)
                    hard_loss_value = hard_loss(predictions, label, L)
                    soft_loss_value = soft_loss(predictions, labels_soften[cur_start:cur_start+batch_size].cuda(), L)

                    # One backward pass over the sum yields the same gradients
                    # as two accumulated passes, without retain_graph.
                    (hard_loss_value + soft_loss_value).backward()

                    student_opt.step()
                    cur_start += batch_size

                val_acc = get_predictions_on_dataset(student, transformed_data_val, compute_accuracy=True)[-1]
                _t.set_postfix(val_acc=val_acc)

            acc_student = get_predictions_on_dataset(student, transformed_data_test, compute_accuracy=True)[-1]
            # float() keeps the line a plain list of numbers whatever repr
            # NumPy uses for its scalars.
            iofile.write(str([N, T, L, float(acc_student)])+'\n')

In [None]:
# Show the end of the log written by the cell above: it lives under
# torch_version_logs/ and is numbered by the current `num_try`.
!tail torch_version_logs/log_mnist_{num_try}.txt

In [None]:
# NOTE(review): this cell looks like the original Keras/Theano reference
# experiment kept for comparison. It uses names that no visible cell defines
# (ax_tr, ay_tr, downsample, p_downsample, M, x_te, y_te, xs_te, theano,
# softmax, get_distillation, outfile) and calls `.fit` on the result of `MLP`,
# whereas the `MLP` defined in this notebook returns a (model, opt, loss)
# tuple. It cannot run as is -- confirm whether it should be kept.
for rep in range(10):
    # random training split
    i     = np.random.permutation(ax_tr.shape[0])[0:N]
    x_tr  = ax_tr[i]
    y_tr  = ay_tr[i]
    xs_tr = downsample(x_tr,p_downsample)
    # Scale both resolutions by 1/255.
    x_tr  = x_tr/255.0
    xs_tr = xs_tr/255.0

    # big mlp
    print(x_tr.shape, y_tr.shape)
    mlp_big = MLP(x_tr.shape[1],M,y_tr.shape[1])
    mlp_big.fit(x_tr, y_tr, nb_epoch=50, verbose=0)
    # Despite the name, this is the fraction of correct predictions.
    err_big = np.mean(mlp_big.predict_classes(x_te,verbose=0)==np.argmax(y_te,1))

    # student mlp
    for t in tqdm.tqdm_notebook([1,2,5,10,20,50], desc='t loop'):
        for L in tqdm.tqdm_notebook([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], desc='l loop', leave=False):
            # Output of the big model's layer index 2, passed through
            # `softmax` with temperature t to form the soft targets.
            soften = theano.function([mlp_big.layers[0].input], mlp_big.layers[2].output)
            ys_tr  = softmax(soften(x_tr),t)
            print(ys_tr.shape, y_tr.shape)

            mlp_student = get_distillation(xs_tr.shape[1],M,ys_tr.shape[1],t,L)
            mlp_student.fit(xs_tr, {'hard':y_tr, 'soft':ys_tr}, nb_epoch=50, verbose=0)
            # Fraction of correct predictions of the student's 'hard' output.
            err_student = np.mean(np.argmax(mlp_student.predict({'x':xs_te})['hard'],1)==np.argmax(y_te,1))

            line = [N, p_downsample, round(err_big,3), t, L, round(err_student,3)]
            outfile.write(str(line)+'\n')
