## Imports and Classes

In [1]:
from tqdm.notebook import tqdm  # Import tqdm for Jupyter Notebook
from src.optimizee import *
from src.torch_utils import *
import shutil


from torch.utils.tensorboard import SummaryWriter
# from optimizer_concurrent import *
# from train_concurrent import *

In [2]:
class LSTMConcurrent(nn.Module):
    """
    LSTM-based optimizer as described in the paper.
    """
    def __init__(self, num_optims, hidden_size=20, preproc=True, preproc_factor=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.preproc = preproc
        self.preproc_factor = torch.tensor(preproc_factor)
        self.preproc_threshold = float(torch.exp(-self.preproc_factor))
        
        self.input_size = 2*num_optims if preproc else 1*num_optims
        self.lstm = nn.LSTM(self.input_size, hidden_size, 2, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, 1)


    def forward(self, x, hidden_state):
        """
        Forward pass of the LSTM optimizer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_size).
            hidden_state (tuple): Hidden state of the LSTM (h, c).

        Returns:
            torch.Tensor: Output updates of shape (batch_size, sequence_length, 1).
            tuple: Updated hidden state.
        """
        if self.preproc: x = self.preprocess_gradients(x)
        # print("Gradients", x)
        # print("Preprocess Shape", x.shape)
        out, hidden_state = self.lstm(x, hidden_state)
        out = self.output_layer(out)

        return out, hidden_state



    def preprocess_gradients(self, gradients):
        """ Applies log transformation & sign extraction to gradients, moving to CUDA if available. """

        gradients = gradients.data  # Extract raw gradient data
        # print("Gradients Shape", gradients.shape)
        if len(gradients.size()) == 1: gradients = gradients.unsqueeze(-1)
        
        param_size = gradients.size(0)
        num_optims = gradients.size(1)

        preprocessed = torch.zeros(param_size, 2*num_optims)

        for i in range(num_optims):
            gradient = gradients[:,i]
            keep_grads = (torch.abs(gradient) >= self.preproc_threshold)
        
            # Log transformation for large gradients
        
            preprocessed[keep_grads, 2*i] = (torch.log(torch.abs(gradient[keep_grads]) + 1e-8) / self.preproc_factor)
            preprocessed[keep_grads, 2*i+1] = torch.sign(gradient[keep_grads])

            # Direct scaling for small gradients
            preprocessed[~keep_grads, 2*i] = -1
            preprocessed[~keep_grads, 2*i+1] = (float(torch.exp(self.preproc_factor)) * gradient[~keep_grads])

        # print(preprocessed.shape)
        return torch.tensor(preprocessed).to(gradients.device)
    

    def initialize_hidden_state(self):
        # Initialize hidden & cell states for LSTM (one per parameter)
        self.h0 = to_cuda(torch.zeros(2, self.hidden_size))
        # self.h0 = torch.randn(2, self.hidden_size)
        self.c0 = to_cuda(torch.zeros(2, self.hidden_size))
        # self.c0 = torch.randn(2, self.hidden_size)
        return (self.h0, self.c0)
    

In [3]:
def initialize_optimizees(optimizee_cls, optimizee_kwargs, num_optimizees=10, noise='equal'):
    optimizees = []
    for i in range(num_optimizees):
        if noise == 'equal':
            optim = optimizee_cls(**optimizee_kwargs)
            optim.train()
            optimizees.append(optim)
        else:
            optimizee_kwargs['noise_std'] = 0.01 * (i+1)
            optim = optimizee_cls(**optimizee_kwargs)
            optim.train()
            optimizees.append(optim)
    return optimizees




def train_LSTM(lstm_optimizer, meta_optimizer, optimizee_class, optimizee_kwargs, num_optimizees=1, num_epochs=500, time_horizon=200, discount=1, scheduler = None, noise='equal', writer=None):
    lstm_optimizer.train()
    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.ConstantLR(meta_optimizer, factor=1.0, total_iters=num_epochs)

    
    with tqdm(range(num_epochs), desc="Training Progress") as pbar:
        for epoch in pbar:
            # Initialize optimizee parameters
            optimizees = initialize_optimizees(optimizee_class, optimizee_kwargs, num_optimizees, noise=noise)
            optimizees[0].set_params()
            params = optimizees[0].all_parameters()
            print("Param Shape", params.shape)

            # hidden_state = lstm_optimizer.initialize_hidden_state()

            cumulative_loss = None             
            for t in range(time_horizon):
                gradients = []
                hidden_state = lstm_optimizer.initialize_hidden_state()
                for i in range(num_optimizees):
                    optimizee = optimizees[i]
                    loss, grad_params = optimizee.compute_loss(params, return_grad=True)
                    if i == 0 and discount: cumulative_loss = loss*discount**(time_horizon-1) if cumulative_loss is None else cumulative_loss + loss*discount**(time_horizon-t-1)
                    elif i==0: cumulative_loss = loss
                    gradients.append(grad_params.squeeze())
                    if writer and i==0 and epoch==1: writer.add_scalar("Grad", grad_params.squeeze().mean(), t)

                # Stack gradients
                grad_params = torch.stack(gradients).T
                # print("Grads", grad_params.shape)
                update, hidden_state = lstm_optimizer(grad_params, hidden_state)
                # print("Update", update.shape)
                # print("Params", params.shape)
                # with torch.no_grad():
                params = params + update
                if writer and epoch==1: writer.add_scalar("Update", update.mean(), t)
                # print("Update", update)
                optimizees[0].set_params(params)

            
            print("Cumulative Loss", cumulative_loss)
            # Backpropagation through time (BPTT)
            if writer: writer.add_scalar("Loss", cumulative_loss, epoch)
            meta_optimizer.zero_grad()
            cumulative_loss.backward()
            # torch.nn.utils.clip_grad_norm_(lstm_optimizer.parameters(), 1)
            # Print gradients
            print("Gradients", lstm_optimizer.lstm.weight_ih_l0.grad)
            print("Gradients", lstm_optimizer.lstm.weight_hh_l0.grad)
            print("Gradients", lstm_optimizer.output_layer.weight.grad)
            meta_optimizer.step()
            scheduler.step()

            # Update progress bar
            pbar.set_postfix(loss=cumulative_loss.item())
            if (epoch + 1) % 1 == 0:
                current_lr = meta_optimizer.param_groups[0]['lr']
                print(f"Epoch [{epoch+1}/{num_epochs}], Cumulative Loss: {cumulative_loss.item():.4f}, LR: {current_lr:.3e}")
                print(f"Final parameters: {params.detach().numpy().T}")
                
    print("\nTraining complete!")
    return lstm_optimizer




def test_LSTM(lstm_optimizer, optimizee_class, optimizee_kwargs, num_optimizees=1, time_horizon=200, noise='equal', writer=None):
    lstm_optimizer.eval()
    optimizees = initialize_optimizees(optimizee_class, optimizee_kwargs, num_optimizees, noise=noise)    
    optimizees[0].set_params()
    params = optimizees[0].all_parameters()
    hidden_state = lstm_optimizer.initialize_hidden_state()
    for t in range(time_horizon):
        gradients = []
        for i in range(num_optimizees):
            optimizee = optimizees[i]
            loss, grad_params = optimizee.compute_loss(params)
            if writer and i==0: writer.add_scalar("Loss", loss, t)
            gradients.append(grad_params.squeeze())
        
        grad_params = torch.stack(gradients).T
        # if len(grad_params.shape)==1: grad_params = grad_params.unsqueeze(-1)

        # print(grad_params.shape)
        updates, hidden_state = lstm_optimizer(grad_params, hidden_state)
        params = params - updates 
        optimizees[0].set_params(params)

    # print(f"Final parameters: {params.detach().numpy().T}")
    return params

## Quadratic Optimizee

In [61]:
class QuadraticOptimizee(Optimizee):
    """
    Class for the quadratic function described in the paper.
    """
    def __init__(self, W, theta0, noise_std=0.01):
        """
        Initialize the quadratic function.

        Args:
            W (np.ndarray): 10x10 matrix.
            theta0 (np.ndarray): 10x1 vector (true parameters).
            noise_std (float): Standard deviation of the noise term.
        """
        super().__init__()
        self.W = torch.tensor(W, dtype=torch.float32)
        self.theta0 = torch.tensor(theta0, dtype=torch.float32)
        self.noise_std = noise_std
        self.theta = None

        # Generate noisy observations y = W @ theta0 + eps
        self.y = self.W @ self.theta0 + self.noise_std * torch.randn_like(self.theta0)

    def set_params(self, params=None):
        """
        Returns initial parameters for optimization (random initialization).
        """
        self.theta = torch.randn_like(self.theta0, requires_grad=True) if params is None else params

    def compute_loss(self, params, return_grad=True):
        """
        Computes the loss ||W @ params - y||^2.
        """
        if return_grad:
            loss = torch.norm((self.W.matmul(params) - self.y) ** 2)
            grads = torch.autograd.grad(loss, params, create_graph=True)[0]
            detached_grads = torch.tensor(grads.detach().numpy(), requires_grad=True)
            return loss, detached_grads
        else:
            return torch.norm((self.W.matmul(params) - self.y) ** 2)
    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        return self.theta
    
    # Implement train and eval:
    def train(self):
        pass
    
    def eval(self):
        pass

In [62]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
# print("W", W)

lstm_optimizer = LSTMConcurrent(num_optims=1)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
writer = SummaryWriter("test")
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=50, time_horizon=500, discount=0.9, writer=writer)
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000)

Training Progress:   0%|          | 0/50 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Cumulative Loss tensor(1262458.7500, grad_fn=<AddBackward0>)
Gradients tensor([[ 2.3469e+04,  4.1002e+04],
        [-1.1192e+04, -2.1484e+04],
        [ 2.1924e+04,  3.3216e+04],
        [ 7.9870e+03,  1.2955e+04],
        [-3.5454e+03, -6.4426e+03],
        [-7.2477e+03, -1.8500e+04],
        [-2.3202e+04, -4.1242e+04],
        [ 3.6760e+04,  5.7413e+04],
        [-1.4673e+04, -1.6919e+04],
        [-1.3279e+04, -2.5857e+04],
        [ 1.8897e+04,  4.1900e+04],
        [-3.0597e+04, -4.0078e+04],
        [ 9.9187e+03,  1.3222e+04],
        [ 3.9425e+03,  9.3196e+03],
        [ 1.2765e+04,  1.6516e+04],
        [-1.7572e+03, -5.6349e+03],
        [ 3.9869e+03,  8.3535e+03],
        [ 1.1893e+04,  3.5488e+04],
        [ 3.6488e+03,  5.5316e+03],
        [ 8.4787e+02,  1.2364e+03],
        [ 1.3055e+04,  1.8349e+04],
        [-5.8825e+03, -7.4623e+03],
        [ 1.2662e+04,  1.6251e+04],
        [ 5.9425e+03,  8.1076e+03],
        [-2.0189e+03, -2.9820e+03],
        [-3.1815e+03, -4.3911

KeyboardInterrupt: 

In [104]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0.9") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0.9")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])
Update tensor([[0.2111],
        [0.2268],
        [0.1715],
        [0.1267],
        [0.0972],
        [0.0857],
        [0.0737],
        [0.0714],
        [0.0653],
        [0.0597]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0562],
        [0.0547],
        [0.0540],
        [0.0536],
        [0.0535],
        [0.0595],
        [0.0585],
        [0.0623],
        [0.0598],
        [0.0565]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0544],
        [0.0537],
        [0.0534],
        [0.0533],
        [0.0533],
        [0.0594],
        [0.0584],
        [0.0622],
        [0.0597],
        [0.0565]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0544],
        [0.0538],
        [0.0535],
        [0.0533],
        [0.0534],
        [0.0594],
        [0.0584],
        [0.0622],
        [0.0597],
        [0.0565]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0545],
        [0.0538],
        [0.0536],
        [0.0534],
        [0.0534],
        [0.0594

  return torch.tensor(preprocessed).to(gradients.device)


tensor([[0.0629],
        [0.0638],
        [0.0642],
        [0.0644],
        [0.0645],
        [0.0644],
        [0.0643],
        [0.0586],
        [0.0594],
        [0.0615]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0630],
        [0.0638],
        [0.0642],
        [0.0644],
        [0.0645],
        [0.0644],
        [0.0643],
        [0.0586],
        [0.0594],
        [0.0615]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0630],
        [0.0638],
        [0.0642],
        [0.0644],
        [0.0645],
        [0.0644],
        [0.0643],
        [0.0586],
        [0.0594],
        [0.0615]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0630],
        [0.0638],
        [0.0643],
        [0.0645],
        [0.0645],
        [0.0644],
        [0.0644],
        [0.0586],
        [0.0594],
        [0.0615]], grad_fn=<AddmmBackward0>)
Update tensor([[0.0630],
        [0.0638],
        [0.0643],
        [0.0645],
        [0.0645],
        [0.0645],
        [0.0644],
        [0.0586],


KeyboardInterrupt: 

In [66]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0.1") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.1, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0.1")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 86188.9375, LR: 1.000e-02
Final parameters: [[31.855953 33.48319  33.22974  32.58021  31.841679 31.375471 32.790844
  29.157654 30.33128  30.265963]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 35799.2578, LR: 1.000e-02
Final parameters: [[-20.349451 -20.300594 -19.493969 -18.321798 -18.756596 -18.079155
  -18.142967 -20.111877 -19.823696 -19.606356]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 10634.2646, LR: 1.000e-02
Final parameters: [[-10.789026  -11.57954   -11.096596   -8.865726   -7.746987   -9.454046
   -7.1986303 -10.252164  -14.621588  -12.600348 ]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 20.3428, LR: 1.000e-02
Final parameters: [[ 0.47475934  2.2697022   2.1017098  -0.04158057 -1.129506   -1.1536789
   1.8131113   0.4926201  -0.9938171   0.5147538 ]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 0.1866, LR: 1.000e-02
Final parameters: [[0.8504492  0.83243    1.4

KeyboardInterrupt: 

In [9]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0_e3") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=None, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0_e3")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 232685.3281, LR: 1.000e-03
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 153203.7188, LR: 1.000e-03
Final parameters: [[-47.433903 -42.08653  -42.588608 -41.05655  -37.881584 -36.20875
  -42.903183 -41.539223 -48.43857  -48.30403 ]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 81233.2109, LR: 1.000e-03
Final parameters: [[-35.26702  -31.748522 -29.824865 -28.165958 -25.296488 -26.418747
  -30.976404 -31.502092 -38.265526 -37.463898]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 32433.4551, LR: 1.000e-03
Final parameters: [[-23.400938 -19.127851 -17.509352 -14.425954 -13.55961  -14.45078
  -21.089912 -20.442644 -26.89849  -25.322037]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 3792.1633, LR: 1.000e-03
Final parameters: [[-10.182979   -6.414855   -3

In [10]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0_e4") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.0001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=None, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0_e4")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 232685.3281, LR: 1.000e-04
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 233961.1406, LR: 1.000e-04
Final parameters: [[-57.65584  -52.655945 -53.384445 -51.997425 -48.91757  -45.666992
  -52.54768  -49.802753 -57.337666 -58.01194 ]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 220886.4531, LR: 1.000e-04
Final parameters: [[-55.829365 -52.958355 -51.44749  -50.05098  -47.349697 -45.452236
  -50.483547 -48.312386 -56.304104 -57.05438 ]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 211067.3594, LR: 1.000e-04
Final parameters: [[-54.646168 -51.203163 -50.09778  -47.298798 -46.586906 -43.33314
  -51.113052 -46.601032 -54.7307   -55.285213]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 203845.0781, LR: 1.000e-04
Final parameters: [[-54.228737 -50.82611  

## Linear Optimizee

In [None]:
class LinearNNOptimizee(Optimizee):
    """
    Class for a generic linear neural network optimizee.
    """
    def __init__(self, dataloader, input_size, hidden_size, output_size):
        """
        """
        super().__init__()
        self.dataloader = dataloader
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.model = nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, output_size))
        self.num_params = sum(p.numel() for p in self.model.parameters())
    
    
    def set_params(self, params=None):
        """
        Given a tensor of shape (d,1), sets the parameters of the optimizee.
        """
        if params is not None:
            params = torch.flatten(params)
            i = 0
            for param in self.model.parameters():
                param_size = param.numel()
                param.data = params[i:i + param_size].view_as(param)
                i += param_size


    def compute_loss(self, params, num_samples=10, return_grad=True):
        self.set_params(params)  # Set model parameters
        total_loss = None
        dataloader_iter = iter(self.dataloader)
        for _ in range(num_samples):
            try: inputs, targets = next(dataloader_iter)  # Get a batch
            except StopIteration: dataloader_iter = iter(self.dataloader); inputs, targets = next(dataloader_iter)
            outputs = self.model(inputs.flatten())  # Forward pass
            targets_oh = torch.zeros_like(outputs)
            targets_oh[targets] = 1
            loss = torch.norm((outputs - targets_oh) ** 2)
            total_loss = loss if total_loss is None else total_loss + loss
        total_loss = total_loss / num_samples

        if return_grad:
            grads = torch.autograd.grad(total_loss, self.model.parameters(), create_graph=True)
            # grads = torch.autograd.grad(total_loss, self.model.parameters(), grad_outputs=torch.ones_like(total_loss), retain_graph=True, create_graph=True)
            grads = torch.cat([g.flatten() for g in grads]).unsqueeze(-1)
            return total_loss, grads.detach()
        else:
            return total_loss

    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        return torch.cat([p.flatten() for p in self.model.parameters()]).unsqueeze(-1)
    

In [67]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define a transform (convert images to tensors and normalize)
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Download and load the training dataset
train_dataset = datasets.MNIST(root="./data", train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

dataloader_iter = iter(train_loader)
inputs, targets = next(dataloader_iter)
print("Input Shape", inputs.shape)
print("Target Shape", targets.shape)

Input Shape torch.Size([1, 1, 28, 28])
Target Shape torch.Size([1])


In [68]:
kwargs = {"dataloader": train_loader, "input_size": 784, "hidden_size": 20, "output_size": 10}

lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=True)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
# lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, LinearNNOptimizee, kwargs, num_epochs=10, time_horizon=200, discount=1e-3)
# params = test_LSTM(lstm_optimizer, LinearNNOptimizee, kwargs, time_horizon=100)

## Intermediate Optimizee

In [4]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
y = (y-np.mean(y))/np.std(y)
X.shape

(442, 10)

In [5]:
class MLOptimizee(Optimizee):
    """
    Class for a generic linear neural network optimizee.
    """
    def __init__(self, X, y):
        """
        """
        super().__init__()
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.input_size = X.shape[1]
        try: self.output_size = y.shape[1]
        except: self.output_size = 1
        self.model = nn.Linear(self.input_size, self.output_size)
        # self.num_params = sum(p.numel() for p in self.model.parameters())
    
    
    def set_params(self, params=None):
        """
        Given a tensor of shape (d,1), sets the parameters of the optimizee.
        """
        if params is not None:
            params = params.squeeze(-1)
            # params.requires_grad = True
            # print("Params_0", torch.nn.utils.parameters_to_vector(self.model.parameters()))
            torch.nn.utils.vector_to_parameters(params, self.model.parameters())
            # print("Params_1", torch.nn.utils.parameters_to_vector(self.model.parameters()))

    def compute_loss(self, params, return_grad=True):
        self.set_params(params)  # Set model parameters
        outputs = self.model(self.X)
        # print("Outputs", outputs[:10])
        loss = torch.norm((outputs - self.y))/len(self.X)
        # print(loss)
        
        if return_grad:
            grads = torch.autograd.grad(loss, self.model.parameters(), create_graph=True)
            grads = torch.cat([g.flatten() for g in grads]).unsqueeze(-1)
            detached_grads = torch.tensor(grads.data, requires_grad=True)
            # print("Detached Grads", detached_grads)
            return loss, detached_grads
        return loss

    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        params = self.model.parameters()
        param_vector = torch.nn.utils.parameters_to_vector(params)
        # print("Param Vector", param_vector.shape)
        return param_vector.unsqueeze(-1)
    
    def train(self):
        self.model.train()
    
    def eval(self):
        self.model.eval()

In [6]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n,1)
b = np.random.randn(1)
params = torch.cat([torch.tensor(p, requires_grad=True, dtype=torch.float32).flatten() for p in [W, b]]).unsqueeze(-1)
print("Params", params.shape)

optimizee = MLOptimizee(X, y)
optimizee.set_params(params)
print(optimizee.all_parameters().shape)
l_a,grad_a = optimizee.compute_loss(params, return_grad=True)

print("Loss", l_a)
print("Grad", grad_a)

torch.manual_seed(0)
np.random.seed(0)
lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=True)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
h = lstm_optimizer.initialize_hidden_state()
update, h = lstm_optimizer(grad_a, h)
print("Update", update)


Params torch.Size([11, 1])
torch.Size([11, 1])
Loss tensor(1.0315, grad_fn=<DivBackward0>)
Grad tensor([[0.0066],
        [0.0020],
        [0.0047],
        [0.0077],
        [0.0050],
        [0.0035],
        [0.0005],
        [0.0024],
        [0.0047],
        [0.0049],
        [0.1396]], requires_grad=True)


  detached_grads = torch.tensor(grads.data, requires_grad=True)


Update tensor([[0.1088],
        [0.0961],
        [0.0869],
        [0.0799],
        [0.0746],
        [0.0706],
        [0.0676],
        [0.0651],
        [0.0630],
        [0.0616],
        [0.0603]], grad_fn=<AddmmBackward0>)


  return torch.tensor(preprocessed).to(gradients.device)


In [42]:
# Delete folders if present
try:
    shutil.rmtree("diabetes_logs_train")
    shutil.rmtree("diabetes_logs")
except:
    pass

kwargs = {"X": X, "y": y}

torch.manual_seed(0)
np.random.seed(0)
lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=True)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.1)
# meta_optimizer = optim.SGD(lstm_optimizer.parameters(), lr=0.1)
writer = SummaryWriter("diabetes_logs_train")
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, MLOptimizee, kwargs, num_epochs=10, time_horizon=1000, discount=0.9, writer=writer)
writer = SummaryWriter("diabetes_logs")
params = test_LSTM(lstm_optimizer, MLOptimizee, kwargs, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Param Shape torch.Size([11, 1])


  detached_grads = torch.tensor(grads.data, requires_grad=True)
  return torch.tensor(preprocessed).to(gradients.device)


Cumulative Loss tensor(638.8634, grad_fn=<AddBackward0>)
Gradients None
Gradients None
Gradients None
Epoch [1/10], Cumulative Loss: 638.8634, LR: 1.000e-01
Final parameters: [[108.90109   95.8711    86.76418   79.97159   74.91464   70.7601
   63.504707  62.144447  62.54211   62.329365  61.46543 ]]
Param Shape torch.Size([11, 1])


KeyboardInterrupt: 

## Intermediate But Explicit

In [7]:
class MLOptimizee2(Optimizee):
    """
    Class for a generic linear neural network optimizee.
    """
    def __init__(self, X, y):
        """
        """
        super().__init__()
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.input_size = X.shape[1]
        try: self.output_size = y.shape[1]
        except: self.output_size = 1
        self.W = torch.randn(self.input_size, self.output_size, requires_grad=True)
        self.b = torch.randn(self.output_size, requires_grad=True)
        # self.num_params = sum(p.numel() for p in self.model.parameters())
    
    
    def set_params(self, params=None):
        """
        Given a tensor of shape (d,1), sets the parameters of the optimizee.
        """
        if params is not None:
            params = params.squeeze(-1)
            # print("Params", params.shape)
            # print("Input Size", self.input_size)
            # print("Output Size", self.output_size)
            self.W = params[:self.input_size*self.output_size].view(self.input_size, self.output_size)
            self.b = params[self.input_size*self.output_size:].view(self.output_size)

    def compute_loss(self, params, return_grad=True):
        self.set_params(params)  # Set model parameters
        outputs = self.X @ self.W + self.b
        # print("Outputs", outputs[:10])
        loss = torch.norm((outputs - self.y))/len(self.X)
        # print(loss)
        
        if return_grad:
            grads = torch.autograd.grad(loss, [self.W, self.b], create_graph=True)
            # grads = torch.autograd.grad(loss, self.model.parameters(), create_graph=True)
            grads = torch.cat([g.flatten() for g in grads]).unsqueeze(-1)
            detached_grads = torch.tensor(grads.data, requires_grad=True)
            # print("Detached Grads", detached_grads)
            return loss, detached_grads
        return loss

    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        params = [self.W, self.b]
        param_vector = torch.cat([p.flatten() for p in params])
        # print("Param Vector", param_vector.shape)
        return param_vector.unsqueeze(-1)
    
    def train(self):
        pass

    def eval(self):
        pass

In [11]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n,1)
b = np.random.randn(1)
params = torch.cat([torch.tensor(p, requires_grad=True, dtype=torch.float32).flatten() for p in [W, b]]).unsqueeze(-1)
print("Params", params.shape)

optimizee = MLOptimizee2(X, y)
optimizee.set_params(params)
l_b,grad_b = optimizee.compute_loss(params, return_grad=True)

print("Loss", l_b)
print("Grad", grad_b)

print("Loss Equality:", l_a==l_b)
print("Grad Equality:", all(grad_a==grad_b))

torch.manual_seed(0)
np.random.seed(0)
lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=True)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
h = lstm_optimizer.initialize_hidden_state()
update, h = lstm_optimizer(grad_b, h)
print("Update", update)


Params torch.Size([11, 1])
Loss tensor(1.0315, grad_fn=<DivBackward0>)
Grad tensor([[0.0066],
        [0.0020],
        [0.0047],
        [0.0077],
        [0.0050],
        [0.0035],
        [0.0005],
        [0.0024],
        [0.0047],
        [0.0049],
        [0.1396]], requires_grad=True)
Loss Equality: tensor(True)
Grad Equality: True
Update tensor([[0.1088],
        [0.0961],
        [0.0869],
        [0.0799],
        [0.0746],
        [0.0706],
        [0.0676],
        [0.0651],
        [0.0630],
        [0.0616],
        [0.0603]], grad_fn=<AddmmBackward0>)


  detached_grads = torch.tensor(grads.data, requires_grad=True)
  return torch.tensor(preprocessed).to(gradients.device)


In [45]:
# Delete folders if present
try:
    shutil.rmtree("diabetes_logs_train")
    shutil.rmtree("diabetes_logs")
except:
    pass

kwargs = {"X": X, "y": y}

torch.manual_seed(0)
np.random.seed(0)
lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=True)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.1)
# meta_optimizer = optim.SGD(lstm_optimizer.parameters(), lr=0.1)
writer = SummaryWriter("diabetes_logs_train")
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, MLOptimizee2, kwargs, num_epochs=10, time_horizon=1000, discount=0.9, writer=writer)
writer = SummaryWriter("diabetes_logs")
params = test_LSTM(lstm_optimizer, MLOptimizee2, kwargs, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Param Shape torch.Size([11, 1])


  detached_grads = torch.tensor(grads.data, requires_grad=True)
  return torch.tensor(preprocessed).to(gradients.device)


Cumulative Loss tensor(638.8801, grad_fn=<AddBackward0>)
Gradients tensor([[-8.2694e+00,  2.2519e+01],
        [ 1.2429e+01, -2.6161e+01],
        [-1.1729e+01,  2.2701e+01],
        [ 2.0954e+00, -1.0450e+00],
        [ 2.3422e+00, -6.4323e+00],
        [-5.8192e+00,  1.9537e+00],
        [-3.5288e-01, -7.0682e+00],
        [-7.2662e+00,  1.7907e+01],
        [ 4.9376e+00, -8.6743e+00],
        [ 1.5881e+00, -8.2688e+00],
        [ 9.4228e+00, -7.1600e+00],
        [ 2.1644e+01, -3.4812e+01],
        [-9.0719e+00,  1.5027e+01],
        [ 2.5435e+00, -4.7904e-01],
        [-2.5219e+00,  3.9381e+00],
        [ 1.2774e+00, -4.3193e+00],
        [ 1.6063e+00, -2.0071e+00],
        [-9.8706e+00,  2.9679e+01],
        [ 7.5756e-01,  1.4232e+00],
        [-1.4419e-02, -4.1741e-01],
        [-7.5414e+00,  1.9022e+01],
        [ 1.0789e+01, -1.3917e+01],
        [-1.1831e+01,  2.0995e+01],
        [ 2.1610e+00,  1.2172e+00],
        [ 2.1705e+00, -4.8427e+00],
        [-4.3608e+00,  8.1299e+00

KeyboardInterrupt: 