## Imports and Classes

In [None]:
from tqdm.notebook import tqdm  # Import tqdm for Jupyter Notebook
from src.optimizee import *
from src.torch_utils import *
import shutil


from torch.utils.tensorboard import SummaryWriter
# from optimizer_concurrent import *
# from train_concurrent import *

In [2]:
class LSTMConcurrent(nn.Module):
    """
    LSTM-based optimizer as described in the paper.
    """
    def __init__(self, num_optims, hidden_size=20, preproc=True, preproc_factor=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.preproc = preproc
        self.preproc_factor = torch.tensor(preproc_factor)
        self.preproc_threshold = float(torch.exp(self.preproc_factor))
        
        self.input_size = 2*num_optims if preproc else 1*num_optims
        self.lstm = nn.LSTM(self.input_size, hidden_size, 2, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, 1)


    def forward(self, x, hidden_state):
        """
        Forward pass of the LSTM optimizer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_size).
            hidden_state (tuple): Hidden state of the LSTM (h, c).

        Returns:
            torch.Tensor: Output updates of shape (batch_size, sequence_length, 1).
            tuple: Updated hidden state.
        """
        if self.preproc: x = self.preprocess_gradients(x)

        # print("Preprocess Shape", x.shape)
        out, hidden_state = self.lstm(x, hidden_state)
        out = self.output_layer(out)

        return out, hidden_state



    def preprocess_gradients(self, gradients):
        """ Applies log transformation & sign extraction to gradients, moving to CUDA if available. """

        gradients = gradients.data  # Extract raw gradient data
        # print("Gradients Shape", gradients.shape)
        if len(gradients.size()) != 2: raise Exception
        
        param_size = gradients.size(0)
        num_optims = gradients.size(1)

        preprocessed = torch.zeros(param_size, 2*num_optims)

        for i in range(num_optims):
            gradient = gradients[:,i]
            keep_grads = (torch.abs(gradient) >= self.preproc_threshold)
        
            # Log transformation for large gradients
        
            preprocessed[keep_grads, 2*i] = (torch.log(torch.abs(gradient[keep_grads]) + 1e-8) / self.preproc_factor)
            preprocessed[keep_grads, 2*i+1] = torch.sign(gradient[keep_grads])

            # Direct scaling for small gradients
            preprocessed[~keep_grads, 2*i] = -1
            preprocessed[~keep_grads, 2*i+1] = (float(torch.exp(self.preproc_factor)) * gradient[~keep_grads])

        # print(preprocessed.shape)
        return torch.tensor(preprocessed).to(gradients.device)
    

    def initialize_hidden_state(self):
        # Initialize hidden & cell states for LSTM (one per parameter)
        self.h0 = to_cuda(torch.zeros(2, self.hidden_size))
        self.c0 = to_cuda(torch.zeros(2, self.hidden_size))
        return (self.h0, self.c0)
    

In [7]:
def initialize_optimizees(optimizee_cls, optimizee_kwargs, num_optimizees=10, noise='equal'):
    optimizees = []
    for i in range(num_optimizees):
        if noise == 'equal':
            optimizees.append(optimizee_cls(**optimizee_kwargs))
        else:
            optimizee_kwargs['noise_std'] = 0.01 * (i+1)
            optimizees.append(optimizee_cls(**optimizee_kwargs))
    return optimizees




def train_LSTM(lstm_optimizer, meta_optimizer, optimizee_class, optimizee_kwargs, num_optimizees=1, num_epochs=500, time_horizon=200, discount=1, scheduler = None, noise='equal', writer=None):
    
    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.ConstantLR(meta_optimizer, factor=1.0, total_iters=num_epochs)

    
    with tqdm(range(num_epochs), desc="Training Progress") as pbar:
        for epoch in pbar:
            # Initialize optimizee parameters
            optimizees = initialize_optimizees(optimizee_class, optimizee_kwargs, num_optimizees, noise=noise)
            optimizees[0].set_params()
            params = optimizees[0].all_parameters()
            print("Param Shape", params.shape)

            hidden_state = lstm_optimizer.initialize_hidden_state()

            cumulative_loss = None             
            for t in range(time_horizon):
                gradients = []
                for i in range(num_optimizees):
                    optimizee = optimizees[i]
                    loss, grad_params = optimizee.compute_loss(params, return_grad=True)
                    if i == 0 and discount: cumulative_loss = loss*discount**(time_horizon-1) if cumulative_loss is None else cumulative_loss + loss*discount**(time_horizon-t-1)
                    elif i==0: cumulative_loss = loss
                    gradients.append(grad_params.squeeze())
                    if writer and i==0 and epoch==1: writer.add_scalar("Grad", grad_params.squeeze().mean(), t)

                # Stack gradients
                grad_params = torch.stack(gradients).T
                # print("Grads", grad_params.shape)
                update, hidden_state = lstm_optimizer(grad_params, hidden_state)
                # print("Update", update.shape)
                # print("Params", params.shape)
                params = params - update
                if writer and epoch==1: writer.add_scalar("Update", update.mean(), t)
                # print("Update", update)
                optimizees[0].set_params(params)

                
            # Backpropagation through time (BPTT)
            if writer: writer.add_scalar("Loss", cumulative_loss, epoch)
            meta_optimizer.zero_grad()
            cumulative_loss.backward()
            torch.nn.utils.clip_grad_norm_(lstm_optimizer.parameters(), 1)
            meta_optimizer.step()
            scheduler.step()

            # Update progress bar
            pbar.set_postfix(loss=cumulative_loss.item())
            if (epoch + 1) % 1 == 0:
                current_lr = meta_optimizer.param_groups[0]['lr']
                print(f"Epoch [{epoch+1}/{num_epochs}], Cumulative Loss: {cumulative_loss.item():.4f}, LR: {current_lr:.3e}")
                print(f"Final parameters: {params.detach().numpy().T}")
                
    print("\nTraining complete!")
    return lstm_optimizer




def test_LSTM(lstm_optimizer, optimizee_class, optimizee_kwargs, num_optimizees=1, time_horizon=200, noise='equal', writer=None):
    optimizees = initialize_optimizees(optimizee_class, optimizee_kwargs, num_optimizees, noise=noise)    
    optimizees[0].set_params()
    params = optimizees[0].all_parameters()
    hidden_state = lstm_optimizer.initialize_hidden_state()
    for t in range(time_horizon):
        gradients = []
        for i in range(num_optimizees):
            optimizee = optimizees[i]
            loss, grad_params = optimizee.compute_loss(params)
            if writer and i==0: writer.add_scalar("Loss", loss, t)
            gradients.append(grad_params.squeeze())
        
        grad_params = torch.stack(gradients).T
        # if len(grad_params.shape)==1: grad_params = grad_params.unsqueeze(-1)

        # print(grad_params.shape)
        updates, hidden_state = lstm_optimizer(grad_params, hidden_state)
        params = params - updates 
        optimizees[0].set_params(params)

    # print(f"Final parameters: {params.detach().numpy().T}")
    return params

## Quadratic Optimizee

In [4]:
class QuadraticOptimizee(Optimizee):
    """
    Class for the quadratic function described in the paper.
    """
    def __init__(self, W, theta0, noise_std=0.01):
        """
        Initialize the quadratic function.

        Args:
            W (np.ndarray): 10x10 matrix.
            theta0 (np.ndarray): 10x1 vector (true parameters).
            noise_std (float): Standard deviation of the noise term.
        """
        super().__init__()
        self.W = torch.tensor(W, dtype=torch.float32)
        self.theta0 = torch.tensor(theta0, dtype=torch.float32)
        self.noise_std = noise_std
        self.theta = None

        # Generate noisy observations y = W @ theta0 + eps
        self.y = self.W @ self.theta0 + self.noise_std * torch.randn_like(self.theta0)

    def set_params(self, params=None):
        """
        Returns initial parameters for optimization (random initialization).
        """
        self.theta = torch.randn_like(self.theta0, requires_grad=True) if params is None else params

    def compute_loss(self, params, return_grad=True):
        """
        Computes the loss ||W @ params - y||^2.
        """
        if return_grad:
            loss = torch.norm((self.W.matmul(params) - self.y) ** 2)
            grads = torch.autograd.grad(loss, params, create_graph=True)[0]
            detached_grads = torch.tensor(grads.detach().numpy(), requires_grad=True)
            return loss, detached_grads
        else:
            return torch.norm((self.W.matmul(params) - self.y) ** 2)
    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        return self.theta
    

In [13]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
# print("W", W)

lstm_optimizer = LSTMConcurrent(num_optims=1)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
writer = SummaryWriter("test")
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=10, time_horizon=500, discount=0.9, writer=writer)
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000)

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/10], Cumulative Loss: 2244350.7500, LR: 1.000e-02
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/10], Cumulative Loss: 563.6805, LR: 1.000e-02
Final parameters: [[ 3.7887573   7.739546   -3.7872036   0.74397314  5.0226865   1.7387401
  -2.2683296  -0.7724253   1.7928505  -0.8844478 ]]
Param Shape torch.Size([10, 1])
Epoch [3/10], Cumulative Loss: 1.7231, LR: 1.000e-02
Final parameters: [[1.0834036 1.0638868 1.1475857 1.0236803 1.1605811 1.1036468 1.0133401
  1.1399117 1.1107659 0.9608207]]
Param Shape torch.Size([10, 1])
Epoch [4/10], Cumulative Loss: 1.6417, LR: 1.000e-02
Final parameters: [[0.92498726 1.7654617  0.61624473 1.0919236  0.54451215 0.58068657
  1.0665014  0.54487956 0.6673346  1.5165486 ]]
Param Shape torch.Size([10, 1])
Epoch [5/10], Cumulative Loss: 3.2866, LR: 1.000e-02
Final parameters: [[0.90379417 1.3082023  1.1404821  0.9865482  

In [5]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0.9") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0.9")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 2244350.7500, LR: 1.000e-03
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 1479267.8750, LR: 1.000e-03
Final parameters: [[-47.434025 -42.086567 -42.5886   -41.056126 -37.881405 -36.208508
  -42.903175 -41.53932  -48.438717 -48.304802]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 784235.3125, LR: 1.000e-03
Final parameters: [[-35.26762  -31.748646 -29.824715 -28.165329 -25.296177 -26.41828
  -30.976332 -31.502777 -38.266083 -37.464523]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 313064.4375, LR: 1.000e-03
Final parameters: [[-23.401495  -19.12783   -17.508995  -14.4254465 -13.558993  -14.450282
  -21.089546  -20.442923  -26.898672  -25.32268  ]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 37149.3320, LR: 1.000e-03
Final parameters: [[-10.183179  

In [6]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0.1") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.1, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0.1")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 258424.0156, LR: 1.000e-03
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 170152.6875, LR: 1.000e-03
Final parameters: [[-47.433903 -42.08653  -42.588608 -41.05655  -37.881584 -36.20875
  -42.903183 -41.539223 -48.43857  -48.30403 ]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 90219.8984, LR: 1.000e-03
Final parameters: [[-35.26702  -31.74852  -29.824865 -28.165958 -25.296488 -26.418747
  -30.976404 -31.502094 -38.265526 -37.463894]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 36021.4375, LR: 1.000e-03
Final parameters: [[-23.40094  -19.127851 -17.509352 -14.425954 -13.559609 -14.45078
  -21.089912 -20.442686 -26.898489 -25.322037]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 4212.4478, LR: 1.000e-03
Final parameters: [[-10.18298     -6.414854    

In [9]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0_e3") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=None, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0_e3")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 232685.3281, LR: 1.000e-03
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 153203.7188, LR: 1.000e-03
Final parameters: [[-47.433903 -42.08653  -42.588608 -41.05655  -37.881584 -36.20875
  -42.903183 -41.539223 -48.43857  -48.30403 ]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 81233.2109, LR: 1.000e-03
Final parameters: [[-35.26702  -31.748522 -29.824865 -28.165958 -25.296488 -26.418747
  -30.976404 -31.502092 -38.265526 -37.463898]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 32433.4551, LR: 1.000e-03
Final parameters: [[-23.400938 -19.127851 -17.509352 -14.425954 -13.55961  -14.45078
  -21.089912 -20.442644 -26.89849  -25.322037]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 3792.1633, LR: 1.000e-03
Final parameters: [[-10.182979   -6.414855   -3

In [10]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_gamma_0_e4") 
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.0001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=None, writer=writer)
writer = SummaryWriter("runs/LSTMC_gamma_0_e4")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Param Shape torch.Size([10, 1])


  return torch.tensor(preprocessed).to(gradients.device)


Epoch [1/500], Cumulative Loss: 232685.3281, LR: 1.000e-04
Final parameters: [[-56.50897  -53.47805  -52.255253 -51.560833 -51.361187 -45.479355
  -51.359352 -51.721733 -57.24269  -58.77405 ]]
Param Shape torch.Size([10, 1])
Epoch [2/500], Cumulative Loss: 233961.1406, LR: 1.000e-04
Final parameters: [[-57.65584  -52.655945 -53.384445 -51.997425 -48.91757  -45.666992
  -52.54768  -49.802753 -57.337666 -58.01194 ]]
Param Shape torch.Size([10, 1])
Epoch [3/500], Cumulative Loss: 220886.4531, LR: 1.000e-04
Final parameters: [[-55.829365 -52.958355 -51.44749  -50.05098  -47.349697 -45.452236
  -50.483547 -48.312386 -56.304104 -57.05438 ]]
Param Shape torch.Size([10, 1])
Epoch [4/500], Cumulative Loss: 211067.3594, LR: 1.000e-04
Final parameters: [[-54.646168 -51.203163 -50.09778  -47.298798 -46.586906 -43.33314
  -51.113052 -46.601032 -54.7307   -55.285213]]
Param Shape torch.Size([10, 1])
Epoch [5/500], Cumulative Loss: 203845.0781, LR: 1.000e-04
Final parameters: [[-54.228737 -50.82611  

## Linear Optimizee

In [66]:
class LinearNNOptimizee(Optimizee):
    """
    Class for a generic linear neural network optimizee.
    """
    def __init__(self, dataloader, input_size, hidden_size, output_size):
        """
        """
        super().__init__()
        self.dataloader = dataloader
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.model = nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, output_size))
        self.num_params = sum(p.numel() for p in self.model.parameters())
    
    
    def set_params(self, params=None):
        """
        Given a tensor of shape (d,1), sets the parameters of the optimizee.
        """
        if params is not None:
            params = torch.flatten(params)
            i = 0
            for param in self.model.parameters():
                param_size = param.numel()
                param.data = params[i:i + param_size].view_as(param)
                i += param_size


    def compute_loss(self, params, num_samples=10, return_grad=True):
        self.set_params(params)  # Set model parameters
        total_loss = None
        dataloader_iter = iter(self.dataloader)
        for _ in range(num_samples):
            try: inputs, targets = next(dataloader_iter)  # Get a batch
            except StopIteration: dataloader_iter = iter(self.dataloader); inputs, targets = next(dataloader_iter)
            outputs = self.model(inputs.flatten())  # Forward pass
            targets_oh = torch.zeros_like(outputs)
            targets_oh[targets] = 1
            loss = torch.norm((outputs - targets_oh) ** 2)
            total_loss = loss if total_loss is None else total_loss + loss
        total_loss = total_loss / num_samples

        if return_grad:
            grads = torch.autograd.grad(total_loss, self.model.parameters(), create_graph=True)
            grads = torch.cat([g.flatten() for g in grads]).unsqueeze(-1)
            return total_loss, grads.detach()
        else:
            return total_loss

    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        return torch.cat([p.flatten() for p in self.model.parameters()]).unsqueeze(-1)
    

In [67]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define a transform (convert images to tensors and normalize)
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Download and load the training dataset
train_dataset = datasets.MNIST(root="./data", train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

dataloader_iter = iter(train_loader)
inputs, targets = next(dataloader_iter)
print("Input Shape", inputs.shape)
print("Target Shape", targets.shape)

Input Shape torch.Size([1, 1, 28, 28])
Target Shape torch.Size([1])


In [68]:
kwargs = {"dataloader": train_loader, "input_size": 784, "hidden_size": 20, "output_size": 10}

lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=True)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
# lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, LinearNNOptimizee, kwargs, num_epochs=10, time_horizon=200, discount=1e-3)
# params = test_LSTM(lstm_optimizer, LinearNNOptimizee, kwargs, time_horizon=100)

## Intermediate Optimizee

In [14]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X.shape

(442, 10)

In [15]:
class MLOptimizee(Optimizee):
    """
    Class for a generic linear neural network optimizee.
    """
    def __init__(self, X, y):
        """
        """
        super().__init__()
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.input_size = X.shape[1]
        try: self.output_size = y.shape[1]
        except: self.output_size = 1
        self.model = nn.Linear(self.input_size, self.output_size)
        self.num_params = sum(p.numel() for p in self.model.parameters())
    
    
    def set_params(self, params=None):
        """
        Given a tensor of shape (d,1), sets the parameters of the optimizee.
        """
        if params is not None:
            params = torch.flatten(params)
            # print("Params", params.shape)
            torch.nn.utils.vector_to_parameters(params, self.model.parameters())

    def compute_loss(self, params, return_grad=True):
        self.set_params(params)  # Set model parameters
        outputs = self.model(self.X)
        # print("Outputs", outputs[:10])
        loss = torch.norm((outputs - self.y) ** 2)/len(self.X)
        
        if return_grad:
            grads = torch.autograd.grad(loss, self.model.parameters(), create_graph=True)
            grads = torch.cat([g.flatten() for g in grads]).unsqueeze(-1)
            detached_grads = torch.tensor(grads.data, requires_grad=True)
            return loss, detached_grads
        return loss

    
    def all_parameters(self):
        """
        Returns all parameters of the optimizee, as a tensor of shape (d,1).
        """
        params = self.model.parameters()
        param_vector = torch.nn.utils.parameters_to_vector(params)
        # print("Param Vector", param_vector.shape)
        return param_vector.unsqueeze(-1)
    

In [19]:
# Delete folders if present
try:
    shutil.rmtree("diabetes_logs_train")
    shutil.rmtree("diabetes_logs")
except:
    pass

kwargs = {"X": X, "y": y}

lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=False)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
writer = SummaryWriter("diabetes_logs_train")
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, MLOptimizee, kwargs, num_epochs=10, time_horizon=500, discount=1e-3, writer=writer)
writer = SummaryWriter("diabetes_logs")
params = test_LSTM(lstm_optimizer, MLOptimizee, kwargs, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Param Shape torch.Size([11, 1])


  detached_grads = torch.tensor(grads.data, requires_grad=True)


Epoch [1/10], Cumulative Loss: 26409.3223, LR: 1.000e-02
Final parameters: [[54.90152  54.89626  52.220974 50.123592 47.395493 45.812008 46.071133
  45.56021  44.772232 44.44793  46.60088 ]]
Param Shape torch.Size([11, 1])
Epoch [2/10], Cumulative Loss: 26331.0410, LR: 1.000e-02
Final parameters: [[54.538387 54.84861  52.669174 49.63491  47.596508 45.71218  46.26066
  45.81694  44.862206 44.60245  46.91403 ]]
Param Shape torch.Size([11, 1])
Epoch [3/10], Cumulative Loss: 26410.4180, LR: 1.000e-02
Final parameters: [[54.73626  54.468693 52.492043 49.5874   47.63421  45.648804 46.158978
  45.83159  45.00227  44.18992  46.59435 ]]
Param Shape torch.Size([11, 1])
Epoch [4/10], Cumulative Loss: 26310.0742, LR: 1.000e-02
Final parameters: [[54.647552 54.359432 52.612976 49.762352 47.211872 45.929234 46.074646
  45.885403 44.96884  44.5567   46.9971  ]]
Param Shape torch.Size([11, 1])
Epoch [5/10], Cumulative Loss: 26319.2461, LR: 1.000e-02
Final parameters: [[54.74715  54.518055 52.177307 49

KeyboardInterrupt: 