In [1]:
from tqdm.notebook import tqdm  # Import tqdm for Jupyter Notebook
from src.optimizee import *
from src.torch_utils import *

from torch.utils.tensorboard import SummaryWriter

In [7]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))

# Create the quadratic function optimizee
optimizee = QuadraticOptimizee(W, theta0)
params = optimizee.get_initial_params()

optimizer = optim.Adam([params], lr=0.1)
writer = SummaryWriter("runs/Adam")
# Training loop
num_epochs = 1000
for epoch in tqdm(range(num_epochs), desc="Training Progress", unit="epoch"):
    optimizer.zero_grad()
    loss = optimizee.compute_loss(params)
    writer.add_scalar("Loss", loss,epoch)
    loss.backward()
    optimizer.step()

# Final parameters and loss
print("\nTraining complete!")
print(f"Final parameters: {params.detach().numpy().T}")
print(f"Final loss: {loss.item():.4f}")

Training Progress:   0%|          | 0/1000 [00:00<?, ?epoch/s]


Training complete!
Final parameters: [[1.0655398  1.6188935  0.6811273  1.0631032  0.6347599  0.7418846
  0.98628175 0.6697677  0.81006145 1.5054967 ]]
Final loss: 0.0005


In [3]:
class LSTMOptimizer(nn.Module):
    """
    LSTM-based optimizer as described in the paper.
    """
    def __init__(self, hidden_size=20, preproc=True, preproc_factor=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.preproc = preproc
        self.preproc_factor = torch.tensor(preproc_factor)
        self.preproc_threshold = float(torch.exp(self.preproc_factor))
        
        self.input_size = 2 if preproc else 1
        self.lstm = nn.LSTM(self.input_size, hidden_size, 2, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, 1)


    def forward(self, x, hidden_state):
        """
        Forward pass of the LSTM optimizer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_size).
            hidden_state (tuple): Hidden state of the LSTM (h, c).

        Returns:
            torch.Tensor: Output updates of shape (batch_size, sequence_length, 1).
            tuple: Updated hidden state.
        """
        if self.preproc: x = self.preprocess_gradients(x)

        # print("Preprocess Shape", x.shape)
        out, hidden_state = self.lstm(x, hidden_state)
        out = self.output_layer(out)
        # print("Output Shape", out.shape)
        return out, hidden_state



    def preprocess_gradients(self, gradients):
        """ Applies log transformation & sign extraction to gradients, moving to CUDA if available. """

        gradients = gradients.data  # Extract raw gradient data
        
        param_size = gradients.size(0)
        preprocessed = torch.zeros(param_size, 2)

        # Identify large gradients (above threshold)
        keep_grads = (torch.abs(gradients) >= self.preproc_threshold).squeeze()
        
        # Log transformation for large gradients
        preprocessed[:, 0][keep_grads] = (torch.log(torch.abs(gradients[keep_grads]) + 1e-8) / self.preproc_factor).squeeze()
        preprocessed[:, 1][keep_grads] = torch.sign(gradients[keep_grads]).squeeze()

        # Direct scaling for small gradients
        preprocessed[:, 0][~keep_grads] = -1
        preprocessed[:, 1][~keep_grads] = (float(torch.exp(self.preproc_factor)) * gradients[~keep_grads]).squeeze()

        return torch.tensor(preprocessed)
    

    def initialize_hidden_state(self):
        # Initialize hidden & cell states for LSTM (one per parameter)
        self.h0 = to_cuda(torch.zeros(2, self.hidden_size))
        self.c0 = to_cuda(torch.zeros(2, self.hidden_size))
        return (self.h0, self.c0)

In [4]:
def train_LSTM(lstm_optimizer, meta_optimizer, optimizee_class, optimizee_kwargs, num_epochs=500, time_horizon=200, discount=1, writer=None, scheduler=None):
    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.ConstantLR(meta_optimizer, factor=1.0, total_iters=num_epochs)
    
    with tqdm(range(num_epochs), desc="Training Progress") as pbar:
        for epoch in pbar:
            # Initialize optimizee parameters
            optimizee = optimizee_class(**optimizee_kwargs)
            params = optimizee.get_initial_params()
            hidden_state = lstm_optimizer.initialize_hidden_state()

            cumulative_loss = None
            for t in range(time_horizon):
                loss = optimizee.compute_loss(params)
                cumulative_loss = loss*discount**(time_horizon) if cumulative_loss is None else cumulative_loss + loss*discount**(time_horizon-t)

                # Compute gradients of the loss w.r.t. the parameters
                grad_params = torch.autograd.grad(loss, params, create_graph=True)[0]
                grad_params = grad_params.detach()
                # print("Grads", grad_params.shape)
                updates, hidden_state = lstm_optimizer(grad_params, hidden_state)
                params = params - updates  # Update parameters

            # Backpropagation through time (BPTT)
            # cumulative_loss = loss

            if writer: writer.add_scalar("Loss", cumulative_loss, epoch)
            meta_optimizer.zero_grad()
            cumulative_loss.backward()
            torch.nn.utils.clip_grad_norm_(lstm_optimizer.parameters(), 1)
            meta_optimizer.step()
            scheduler.step()

            # Update progress bar
            pbar.set_postfix(loss=cumulative_loss.item())
            if (epoch + 1) % 50 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Cumulative Loss: {cumulative_loss.item():.4f}, True Loss: {np.linalg.norm(params.detach().numpy() - theta0):.4f}")
                print(f"Final parameters: {params.detach().numpy().T}")
                
    print("\nTraining complete!")
    return lstm_optimizer


def test_LSTM(lstm_optimizer, optimizee_cls, optimizee_kwargs, time_horizon=200, writer=None):
    optimizee = optimizee_cls(**optimizee_kwargs)
    params = optimizee.get_initial_params()
    hidden_state = lstm_optimizer.initialize_hidden_state()
    for t in range(time_horizon):
        loss = optimizee.compute_loss(params)
        if writer: writer.add_scalar("Loss", loss, t)

        grad_params = torch.autograd.grad(loss, params, create_graph=True)[0]
        grad_params = grad_params.detach()

        updates, hidden_state = lstm_optimizer(grad_params, hidden_state)
        params = params - updates 

    print(f"Final parameters: {params.detach().numpy().T}")
    print(f"Final loss: {optimizee.compute_loss(params).item():.4f}")

In [6]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
print("W", W)

lstm_optimizer = LSTMOptimizer()
writer = SummaryWriter("train/LSTM")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.0001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer)
writer = SummaryWriter("runs/LSTM")
test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

W [[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
   0.95008842 -0.15135721 -0.10321885  0.4105985 ]
 [ 0.14404357  1.45427351  0.76103773  0.12167502  0.44386323  0.33367433
   1.49407907 -0.20515826  0.3130677  -0.85409574]
 [-2.55298982  0.6536186   0.8644362  -0.74216502  2.26975462 -1.45436567
   0.04575852 -0.18718385  1.53277921  1.46935877]
 [ 0.15494743  0.37816252 -0.88778575 -1.98079647 -0.34791215  0.15634897
   1.23029068  1.20237985 -0.38732682 -0.30230275]
 [-1.04855297 -1.42001794 -1.70627019  1.9507754  -0.50965218 -0.4380743
  -1.25279536  0.77749036 -1.61389785 -0.21274028]
 [-0.89546656  0.3869025  -0.51080514 -1.18063218 -0.02818223  0.42833187
   0.06651722  0.3024719  -0.63432209 -0.36274117]
 [-0.67246045 -0.35955316 -0.81314628 -1.7262826   0.17742614 -0.40178094
  -1.63019835  0.46278226 -0.90729836  0.0519454 ]
 [ 0.72909056  0.12898291  1.13940068 -1.23482582  0.40234164 -0.68481009
  -0.87079715 -0.57884966 -0.31155253  0.05616534]

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed)


Epoch [50/500], Cumulative Loss: 678.0087, True Loss: 4.4608
Final parameters: [[-1.7408055  -0.6848107   1.9327322   1.7622056   1.1565609   2.144124
  -0.38439995 -0.5949832  -0.39030692  0.3918745 ]]
Epoch [100/500], Cumulative Loss: 1.7167, True Loss: 0.6350
Final parameters: [[1.2065135  1.3939402  0.7177888  1.0273954  1.0677747  1.0888687
  0.8006659  0.76449645 0.997844   1.1313748 ]]
Epoch [150/500], Cumulative Loss: 40.3085, True Loss: 2.4126
Final parameters: [[1.3239648  0.49238402 0.9104134  0.80154693 0.49345618 0.6382917
  1.8699192  1.1657548  0.9001383  3.0564482 ]]
Epoch [200/500], Cumulative Loss: 1.7670, True Loss: 1.2464
Final parameters: [[1.1339531  0.5654306  0.88947594 1.0246433  1.7561078  1.4821682
  0.830778   1.1263865  1.392892   0.4249072 ]]
Epoch [250/500], Cumulative Loss: 42.9599, True Loss: 2.8247
Final parameters: [[ 1.148145    0.8831059   0.8568987   1.078455   -0.02480925  0.47527987
   1.9699063   0.9989253   0.73728865  3.3624678 ]]
Epoch [300/5

In [7]:
lstm_optimizer = LSTMOptimizer()
writer = SummaryWriter("train/LSTM_Scheduled")
scheduler = torch.optim.lr_scheduler.ExponentialLR(meta_optimizer, gamma=0.99)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.0001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=200, discount=0.9,scheduler=scheduler, writer=writer)
writer = SummaryWriter("runs/LSTM_Scheduled")
test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed)


Epoch [50/500], Cumulative Loss: 1.0409, True Loss: 1.4085
Epoch [100/500], Cumulative Loss: 0.2729, True Loss: 0.3145
Epoch [150/500], Cumulative Loss: 0.1629, True Loss: 0.6578
Epoch [200/500], Cumulative Loss: 0.2133, True Loss: 0.6015
Epoch [250/500], Cumulative Loss: 0.3481, True Loss: 0.5417
Epoch [300/500], Cumulative Loss: 0.5037, True Loss: 0.9331
Epoch [350/500], Cumulative Loss: 0.1810, True Loss: 0.5045
Epoch [400/500], Cumulative Loss: 0.8420, True Loss: 3.0661
Epoch [450/500], Cumulative Loss: 1.3890, True Loss: 0.8003
Epoch [500/500], Cumulative Loss: 1.8909, True Loss: 1.4473

Training complete!
Final parameters: [[0.9348897  1.0122341  0.98700523 1.0252625  0.8976317  1.0598589
  1.0697854  0.88492054 1.0005735  0.9322422 ]]
Final loss: 0.0214


In [8]:
W = np.random.randn(n, n)
theta0 = np.ones((n, 1)) * 2
writer = SummaryWriter("runs/LSTM_on_Different_Theta0")
test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

  return torch.tensor(preprocessed)


Final parameters: [[2.1140873 2.2084498 1.439697  2.131821  2.1317496 2.820525  1.6001282
  2.0870872 2.185955  2.0423636]]
Final loss: 0.0271


In [8]:
# Training setup
n = 20  # Dimension of theta
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))*1

writer = SummaryWriter("runs/LSTM_Different_n")
test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

  return torch.tensor(preprocessed)


Final parameters: [[ 1.4611137   0.24016447  1.6320393   2.376492    1.0171142   0.8914471
   0.7716255   1.0323058  -0.4345507   0.08845386  2.3286564   0.8356896
   1.0963686   1.5145445   0.10581642  1.2890223   0.61576074  1.3796409
   1.2943015   1.0529432 ]]
Final loss: 0.3115
