## Imports and Classes

In [None]:
from tqdm.notebook import tqdm  # Import tqdm for Jupyter Notebook
from src.optimizee import *
from src.torch_utils import *

from torch.utils.tensorboard import SummaryWriter
# from optimizer_concurrent import *
# from train_concurrent import *

In [2]:
class LSTMConcurrent(nn.Module):
    """
    LSTM-based optimizer as described in the paper.
    """
    def __init__(self, num_optims, hidden_size=20, preproc=True, preproc_factor=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.preproc = preproc
        self.preproc_factor = torch.tensor(preproc_factor)
        self.preproc_threshold = float(torch.exp(self.preproc_factor))
        
        self.input_size = 2*num_optims if preproc else 1*num_optims
        self.lstm = nn.LSTM(self.input_size, hidden_size, 2, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, 1)


    def forward(self, x, hidden_state):
        """
        Forward pass of the LSTM optimizer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_size).
            hidden_state (tuple): Hidden state of the LSTM (h, c).

        Returns:
            torch.Tensor: Output updates of shape (batch_size, sequence_length, 1).
            tuple: Updated hidden state.
        """
        if self.preproc: x = self.preprocess_gradients(x)

        # print("Preprocess Shape", x.shape)
        out, hidden_state = self.lstm(x, hidden_state)
        out = self.output_layer(out)

        return out, hidden_state



    def preprocess_gradients(self, gradients):
        """ Applies log transformation & sign extraction to gradients, moving to CUDA if available. """

        gradients = gradients.data  # Extract raw gradient data
        print("Gradients Shape", gradients.shape)
        if len(gradients.size()) != 2: raise Exception
        
        param_size = gradients.size(0)
        num_optims = gradients.size(1)

        preprocessed = torch.zeros(param_size, 2*num_optims)

        for i in range(num_optims):
            gradient = gradients[:,i]
            keep_grads = (torch.abs(gradient) >= self.preproc_threshold)
        
            # Log transformation for large gradients
        
            preprocessed[keep_grads, 2*i] = (torch.log(torch.abs(gradient[keep_grads]) + 1e-8) / self.preproc_factor)
            preprocessed[keep_grads, 2*i+1] = torch.sign(gradient[keep_grads])

            # Direct scaling for small gradients
            preprocessed[~keep_grads, 2*i] = -1
            preprocessed[~keep_grads, 2*i+1] = (float(torch.exp(self.preproc_factor)) * gradient[~keep_grads])

        # print(preprocessed.shape)
        return torch.tensor(preprocessed).to(gradients.device)
    

    def initialize_hidden_state(self):
        # Initialize hidden & cell states for LSTM (one per parameter)
        self.h0 = to_cuda(torch.zeros(2, self.hidden_size))
        self.c0 = to_cuda(torch.zeros(2, self.hidden_size))
        return (self.h0, self.c0)
    

In [3]:
def initialize_optimizees(optimizee_cls, optimizee_kwargs, num_optimizees=10, noise='equal'):
    optimizees = []
    for i in range(num_optimizees):
        if noise == 'equal':
            optimizees.append(optimizee_cls(**optimizee_kwargs))
        else:
            optimizee_kwargs['noise_std'] = 0.01 * (i+1)
            optimizees.append(optimizee_cls(**optimizee_kwargs))
    return optimizees




def train_LSTM(lstm_optimizer, meta_optimizer, optimizee_class, optimizee_kwargs, num_optimizees=1, num_epochs=500, time_horizon=200, discount=1, scheduler = None, noise='equal', writer=None):
    
    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.ConstantLR(meta_optimizer, factor=1.0, total_iters=num_epochs)

    
    with tqdm(range(num_epochs), desc="Training Progress") as pbar:
        for epoch in pbar:
            # Initialize optimizee parameters
            optimizees = initialize_optimizees(optimizee_class, optimizee_kwargs, num_optimizees, noise=noise)
            params = optimizees[0].get_initial_params()

            params_shape = params.shape
            params = params.flatten()

            hidden_state = lstm_optimizer.initialize_hidden_state()

            cumulative_loss = None             
            for t in range(time_horizon):
                gradients = []
                for i in range(num_optimizees):
                    optimizee = optimizees[i]
                    loss = optimizee.compute_loss(params)
                    if i == 0: cumulative_loss = loss*discount**(time_horizon) if cumulative_loss is None else cumulative_loss + loss*discount**(time_horizon-t)
                    
                    # Compute gradients of the loss w.r.t. the parameters
                    grad_params = torch.autograd.grad(loss, params, create_graph=True)[0]
                    grad_params = grad_params.detach()
                    gradients.append(grad_params.flatten().squeeze(-1))

                # Stack gradients
                grad_params = torch.stack(gradients).T
                # print("Grads", grad_params.shape)
                update, hidden_state = lstm_optimizer(grad_params, hidden_state)
                # print("Update", update.shape)
                # print("Params", params.shape)
                params = params - update
                params = params.reshape(params_shape)

                
            # Backpropagation through time (BPTT)
            if writer: writer.add_scalar("Loss", cumulative_loss, epoch)
            meta_optimizer.zero_grad()
            cumulative_loss.backward()
            torch.nn.utils.clip_grad_norm_(lstm_optimizer.parameters(), 1)
            meta_optimizer.step()
            scheduler.step()

            # Update progress bar
            pbar.set_postfix(loss=cumulative_loss.item())
            if (epoch + 1) % 50 == 0:
                current_lr = meta_optimizer.param_groups[0]['lr']
                print(f"Epoch [{epoch+1}/{num_epochs}], Cumulative Loss: {cumulative_loss.item():.4f}, LR: {current_lr:.3e}")
                print(f"Final parameters: {params.detach().numpy().T}")
                
    print("\nTraining complete!")
    return lstm_optimizer




def test_LSTM(lstm_optimizer, optimizee_class, optimizee_kwargs, num_optimizees=1, time_horizon=200, noise='equal', writer=None):
    optimizees = initialize_optimizees(optimizee_class, optimizee_kwargs, num_optimizees, noise=noise)    
    params = optimizees[0].get_initial_params()
    hidden_state = lstm_optimizer.initialize_hidden_state()
    for t in range(time_horizon):
        gradients = []
        for i in range(num_optimizees):
            optimizee = optimizees[i]
            loss = optimizee.compute_loss(params)
            if writer and i==0: writer.add_scalar("Loss", loss, t)
            grad_params = torch.autograd.grad(loss, params, create_graph=True)[0]
            grad_params = grad_params.detach()
            gradients.append(grad_params.squeeze(-1))
        
        grad_params = torch.stack(gradients).T
        if len(grad_params.shape)==1: grad_params = grad_params.unsqueeze(-1)

        # print(grad_params.shape)
        updates, hidden_state = lstm_optimizer(grad_params, hidden_state)
        params = params - updates 

    print(f"Final parameters: {params.detach().numpy().T}")
    return params

## 1 Optimizee Tests

In [6]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
print("W", W)

lstm_optimizer = LSTMConcurrent(num_optims=1)
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.0001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9)
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000)

W [[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
   0.95008842 -0.15135721 -0.10321885  0.4105985 ]
 [ 0.14404357  1.45427351  0.76103773  0.12167502  0.44386323  0.33367433
   1.49407907 -0.20515826  0.3130677  -0.85409574]
 [-2.55298982  0.6536186   0.8644362  -0.74216502  2.26975462 -1.45436567
   0.04575852 -0.18718385  1.53277921  1.46935877]
 [ 0.15494743  0.37816252 -0.88778575 -1.98079647 -0.34791215  0.15634897
   1.23029068  1.20237985 -0.38732682 -0.30230275]
 [-1.04855297 -1.42001794 -1.70627019  1.9507754  -0.50965218 -0.4380743
  -1.25279536  0.77749036 -1.61389785 -0.21274028]
 [-0.89546656  0.3869025  -0.51080514 -1.18063218 -0.02818223  0.42833187
   0.06651722  0.3024719  -0.63432209 -0.36274117]
 [-0.67246045 -0.35955316 -0.81314628 -1.7262826   0.17742614 -0.40178094
  -1.63019835  0.46278226 -0.90729836  0.0519454 ]
 [ 0.72909056  0.12898291  1.13940068 -1.23482582  0.40234164 -0.68481009
  -0.87079715 -0.57884966 -0.31155253  0.05616534]

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed)


Epoch [50/500], Cumulative Loss: 678.0087, LR: 0.0001
Final parameters: [[-1.7408055  -0.6848107   1.9327322   1.7622056   1.1565609   2.144124
  -0.38439995 -0.5949832  -0.39030692  0.3918745 ]]
Epoch [100/500], Cumulative Loss: 1.7167, LR: 0.0001
Final parameters: [[1.2065135  1.3939402  0.7177888  1.0273954  1.0677747  1.0888687
  0.8006659  0.76449645 0.997844   1.1313748 ]]
Epoch [150/500], Cumulative Loss: 40.3085, LR: 0.0001
Final parameters: [[1.3239648  0.49238402 0.9104134  0.80154693 0.49345618 0.6382917
  1.8699192  1.1657548  0.9001383  3.0564482 ]]
Epoch [200/500], Cumulative Loss: 1.7670, LR: 0.0001
Final parameters: [[1.1339531  0.5654306  0.88947594 1.0246433  1.7561078  1.4821682
  0.830778   1.1263865  1.392892   0.4249072 ]]
Epoch [250/500], Cumulative Loss: 42.9599, LR: 0.0001
Final parameters: [[ 1.148145    0.8831059   0.8568987   1.078455   -0.02480925  0.47527987
   1.9699063   0.9989253   0.73728865  3.3624678 ]]
Epoch [300/500], Cumulative Loss: 1.6532, LR: 0

tensor([[0.8799],
        [0.0421],
        [1.4925],
        [0.8432],
        [1.3703],
        [1.2592],
        [1.0072],
        [1.4704],
        [1.2052],
        [0.2789]], grad_fn=<SubBackward0>)

In [21]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))

lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_e3")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer)
writer = SummaryWriter("runs/LSTMC_e3")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed).to(gradients.device)


Epoch [50/500], Cumulative Loss: 2.2705, LR: 1.000e-03
Final parameters: [[1.1133772  1.149434   0.84593624 0.919465   0.92678416 0.87439495
  0.79249763 0.8574801  0.8922913  0.9268453 ]]
Epoch [100/500], Cumulative Loss: 2.5885, LR: 1.000e-03
Final parameters: [[1.0231252  1.2983627  0.69567657 1.04205    0.80693126 0.968395
  0.98893225 0.7866773  0.936258   1.2409269 ]]
Epoch [150/500], Cumulative Loss: 1.8983, LR: 1.000e-03
Final parameters: [[0.9064889 0.3938859 1.2137818 0.9260122 1.4346564 1.2626358 0.9020556
  1.275489  1.2780683 0.501418 ]]
Epoch [200/500], Cumulative Loss: 2.2537, LR: 1.000e-03
Final parameters: [[0.8352225  0.49092412 0.91841835 0.89361143 1.2851164  1.1536566
  0.9083565  1.0939188  1.1350919  0.632731  ]]
Epoch [250/500], Cumulative Loss: 3.5538, LR: 1.000e-03
Final parameters: [[0.8916057  0.3681173  1.6546191  0.97006637 1.0988685  1.0951537
  1.2061218  1.4076147  0.9554615  0.582952  ]]
Epoch [300/500], Cumulative Loss: 2.9312, LR: 1.000e-03
Final par

In [16]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_e4")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.0001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer)
writer = SummaryWriter("runs/LSTMC_e4")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed)


Epoch [50/500], Cumulative Loss: 678.0087, LR: 1.000e-04
Final parameters: [[-1.7408055  -0.6848107   1.9327322   1.7622056   1.1565609   2.144124
  -0.38439995 -0.5949832  -0.39030692  0.3918745 ]]
Epoch [100/500], Cumulative Loss: 1.7167, LR: 1.000e-04
Final parameters: [[1.2065135  1.3939402  0.7177888  1.0273954  1.0677747  1.0888687
  0.8006659  0.76449645 0.997844   1.1313748 ]]
Epoch [150/500], Cumulative Loss: 40.3085, LR: 1.000e-04
Final parameters: [[1.3239648  0.49238402 0.9104134  0.80154693 0.49345618 0.6382917
  1.8699192  1.1657548  0.9001383  3.0564482 ]]
Epoch [200/500], Cumulative Loss: 1.7670, LR: 1.000e-04
Final parameters: [[1.1339531  0.5654306  0.88947594 1.0246433  1.7561078  1.4821682
  0.830778   1.1263865  1.392892   0.4249072 ]]
Epoch [250/500], Cumulative Loss: 42.9599, LR: 1.000e-04
Final parameters: [[ 1.148145    0.8831059   0.8568987   1.078455   -0.02480925  0.47527987
   1.9699063   0.9989253   0.73728865  3.3624678 ]]
Epoch [300/500], Cumulative Loss

In [17]:
torch.manual_seed(0)
np.random.seed(0)
n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_Step_Schedule")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
lr_scheduler = torch.optim.lr_scheduler.StepLR(meta_optimizer, step_size=100, gamma=0.1)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer, scheduler=lr_scheduler)
writer = SummaryWriter("runs/LSTMC_Step_Schedule")
result = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed)


Epoch [50/500], Cumulative Loss: 12.0202, LR: 1.000e-02
Final parameters: [[ 0.8882733  -1.0680922   2.1956627   0.77448887  2.196385    1.7232606
   1.02616     1.9731619   1.6679549  -0.37158632]]
Epoch [100/500], Cumulative Loss: 22.0423, LR: 1.000e-03
Final parameters: [[ 1.0604126   3.5712333   0.46037132  1.3474061  -1.210639   -0.7595431
   1.4687026  -0.59579945 -0.5196248   3.8281374 ]]
Epoch [150/500], Cumulative Loss: 20.8341, LR: 1.000e-03
Final parameters: [[1.3253913  1.4245012  0.5266191  0.9558802  0.93468213 0.80034304
  0.48085028 0.75322926 0.8159064  1.1726106 ]]
Epoch [200/500], Cumulative Loss: 22.1335, LR: 1.000e-04
Final parameters: [[0.7298634  1.9048271  1.6803379  1.2692845  0.12147462 0.43305537
  1.6555943  0.7352497  0.24044266 2.3438215 ]]
Epoch [250/500], Cumulative Loss: 10.7935, LR: 1.000e-04
Final parameters: [[0.56660473 0.04903781 1.5302255  0.8849963  1.4943315  1.2028708
  1.384715   1.4739088  1.2819226  0.30835834]]
Epoch [300/500], Cumulative L

In [11]:
torch.manual_seed(0)
np.random.seed(0)
n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))
lstm_optimizer = LSTMConcurrent(num_optims=1)
writer = SummaryWriter("train/LSTMC_Exp_Schedule")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.01)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(meta_optimizer, gamma=0.95)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer, scheduler=lr_scheduler)
writer = SummaryWriter("runs/LSTMC_Exp_Schedule")
result = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

  return torch.tensor(preprocessed)


Epoch [50/500], Cumulative Loss: 8.2322, LR: 6.050e-03
Final parameters: [[1.0652342  1.968668   0.82195055 1.1188666  0.56229705 0.5385603
  0.95032275 0.5598425  0.87714374 1.960539  ]]
Epoch [100/500], Cumulative Loss: 11.0390, LR: 3.660e-03
Final parameters: [[1.1109518  1.9364738  0.7058257  1.2155671  0.7280866  0.86177427
  1.304956   0.5621171  0.80596465 1.7954679 ]]
Epoch [150/500], Cumulative Loss: 10.9900, LR: 2.215e-03
Final parameters: [[0.6852528  0.2635373  1.1815398  0.7991682  1.2164308  1.1358552
  0.82237744 1.1198775  0.9891465  0.5303604 ]]
Epoch [200/500], Cumulative Loss: 8.2464, LR: 1.340e-03
Final parameters: [[0.9026416  0.26370406 1.2780409  1.0735267  1.3979259  1.3850411
  1.1162341  1.3254683  1.3142105  0.70383805]]
Epoch [250/500], Cumulative Loss: 6.6639, LR: 8.106e-04
Final parameters: [[ 0.7653771  -0.7658654   1.9213499   0.90522426  1.8969517   1.5288551
   1.1529952   2.00451     1.3669827  -0.10523983]]
Epoch [300/500], Cumulative Loss: 7.3453, L

In [18]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))

lstm_optimizer = LSTMConcurrent(num_optims=1, preproc=False)
writer = SummaryWriter("train/LSTMC_no_preproc")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_epochs=500, time_horizon=500, discount=0.9, writer=writer)
writer = SummaryWriter("runs/LSTMC_no_preproc")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, time_horizon=1000, writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Epoch [50/500], Cumulative Loss: 0.0452, LR: 1.000e-03
Final parameters: [[1.0031648  0.5671853  1.13669    0.96303606 1.3589103  1.250922
  0.96590894 1.2179585  1.220603   0.5857313 ]]
Epoch [100/500], Cumulative Loss: 2.5647, LR: 1.000e-03
Final parameters: [[ 1.1828347   2.91623     0.05856757  1.2521417  -0.04200763  0.33175182
   1.0580771   0.10083254  0.5222156   2.722088  ]]
Epoch [150/500], Cumulative Loss: 0.2814, LR: 1.000e-03
Final parameters: [[0.974594   1.4751192  0.85049856 1.0327634  0.5316286  0.65778404
  1.0701429  0.7535344  0.7166051  1.5164716 ]]
Epoch [200/500], Cumulative Loss: 2.6568, LR: 1.000e-03
Final parameters: [[1.0777644  2.4571126  0.34948862 1.167261   0.06889424 0.34417182
  1.086774   0.3207243  0.5404997  2.3271315 ]]
Epoch [250/500], Cumulative Loss: 0.1593, LR: 1.000e-03
Final parameters: [[1.0301814  1.3817468  0.85711896 1.0263888  0.7105358  0.77801305
  1.0164049  0.800079   0.82843816 1.3309232 ]]
Epoch [300/500], Cumulative Loss: 0.1715, L

## Concurrent Tests

In [25]:
torch.manual_seed(0)
np.random.seed(0)

n = 10
W = np.random.randn(n, n)
theta0 = np.ones((n, 1))

lstm_optimizer = LSTMConcurrent(num_optims=10, preproc=False)
writer = SummaryWriter("train_multi/LSTMC_no_preproc")
meta_optimizer = optim.Adam(lstm_optimizer.parameters(), lr=0.001)
lstm_optimizer = train_LSTM(lstm_optimizer, meta_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_optimizees=10, num_epochs=500, time_horizon=500, discount=0.9, noise='increasing', writer=writer)
writer = SummaryWriter("runs_multi/LSTMC_no_preproc")
params = test_LSTM(lstm_optimizer, QuadraticOptimizee, {"W": W, "theta0": theta0}, num_optimizees=10, time_horizon=1000, noise='increasing', writer=writer)

Training Progress:   0%|          | 0/500 [00:00<?, ?it/s]

Epoch [50/500], Cumulative Loss: 15.7948, LR: 1.000e-03
Final parameters: [[ 1.7238102   3.4446235  -0.63094324  1.3475721   0.47865286  0.5868858
   0.43750548 -0.41637543  0.7697473   2.2933662 ]]
Epoch [100/500], Cumulative Loss: 5.6111, LR: 1.000e-03
Final parameters: [[ 0.9344667  -0.20619264  1.3319949   0.9413942   2.094924    1.862051
   0.94611895  1.6657186   1.7327039  -0.09157899]]
Epoch [150/500], Cumulative Loss: 8.0623, LR: 1.000e-03
Final parameters: [[0.98298573 1.0032356  0.5673961  0.84742564 0.9136658  0.9211762
  0.8214606  0.7444358  0.9694031  0.994177  ]]
Epoch [200/500], Cumulative Loss: 5.5993, LR: 1.000e-03
Final parameters: [[1.0103592  1.7809287  0.26980695 0.9928391  0.5483419  0.6192161
  0.8380671  0.3389579  0.7091645  1.4381074 ]]
Epoch [250/500], Cumulative Loss: 7.3957, LR: 1.000e-03
Final parameters: [[1.187883   1.7614822  0.47935507 1.2404308  1.1293045  1.0955939
  1.0683199  0.71478975 1.2030783  1.5425831 ]]
Epoch [300/500], Cumulative Loss: 6.