In [14]:
!pip install torch wandb huggingface_hub



In [29]:
##################################################
# 1) Imports & Basic Setup
##################################################
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import math

# Attempt to load secrets from Colab. Outside Colab the import fails and we
# simply rely on whatever is already present in the process environment.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    # Each secret is looked up independently so that a missing/denied
    # HF_TOKEN does not prevent the W&B key from being picked up (and
    # vice versa). Only the exception type is printed, never a secret value.
    for secret_name in ("HF_TOKEN", "wandb"):
        try:
            secret_value = userdata.get(secret_name)
        except Exception as exc:
            print(f"Could not read {secret_name} from colab secrets ({type(exc).__name__}).")
            continue
        if secret_value:
            os.environ[secret_name] = secret_value
            print(f"Set {secret_name} from colab secrets.")

hf_token  = os.environ.get("HF_TOKEN", None)
wandb_key = os.environ.get("wandb", None)
# Only report presence, never the secret itself.
print("hf_token:", bool(hf_token))
print("wandb_key:", bool(wandb_key))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

##################################################
# 2) Larger MLP (~9.6k params)
##################################################
class LargeMLP(nn.Module):
    """
    Three-layer perceptron for MNIST images resized to 16x16 (256 inputs).

    Layout: 256 -> 32 -> 32 -> 10, with ReLU after each hidden layer and
    raw logits at the output.

    Parameter budget:
      fc1: 256*32 weights + 32 biases = 8224
      fc2:  32*32 weights + 32 biases = 1056
      fc3:  32*10 weights + 10 biases =  330
      total                           = 9610  (~9.6k)
    """
    def __init__(self):
        super().__init__()
        n_inputs, n_hidden, n_classes = 256, 32, 10
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        # Incoming batch is [B,1,16,16]; collapse everything but the batch dim.
        batch = x.size(0)
        flat = x.view(batch, -1)               # [B,256]
        hidden_a = self.fc1(flat).relu()       # [B,32]
        hidden_b = self.fc2(hidden_a).relu()   # [B,32]
        return self.fc3(hidden_b)              # [B,10] logits

##################################################
# 2.1) Compute Global Gradient Norm
##################################################
def compute_global_grad_norm(model):
    """
    Return the L2 norm of all gradients of `model` taken as one long vector.

    Each parameter that currently has a `.grad` contributes the square of
    its gradient's L2 norm; the square root of the running total is
    returned as a Python float. Parameters without a gradient are skipped,
    so a model with no gradients at all yields 0.0.
    """
    squared_norms = (
        param.grad.data.norm(2).item() ** 2
        for param in model.parameters()
        if param.grad is not None
    )
    return math.sqrt(sum(squared_norms, 0.0))

##################################################
# 3) Naive Newton Optimizer
##################################################
class NaiveNewtonOptimizer(torch.optim.Optimizer):
    """
    Dense Hessian => O(N^3). Must call step(closure).
    Great for demonstration with small networks only.
    """
    def __init__(self, params, lr=1.0, tol=1e-6):
        defaults = dict(lr=lr, tol=tol)
        super().__init__(params, defaults)

    def step(self, closure=None):
        if closure is None:
            raise RuntimeError("NaiveNewtonOptimizer needs a closure returning the loss (tensor).")

        loss = closure()
        loss.backward(create_graph=True)

        for group in self.param_groups:
            lr = group['lr']
            tol = group['tol']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.view(-1)
                if grad.norm() < tol:
                    continue

                n = grad.numel()
                # Build Hessian
                H = []
                for i in range(n):
                    g_i = grad[i]
                    p.grad = None
                    g_i.backward(retain_graph=True)
                    H_i = p.grad.view(-1).clone()
                    H.append(H_i)
                    p.grad = None
                H = torch.stack(H, dim=1)  # => [n, n]

                # Solve H dx = grad
                try:
                    dx, _ = torch.solve(grad.unsqueeze(1), H)
                    dx = dx.squeeze(1)
                except RuntimeError:
                    dx = grad  # fallback

                # Safely do in-place update using .data
                p.data.sub_((lr * dx).view(p.shape))

        return loss

##################################################
# 3.1) Naive Gradient Descent Optimizer
##################################################
class NaiveGradientDescent(torch.optim.Optimizer):
    """
    Plain first-order gradient descent with a fixed learning rate:

        p <- p - lr * grad(p)

    step() must be given a closure that
      - clears the gradients,
      - runs the forward pass,
      - returns the loss tensor (without calling backward).
    The optimizer itself calls .backward() on that loss.
    """
    def __init__(self, params, lr=0.01):
        super().__init__(params, dict(lr=lr))

    def step(self, closure=None):
        """Run the closure, backpropagate, apply one descent update and return the loss tensor."""
        if closure is None:
            raise RuntimeError("NaiveGradientDescent needs a closure returning the loss (tensor).")

        # Forward pass followed by an ordinary first-derivative backward pass.
        loss = closure()
        loss.backward()

        # Move every parameter that received a gradient against that gradient.
        for group in self.param_groups:
            step_length = group['lr']
            with_grad = (param for param in group['params'] if param.grad is not None)
            for param in with_grad:
                param.data.sub_(step_length * param.grad)

        return loss


##################################################
# 4) DataLoaders with downsampling to 16x16
##################################################
def get_mnist_loaders(batch_size=64):
    """
    Build the MNIST train and test DataLoaders.

    Images are resized to 16x16, converted to tensors and normalised with
    mean 0.5 / std 0.5. The dataset is downloaded into the current directory
    when it is not already there. Only the training loader shuffles.

    Returns:
        (train_loader, test_loader)
    """
    preprocessing = transforms.Compose([
        transforms.Resize((16,16)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Keyed by the `train` flag: True => training split, False => test split.
    splits = {
        is_train: datasets.MNIST(root=".", train=is_train, download=True, transform=preprocessing)
        for is_train in (True, False)
    }
    train_loader, test_loader = (
        torch.utils.data.DataLoader(splits[is_train], batch_size=batch_size, shuffle=is_train)
        for is_train in (True, False)
    )
    return train_loader, test_loader

##################################################
# 5) Train function
##################################################
def _evaluate_model(model, data_loader, criterion):
    """Return (mean loss per sample, accuracy in percent) of `model` over `data_loader`."""
    model.eval()
    correct = 0
    loss_sum = 0.0
    with torch.no_grad():
        for data, targets in data_loader:
            data, targets = data.to(device), targets.to(device)
            out = model(data)
            # criterion averages over the batch; re-weight by batch size so
            # a smaller final batch does not skew the dataset mean.
            loss_sum += criterion(out, targets).item() * data.size(0)
            _, pred = out.max(1)
            correct += pred.eq(targets).sum().item()
    n_samples = len(data_loader.dataset)
    return loss_sum / n_samples, 100. * correct / n_samples


def train_model(
    optimizer_name: str = "",
    learning_rate: float = 1e-2,
    epochs: int = 2,
    batch_size: int = 64,
    wandb_project: str = "",
    print_every: int = 1,
):
    """
    Train the ~9.6k param MLP (16x16 input => 2 hidden layers of 32) on MNIST.

    Args:
        optimizer_name: "gd", "newton" or "sgd" (case-insensitive).
        learning_rate: learning rate handed to the optimizer.
        epochs: number of passes over the training set.
        batch_size: mini-batch size for both loaders.
        wandb_project: W&B project name; logging happens only when this is
            non-empty AND the "wandb" environment variable holds a key.
        print_every: print the per-step line every this many steps
            (1 = every step, 0 = never). Epoch summaries are always printed.

    Per step the following are printed / logged:
      1) training loss (evaluated before the update)
      2) global grad norm (of the gradients left in .grad by the step)
      3) step size (L2 norm of the parameter change)
      4) relative improvement of the loss w.r.t. the very first step

    Returns:
        The trained model.

    Raises:
        ValueError: if `optimizer_name` is not one of the supported names.
    """
    model = LargeMLP().to(device)

    # Pick the optimizer first so that a bad name fails before the dataset
    # download and before a W&B run is opened.
    name = optimizer_name.lower()
    if name == "gd":
        opt = NaiveGradientDescent(model.parameters(), lr=learning_rate)
    elif name == "newton":
        opt = NaiveNewtonOptimizer(model.parameters(), lr=learning_rate)
    elif name == "sgd":
        opt = torch.optim.SGD(model.parameters(), lr=learning_rate)
    else:
        raise ValueError(f"Unknown optimizer {optimizer_name}")
    # The two custom optimizers call backward() themselves inside step();
    # torch.optim.SGD expects the closure to have done it already.
    closure_runs_backward = (name == "sgd")

    train_loader, test_loader = get_mnist_loaders(batch_size)
    criterion = nn.CrossEntropyLoss()

    # W&B
    wandb_key = os.environ.get("wandb", None)
    do_wandb = (wandb_project != "") and (wandb_key is not None)
    if do_wandb:
        import wandb
        wandb.login(key=wandb_key)
        wandb.init(project=wandb_project, config={
            "optimizer_name": optimizer_name,
            "learning_rate": learning_rate,
            "epochs": epochs,
            "batch_size": batch_size
        })

    global_step = 0
    initial_loss_val = None  # track initial for relative improvement

    try:
        for epoch in range(epochs):
            model.train()
            total_loss = 0.0

            for batch_idx, (data, targets) in enumerate(train_loader):
                data, targets = data.to(device), targets.to(device)

                # 1) Save old parameters for step size calculation
                old_params = [p.detach().clone() for p in model.parameters()]

                def closure():
                    opt.zero_grad()
                    outputs = model(data)
                    loss_t = criterion(outputs, targets)
                    if closure_runs_backward:
                        loss_t.backward()
                    return loss_t

                # 2) Step
                loss_tensor = opt.step(closure=closure)
                loss_val = loss_tensor.item()
                total_loss += loss_val

                # 3) If first time, store initial loss
                if initial_loss_val is None:
                    initial_loss_val = loss_val

                # 4) Compute global grad norm
                grad_norm = compute_global_grad_norm(model)

                # 5) Compute step size
                sum_sq = 0.0
                for p, old_p in zip(model.parameters(), old_params):
                    delta = (p.detach() - old_p).view(-1)
                    sum_sq += delta.norm(2).item()**2
                step_size = math.sqrt(sum_sq)

                # 6) Compute relative improvement from the initial iteration
                # (if initial_loss_val was 2.3 and now it's 1.2, improvement = ~47.8%)
                rel_improvement = 0.0
                if abs(initial_loss_val) > 1e-12:  # avoid /0
                    rel_improvement = (initial_loss_val - loss_val) / abs(initial_loss_val)

                global_step += 1
                if print_every and global_step % print_every == 0:
                    # 1-based epoch, consistent with the epoch summary below.
                    print(
                        f"Epoch[{epoch+1}/{epochs}] Step[{global_step}] "
                        f"Loss={loss_val:.4f} GradNorm={grad_norm:.4f} "
                        f"StepSize={step_size:.4f} RelImp={rel_improvement*100:.2f}%"
                    )

                if do_wandb:
                    wandb.log({
                        "train_loss": loss_val,
                        "grad_norm": grad_norm,
                        "step_size": step_size,
                        "relative_improvement": rel_improvement
                    }, step=global_step)

            # Evaluate
            test_loss, accuracy = _evaluate_model(model, test_loader, criterion)
            epoch_loss = total_loss / len(train_loader)

            print(f"Epoch {epoch+1}/{epochs} => TrainLoss={epoch_loss:.4f} TestLoss={test_loss:.4f} Acc={accuracy:.2f}%")

            if do_wandb:
                wandb.log({
                    "epoch": epoch,
                    "train_loss_epoch": epoch_loss,
                    "test_loss": test_loss,
                    "test_accuracy": accuracy
                }, step=global_step)
    finally:
        # Close the run even when training is interrupted or raises, so no
        # dangling W&B run is left behind in the kernel.
        if do_wandb:
            wandb.finish()

    return model

Set HF_TOKEN from colab secrets.
Set wandb from colab secrets.
hf_token: True
wandb_key: True
Using device: cuda


In [30]:
##################################################
# 6) Main
##################################################
def main(
    optimizer_name="gd",
    lr=1.0,
    epochs=2,
    batch_size=16,
    wandb_project="AAH-IA__newton-rhapson__",
):
    """
    Notebook entry point: report which credentials are present in the
    environment, announce the run configuration, train via train_model()
    and hand back the trained model.

    Only the presence of the HF token / W&B key is printed, never the value.
    """
    have_hf_token = bool(os.environ.get("HF_TOKEN", None))
    have_wandb_key = bool(os.environ.get("wandb", None))

    print("HF token:", have_hf_token)
    print("W&B key:", have_wandb_key)
    print(f"Using {optimizer_name} with ~9.6k param model. LR={lr}, epochs={epochs}, batch_size={batch_size}")

    run_settings = {
        "optimizer_name": optimizer_name,
        "learning_rate": lr,
        "epochs": epochs,
        "batch_size": batch_size,
        "wandb_project": wandb_project,
    }
    trained = train_model(**run_settings)

    print("Done with main(). Returning model.")
    return trained


# Example run, executed when this cell/script is the entry point:
# plain gradient descent, two epochs, mini-batches of 16.
if __name__ == "__main__":
    model = main(
        wandb_project="AAH-IA__gradient-descent__",
        batch_size=16,
        epochs=2,
        lr=0.01,  # 0.01 for gradient descent
        optimizer_name="gd",
    )



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch[0/2] Step[2503] Loss=0.2428 GradNorm=2.0456 StepSize=0.0205 RelImp=89.43%
Epoch[0/2] Step[2504] Loss=0.4461 GradNorm=4.7528 StepSize=0.0475 RelImp=80.57%
Epoch[0/2] Step[2505] Loss=1.1773 GradNorm=7.4189 StepSize=0.0742 RelImp=48.74%
Epoch[0/2] Step[2506] Loss=0.2105 GradNorm=2.7422 StepSize=0.0274 RelImp=90.83%
Epoch[0/2] Step[2507] Loss=0.3602 GradNorm=2.0286 StepSize=0.0203 RelImp=84.32%
Epoch[0/2] Step[2508] Loss=0.3029 GradNorm=3.5179 StepSize=0.0352 RelImp=86.81%
Epoch[0/2] Step[2509] Loss=0.1977 GradNorm=2.4443 StepSize=0.0244 RelImp=91.39%
Epoch[0/2] Step[2510] Loss=0.3946 GradNorm=3.8210 StepSize=0.0382 RelImp=82.82%
Epoch[0/2] Step[2511] Loss=0.3696 GradNorm=3.1179 StepSize=0.0312 RelImp=83.91%
Epoch[0/2] Step[2512] Loss=0.1339 GradNorm=1.8289 StepSize=0.0183 RelImp=94.17%
Epoch[0/2] Step[2513] Loss=0.2462 GradNorm=2.4657 StepSize=0.0247 RelImp=89.28%
Epoch[0/2] Step[2514] Loss=0.1511 GradNorm=1.8542 StepS

0,1
epoch,▁█
grad_norm,▁▁▂▂▂▃▃▃▄▅█▄▆▅▄▆▃▇▆▄▃▄▄▆▂▅▃▅▆▅█▄█▃▂▂█▅▅▄
relative_improvement,▁▁▁▂▇█▆▅▇▇▇█▇▆█▆▆▇▇▇█▇▆▇█▆▇▆█▇█████▆█▆▇█
step_size,▂▄▆▆▄▆▄▄▅▅▃▅▅▆▄▂▅▄▇▁▃▆▄▅▃▄▄▃▇█▆▄▆▄▄▃▃█▄▃
test_accuracy,▁█
test_loss,█▁
train_loss,█▇▇▆▅▃▃▃▃▃▂▂▁▄▂▂▂▂▄▃▂▂▃▂▂▂▃▂▂▂▁▃▂▃▂▂▁▁▁▂
train_loss_epoch,█▁

0,1
epoch,1.0
grad_norm,1.9444
relative_improvement,0.92678
step_size,0.01944
test_accuracy,91.64
test_loss,0.29518
train_loss,0.16815
train_loss_epoch,0.33864


Done with main(). Returning model.


In [24]:
import wandb

# Close the currently active W&B run, if there is one.
# NOTE(review): this cell's execution count (24) is lower than that of the
# training cells above, so it looks like leftover manual cleanup from an
# earlier session — confirm whether it is still needed; train_model() already
# calls wandb.finish() at the end of a successful run.
wandb.finish()

0,1
train_loss,█▇██▇▆▇▅▅▅▄▄▄▄▃▂▂▂▂▂▂▂▂▂▁▁▂▃▂▂▂▂▂▁▁▂▂▃▂▂

0,1
train_loss,4.85807
