In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Seed PyTorch's global RNG so the layer initialisation and the random data drawn below are reproducible.
torch.manual_seed(0)

# simple linear model for y = 2x + 1
class SimpleLinear(nn.Module):
    """Single-feature affine model: one ``nn.Linear(1, 1)`` layer, no activation.

    The layer is stored as ``self.lin``.
    """

    def __init__(self):
        super().__init__()
        # One weight and one bias; no hidden layer and no non-linearity.
        self.lin = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.lin(x)
        return prediction

model = SimpleLinear()
# NOTE(review): the inputs below are scaled by 10, so the MSE curvature along the
# weight is about 2 * mean(x^2) ~ 200. Plain gradient descent is only stable for
# lr below roughly 2 / 200 = 1e-2, i.e. this value sits at the limit. It converges
# with seed 0 but may oscillate or diverge with other data -- consider
# standardising x (or lowering lr and training longer) if this is reused.
optimizer = optim.SGD(model.parameters(), lr=1e-2)
loss_fn = nn.MSELoss()

# Synthetic data: y = 2x + 1 plus zero-mean Gaussian noise (std 0.5).
x_train = torch.randn(100, 1) * 10.0
y_train = 2.0 * x_train + 1.0 + torch.randn_like(x_train) * 0.5

# Full-batch gradient descent: every step uses all 100 samples.
for epoch in range(200):
    optimizer.zero_grad()  # clear gradients accumulated by the previous step
    pred = model(x_train)
    loss = loss_fn(pred, y_train)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}: Loss={loss.item():.4f}")

# Quick check on a single input. Inference needs no gradients, so run it under
# no_grad to avoid building an autograd graph.
x_test = torch.tensor([[5.0]])
with torch.no_grad():
    pred = model(x_test).item()
print(f"Input=5, Pred={pred:.3f}, True={2*5+1}")

Epoch 0: Loss=398.6652
Epoch 20: Loss=85.7614
Epoch 40: Loss=18.6274
Epoch 60: Loss=4.2167
Epoch 80: Loss=1.1203
Epoch 100: Loss=0.4534
Epoch 120: Loss=0.3092
Epoch 140: Loss=0.2777
Epoch 160: Loss=0.2707
Epoch 180: Loss=0.2691
Input=5, Pred=11.026, True=11


In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

torch.manual_seed(42)
# Use Apple's Metal (MPS) backend when it is usable, otherwise fall back to the CPU.
# torch.backends.mps.is_available() is the documented availability check and also
# exists in PyTorch releases that do not provide torch.mps.is_available().
device = "mps" if torch.backends.mps.is_available() else "cpu"

# MLP with 2 hidden layers and ReLU activation
class MLP(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        in_dim: number of input features.
        h1: width of the first hidden layer.
        h2: width of the second hidden layer.
        out_dim: number of outputs.
    """

    def __init__(self, in_dim=1, h1=64, h2=32, out_dim=1):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, out_dim)

        # Re-initialise the weights only (biases keep nn.Linear's defaults):
        # Kaiming-uniform for the two layers that feed a ReLU, then a small
        # uniform range for the output layer.
        for hidden_layer in (self.fc1, self.fc2):
            nn.init.kaiming_uniform_(hidden_layer.weight, nonlinearity='relu')
        nn.init.uniform_(self.fc3.weight, a=-0.1, b=0.1)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
    
# Synthetic data with the same ground truth as the linear example (y = 2x + 1) plus noise.
N = 2000
x = torch.randn(N, 1) * 10.0
# Zero-mean Gaussian noise (std 0.5). torch.rand_like would sample U[0, 1) instead,
# which adds a +0.25 mean offset to y, so the best fit would be y = 2x + 1.25.
y = 2.0 * x + 1.0 + torch.randn_like(x) * 0.5

# Dataset + DataLoader for shuffled mini-batch training.
batch_size = 64
dataset = TensorDataset(x, y)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = MLP().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Training: each mini-batch gradient approximates the full-dataset gradient.
epochs = 100
for epoch in range(epochs):
    epoch_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()  # clear gradients from the previous batch
        yhat = model(xb)  # forward pass: compute predictions
        loss = loss_fn(yhat, yb)  # mean loss over this batch
        loss.backward()  # backprop: compute gradients via autograd
        optimizer.step()  # update parameters
        # Weight by batch size so a smaller final batch is counted correctly.
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= N  # per-sample mean loss over the epoch
    if epoch % 10 == 0:
        # ":03d" zero-pads to width 3; the earlier ": 03d" carried a stray space
        # sign flag and printed " 00" instead of "000".
        print(f"Epoch {epoch:03d}, Loss={epoch_loss:.4f}, Device={device}")

# Evaluation on a single example.
x_test = torch.tensor([[5.0]]).to(device)
with torch.no_grad():
    pred = model(x_test).cpu().item()
print(f"Input=5, Pred={pred:.3f}, true={2*5+1}")


Epoch  00, Loss=242.8731, Device=mps
Epoch  10, Loss=0.0248, Device=mps
Epoch  20, Loss=0.0233, Device=mps
Epoch  30, Loss=0.0221, Device=mps
Epoch  40, Loss=0.0238, Device=mps
Epoch  50, Loss=0.0234, Device=mps
Epoch  60, Loss=0.0238, Device=mps
Epoch  70, Loss=0.0227, Device=mps
Epoch  80, Loss=0.0228, Device=mps
Epoch  90, Loss=0.0227, Device=mps
Input=5, Pred=11.260, true=11
