In [104]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [105]:
class LinearRegressionScratch:
    """Linear regression implemented from scratch using PyTorch tensors.

    Predictions are computed as ``X @ w + b``. The weight ``w`` has shape
    ``(num_inputs, 1)`` and the bias ``b`` has shape ``(1,)``; both are created
    with ``requires_grad=True`` so that ``loss.backward()`` fills in
    ``w.grad`` and ``b.grad``.

    Usage:
        model = LinearRegressionScratch(num_inputs)
        y_hat = model.forward(X)
        loss = model.loss(y_hat, y)
    """
    def __init__(self, num_inputs, device=None, sigma=0.01):
        """Create the parameters.

        num_inputs: number of input features (converted with ``int``).
        device: anything ``torch.device`` accepts; when falsy, CUDA is used
            if available, otherwise the CPU.
        sigma: standard deviation of the zero-mean normal used to draw ``w``.
        """
        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
        self.num_inputs = int(num_inputs)
        # weights: shape (num_inputs, 1), drawn from N(0, sigma^2)
        self.w = torch.normal(0.0, sigma, (self.num_inputs, 1), requires_grad=True, device=self.device)
        # bias: shape (1,), initialised to zero
        self.b = torch.zeros(1, requires_grad=True, device=self.device)

    def forward(self, X):
        """Compute predictions.

        X: tensor of shape (batch_size, num_inputs). It is used as-is, i.e. it
            is not moved to ``self.device`` here.
        returns: tensor of shape (batch_size, 1)
        raises: ValueError if X is not 2-D or its second dimension is not
            ``num_inputs``.
        """
        if X.ndim != 2 or X.shape[1] != self.num_inputs:
            raise ValueError(f"X must be shaped (batch_size, {self.num_inputs}), got {X.shape}")
        return X.matmul(self.w) + self.b

    def loss(self, y_hat, y):
        """Halved squared error, averaged over all elements.

        y_hat: predictions, normally (batch_size, 1)
        y: targets, (batch_size, 1) or (batch_size,)
        returns: scalar tensor ``mean((y_hat - y) ** 2 / 2)``

        When ``y`` and ``y_hat`` hold the same number of elements but differ in
        shape, ``y`` is reshaped to ``y_hat``'s shape before subtracting.
        Without this, a (batch,) tensor paired with a (batch, 1) tensor would
        broadcast to (batch, batch) and yield a wrong loss with no error.
        """
        if y.shape != y_hat.shape:
            if y.numel() == y_hat.numel():
                # Same element count: align element-wise instead of broadcasting.
                y = y.reshape(y_hat.shape)
            elif y.ndim == 1:
                # Different element count: fall back to treating y as a column
                # and let normal broadcasting rules apply.
                y = y.reshape(-1, 1)
        squared_error = (y_hat - y) ** 2 / 2
        return squared_error.mean()

In [106]:
# This cell is no longer needed as the forward method is now inside the class definition.

# Defining the Loss Function

In [107]:
# This cell is no longer needed as the loss method is now inside the class definition.

# Defining the Optimization Algorithm

In [108]:
class SGD:
    """Simple minibatch stochastic gradient descent optimizer.

    Each call to ``step`` applies ``p <- p - lr * p.grad`` to every parameter
    that currently has a gradient.

    Example:
    opt = SGD([model.w, model.b], lr=0.1)
    loss.backward()
    opt.step()
    opt.zero_grad()
    """

    def __init__(self, params, lr):
        """Store the parameters and the learning rate.

        params: an iterable of tensors (list, tuple, generator, ...) or a
            single tensor. A bare tensor is wrapped in a list; iterating it
            directly would split it into row views that never receive a
            ``.grad``, so ``step`` would silently do nothing.
        lr: learning rate, converted with ``float``.
        """
        if isinstance(params, torch.Tensor):
            params = [params]
        self.params = list(params)
        self.lr = float(lr)

    def step(self):
        """Update parameters in-place without tracking the update in autograd.

        Parameters whose ``.grad`` is ``None`` are skipped.
        """
        with torch.no_grad():
            for p in self.params:
                if p.grad is None:
                    continue
                # no_grad() already keeps this update out of the graph, so the
                # tensor is modified directly rather than through `.data`.
                p.sub_(self.lr * p.grad)

    def zero_grad(self):
        """Zero out gradients for all parameters (if present)."""
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()

In [109]:
def make_synthetic_data(w_true, b_true, num_samples=1000000, noise_std=0.1, device=None):
    """Generate a noisy linear dataset ``y = X @ w_true + b_true + noise``.

    w_true: true weights; a tensor or anything ``torch.as_tensor`` accepts
        (e.g. a list of floats). It is flattened into a column, so its number
        of elements is the number of input features.
    b_true: true bias; a tensor or a Python number.
    num_samples: number of rows to generate.
    noise_std: standard deviation of the Gaussian noise added to ``y``.
    device: anything ``torch.device`` accepts; when falsy, CUDA is used if
        available, otherwise the CPU.
    returns: ``(X, y)`` with shapes ``(num_samples, num_inputs)`` and
        ``(num_samples, 1)``, both on ``device``.
    """
    device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
    dtype = torch.get_default_dtype()  # the dtype torch.randn produces below
    # Bring the ground-truth parameters onto the same device/dtype as X.
    # Callers commonly build them on the CPU, which would otherwise make the
    # matmul fail with a device mismatch whenever `device` is a GPU.
    w_col = torch.as_tensor(w_true, dtype=dtype, device=device).reshape(-1, 1)
    bias = torch.as_tensor(b_true, dtype=dtype, device=device)
    num_inputs = w_col.shape[0]
    # Draw X first and the noise second so the random stream is consumed in a
    # fixed order for a given seed.
    X = torch.randn(num_samples, num_inputs, device=device)
    noise = noise_std * torch.randn(num_samples, 1, device=device)
    y = X.matmul(w_col) + bias + noise
    return X, y

In [112]:
if __name__ == "__main__":
    # Fix the random seed so the synthetic data, the weight initialisation and
    # the minibatch shuffling are the same on every run.
    torch.manual_seed(0)
    # hyperparameters
    num_inputs = 3
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Ground-truth parameters live on the same device as the generated data.
    true_w = torch.tensor([2.0, -1.0, 0.5], device=device)
    true_b = torch.tensor([0.7], device=device)
    X, y = make_synthetic_data(true_w, true_b, num_samples=1024, noise_std=0.1, device=device)
    # create dataset and dataloader for minibatches
    ds = TensorDataset(X, y)
    batch_size = 64
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True)
    # initialize model
    model = LinearRegressionScratch(num_inputs=num_inputs, device=device, sigma=0.01)
    # choose optimizer: custom SGD or PyTorch's. Both expose step()/zero_grad(),
    # so the training loop below does not need to know which one is in use.
    lr = 0.9
    use_custom_optimizer = True
    if use_custom_optimizer:
        opt = SGD([model.w, model.b], lr=lr)
    else:
        opt = torch.optim.SGD([model.w, model.b], lr=lr)
    # training loop
    n_epochs = 100
    for epoch in range(n_epochs):
        epoch_loss = 0.0
        for X_batch, y_batch in dl:
            # forward
            y_hat = model.forward(X_batch)
            loss = model.loss(y_hat, y_batch)
            # backward
            loss.backward()
            # step, then clear gradients so they do not accumulate across batches
            opt.step()
            opt.zero_grad()
            # weight by batch size: the last batch may be smaller than the rest
            epoch_loss += loss.item() * X_batch.size(0)
        epoch_loss /= len(ds)
        if (epoch + 1) % 50 == 0 or epoch == 0:
            print(f"Epoch {epoch+1:3d}/{n_epochs} - loss: {epoch_loss:.6f}")
    print("\nTraining finished.")
    print("True w:", true_w.tolist(), "True b:", true_b.item())
    print("Learned w:", model.w.detach().squeeze().tolist(), "Learned b:", model.b.item())
    # final prediction check
    with torch.no_grad():
        sample_X = torch.tensor([[1.0, 2.0, -1.0]], device=device)
        pred = model.forward(sample_X)
        print("\nSample input:", sample_X.cpu().numpy())
        print("Predicted value:", pred.cpu().numpy())

Epoch   1/100 - loss: 0.164029
Epoch  50/100 - loss: 0.005524
Epoch 100/100 - loss: 0.005420

Training finished.
True w: [2.0, -1.0, 0.5] True b: 0.699999988079071
Learned w: [1.9995328187942505, -0.9930555820465088, 0.49830710887908936] Learned b: 0.693470299243927

Sample input: [[ 1.  2. -1.]]
Predicted value: [[0.20858485]]
