## Environment setup & installs

In [1]:
# pip install numpy==1.26.4

## Imports and helper installer

In [None]:
import sys, subprocess, importlib, math, time
from types import SimpleNamespace

def try_import(name, pypi_name=None):
    """Import module ``name``, pip-installing it first if the import fails.

    Args:
        name: Module name handed to ``importlib.import_module``.
        pypi_name: Distribution name handed to ``pip install``; defaults to
            ``name`` when omitted.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        Exception: Whatever the second import attempt raises (typically
            ``ImportError``) if the module is still unusable after install.
    """
    try:
        return importlib.import_module(name)
    except Exception:
        # Deliberately broad: a half-broken install can fail with more than
        # ImportError, and the recovery (reinstall + retry) is the same.
        if pypi_name is None:
            pypi_name = name
        print(f"Trying to pip install {pypi_name} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", pypi_name])
        # The path finders cache directory listings; without this the retry
        # can miss a package that was installed after the interpreter started.
        importlib.invalidate_caches()
        return importlib.import_module(name)


import torch
import torch.nn as nn
import torch.nn.functional as F

# Optional dependencies: both names stay None when the package can be neither
# imported nor pip-installed, so later cells can test `is not None`.
torchvision = None
imageio = None
try:
    torchvision = importlib.import_module("torchvision")
except Exception:
    # Plain import failed: try_import retries the import and then attempts a
    # pip install before importing once more.
    try:
        torchvision = try_import("torchvision")
    except Exception:
        print("torchvision not available; will use torch-only fallback for saving.")

# Same import-then-install strategy for imageio.
try:
    imageio = importlib.import_module("imageio")
except Exception:
    try:
        imageio = try_import("imageio")
    except Exception:
        print("imageio not available; fallback to saving tensors as .pt")

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
# Seed torch's default generator so random draws in later cells are repeatable.
torch.manual_seed(0)

Device: cuda


<torch._C.Generator at 0x7086849bd530>

## Positional encoding (like original NeRF)

In [None]:
class PositionalEncoding(nn.Module):
    """Sine/cosine frequency encoding of the last axis of the input.

    For frequencies ``f_0 .. f_{K-1}`` the output is, along the last axis,
    ``[sin(f_0 x), cos(f_0 x), sin(f_1 x), cos(f_1 x), ...]`` where every
    chunk is ``in_dims`` wide, i.e. ``in_dims * num_freqs * 2`` values in
    total (the raw input itself is not appended).

    Args:
        in_dims: Size of the last axis of the tensors passed to ``forward``.
        num_freqs: Number of frequency bands ``K``.
        log_scale: If True use ``2**0 .. 2**(K-1)``; otherwise ``K`` values
            spaced linearly between ``1`` and ``2**(K-1)``.
    """

    def __init__(self, in_dims, num_freqs=10, log_scale=True):
        super().__init__()
        self.in_dims = in_dims
        self.num_freqs = num_freqs
        self.log_scale = log_scale
        if log_scale:
            freq_bands = 2.0 ** torch.arange(num_freqs).float()
        else:
            freq_bands = torch.linspace(1.0, 2.0 ** (num_freqs - 1), num_freqs)
        # Buffer (not a plain attribute) so it follows module.to()/.cuda();
        # non-persistent so state_dict contents stay exactly as before.
        self.register_buffer("freq_bands", freq_bands, persistent=False)
        self.output_dims = in_dims * num_freqs * 2

    def forward(self, x):
        """Encode ``x`` of shape ``(..., in_dims)`` to ``(..., output_dims)``."""
        # Follow the input's device; follow its dtype only when it is floating
        # point so integer inputs do not truncate fractional frequencies.
        dtype = x.dtype if x.is_floating_point() else self.freq_bands.dtype
        freqs = self.freq_bands.to(device=x.device, dtype=dtype)
        scaled = x.unsqueeze(-2) * freqs.unsqueeze(-1)            # (..., K, D)
        enc = torch.stack([torch.sin(scaled), torch.cos(scaled)], dim=-2)  # (..., K, 2, D)
        # Flatten (K, 2, D) in that order: frequency-major, then sin/cos, then
        # coordinate, which matches concatenating per-frequency sin/cos chunks.
        return enc.flatten(start_dim=-3)

# Smoke test: encode two random 3-vectors with 6 frequency bands and print the
# output shape. `pe` is left at module level and is passed to a later cell.
pe = PositionalEncoding(3, num_freqs=6)
t = torch.randn(2,3)
print("PE out shape:", pe(t).shape)

PE out shape: torch.Size([2, 36])


## NeRF MLP (small, fast)

In [None]:
class NeRFSmall(nn.Module):
    """Compact NeRF MLP mapping encoded position/direction to (rgb, sigma).

    A ReLU trunk of ``n_layers`` linear layers processes the encoded
    position; after every layer whose index is in ``skips`` the encoded
    position is concatenated back onto the activations. Density is predicted
    from the trunk output alone, colour from trunk features concatenated with
    the encoded view direction.

    Args:
        pos_dim: Width of the encoded position vector.
        dir_dim: Width of the encoded direction vector.
        d_hidden: Width of every trunk layer.
        n_layers: Number of trunk layers.
        skips: Indices of trunk layers after which the encoded position is
            re-injected. Any iterable of ints; stored as a tuple.
    """

    def __init__(self, pos_dim, dir_dim, d_hidden=128, n_layers=6, skips=(3,)):
        super().__init__()
        # Immutable copy: never share (or let callers mutate) the default.
        self.skips = tuple(skips)
        self.layers = nn.ModuleList()
        in_dim = pos_dim
        for i in range(n_layers):
            self.layers.append(nn.Linear(in_dim, d_hidden))
            in_dim = d_hidden
            if i in self.skips:
                in_dim += pos_dim
        # `in_dim` is now the true trunk output width. It equals d_hidden
        # unless the last layer is a skip layer, in which case the heads must
        # accept the concatenated width or forward() fails on a shape mismatch.
        trunk_out = in_dim
        # sigma head
        self.sigma_head = nn.Sequential(nn.Linear(trunk_out, d_hidden//2), nn.ReLU(), nn.Linear(d_hidden//2, 1))
        # feature to RGB
        self.feature_layer = nn.Linear(trunk_out, d_hidden)
        self.dir_fc = nn.Sequential(nn.Linear(dir_dim + d_hidden, d_hidden//2), nn.ReLU(), nn.Linear(d_hidden//2, 3))

    def forward(self, x_pos_enc, x_dir_enc):
        """Return ``(rgb, sigma)`` for a batch of encoded samples.

        Args:
            x_pos_enc: ``(N, pos_dim)`` encoded positions.
            x_dir_enc: ``(N, dir_dim)`` encoded view directions.

        Returns:
            rgb: ``(N, 3)`` colours squashed to (0, 1) by a sigmoid.
            sigma: ``(N,)`` non-negative densities (ReLU output).
        """
        h = x_pos_enc
        for i, layer in enumerate(self.layers):
            h = F.relu(layer(h))
            if i in self.skips:
                h = torch.cat([h, x_pos_enc], dim=-1)
        sigma = F.relu(self.sigma_head(h)).squeeze(-1)  # (N,)
        feat = F.relu(self.feature_layer(h))
        # concat feat and dir encoding
        h_dir = torch.cat([feat, x_dir_enc], dim=-1)
        rgb = torch.sigmoid(self.dir_fc(h_dir))  # (N,3)
        return rgb, sigma

# Encoders for sample positions and view directions; their output widths
# size the network's two inputs.
pe_pos = PositionalEncoding(3, num_freqs=10)   # yields 3*10*2 = 60
pe_dir = PositionalEncoding(3, num_freqs=4)    # yields 3*4*2 = 24
# Build the MLP with matching input widths and move it to the selected device.
model = NeRFSmall(pos_dim=pe_pos.output_dims, dir_dim=pe_dir.output_dims).to(device)
print("Model param count:", sum(p.numel() for p in model.parameters()))

Model param count: 132868


## Rays, sample points, and volume rendering utilities

In [18]:
def get_rays_from_cam(origin, lookat, up, fov_deg, H, W):
    """Generate one unit-length ray per pixel for a look-at pinhole camera.

    Args:
        origin: ``(3,)`` camera position.
        lookat: ``(3,)`` point the camera looks at.
        up: ``(3,)`` approximate up vector; must not be parallel to the
            viewing direction (the basis would be degenerate and yield NaNs).
        fov_deg: Field of view in degrees along the image height; the
            horizontal extent is scaled by the aspect ratio ``W / H``.
        H: Image height in pixels.
        W: Image width in pixels.

    Returns:
        rays_o: ``(H*W, 3)`` ray origins (the camera position repeated).
        rays_d: ``(H*W, 3)`` normalised ray directions in row-major pixel
            order.

    Note:
        The vertical offset grows with the row index along the camera's up
        axis, so row 0 looks towards the bottom of the view.
    """
    # Orthonormal camera basis. `dim` is passed explicitly because calling
    # torch.cross without it is deprecated and emits a warning.
    forward = lookat - origin
    forward = forward / forward.norm()
    right = torch.cross(forward, up, dim=-1)
    right = right / right.norm()
    cam_up = torch.cross(right, forward, dim=-1)
    cam_up = cam_up / cam_up.norm()
    # camera space sampling
    i = torch.linspace(0, W-1, W, device=origin.device)
    j = torch.linspace(0, H-1, H, device=origin.device)
    jj, ii = torch.meshgrid(j, i, indexing='ij')
    # Pixel centres mapped onto the image plane at unit distance.
    aspect = W/H
    fov = math.radians(fov_deg)
    px = ((ii + 0.5)/W - 0.5) * 2 * math.tan(fov/2) * aspect
    py = ((jj + 0.5)/H - 0.5) * 2 * math.tan(fov/2)
    dirs = (px.unsqueeze(-1)*right + py.unsqueeze(-1)*cam_up + forward.unsqueeze(0).unsqueeze(0))
    dirs = dirs / torch.norm(dirs, dim=-1, keepdim=True)
    rays_o = origin.expand(H, W, 3).reshape(-1, 3)
    rays_d = dirs.reshape(-1, 3)
    return rays_o, rays_d

def sample_along_rays(rays_o, rays_d, near, far, n_samples, perturb=True):
    """Place ``n_samples`` depths on every ray and return the 3D sample points.

    Depths are spaced linearly between ``near`` and ``far``. With
    ``perturb=True`` each depth is redrawn uniformly inside its own bin, where
    bin edges are the midpoints between neighbouring depths (the first and
    last bins are closed by the first and last depth).

    Args:
        rays_o: ``(N, 3)`` ray origins.
        rays_d: ``(N, 3)`` ray directions.
        near: Depth of the first sample.
        far: Depth of the last sample.
        n_samples: Number of samples per ray.
        perturb: Whether to jitter the depths randomly.

    Returns:
        pts: ``(N, n_samples, 3)`` points ``rays_o + rays_d * depth``.
        z_vals: ``(N, n_samples)`` sample depths.
    """
    num_rays = rays_o.shape[0]
    dev = rays_o.device

    # Fractions in [0, 1], replicated for every ray, then mapped to depth.
    frac = torch.linspace(0.0, 1.0, steps=n_samples, device=dev)
    frac = frac.unsqueeze(0).expand(num_rays, -1)
    z_vals = near * (1 - frac) + far * frac

    if perturb:
        centers = 0.5 * (z_vals[:, :-1] + z_vals[:, 1:])
        bin_hi = torch.cat([centers, z_vals[:, -1:]], -1)
        bin_lo = torch.cat([z_vals[:, :1], centers], -1)
        jitter = torch.rand(z_vals.shape, device=dev)
        z_vals = bin_lo + (bin_hi - bin_lo) * jitter

    pts = rays_o[:, None, :] + rays_d[:, None, :] * z_vals[..., None]
    return pts, z_vals

def volume_render_rgb(model, pts, dirs, z_vals, pe_pos, pe_dir):
    """Query the radiance field at the samples and composite along each ray.

    Args:
        model: Callable ``(pos_enc, dir_enc) -> (rgb, sigma)`` evaluated on
            the flattened ``N*S`` samples.
        pts: ``(N, S, 3)`` sample positions.
        dirs: ``(N, 3)`` ray directions, shared by all samples of a ray.
        z_vals: ``(N, S)`` sample depths.
        pe_pos: Encoder applied to the flattened positions.
        pe_dir: Encoder applied to the per-sample directions.

    Returns:
        rgb_map: ``(N, 3)`` composited colour.
        depth_map: ``(N,)`` weight-averaged depth.
        acc_map: ``(N,)`` sum of weights per ray.
        weights: ``(N, S)`` per-sample compositing weights.
    """
    num_rays, num_samples, _ = pts.shape

    # Every sample on a ray sees the same view direction.
    per_sample_dirs = dirs.unsqueeze(1).expand(-1, num_samples, -1).reshape(-1, 3)
    colour, density = model(pe_pos(pts.reshape(-1, 3)), pe_dir(per_sample_dirs))
    colour = colour.reshape(num_rays, num_samples, 3)
    density = density.reshape(num_rays, num_samples)

    # Segment lengths; the last segment is treated as effectively infinite.
    seg = z_vals[..., 1:] - z_vals[..., :-1]
    seg = torch.cat([seg, 1e10 * torch.ones_like(seg[..., :1])], -1)  # (N,S)

    alpha = 1.0 - torch.exp(-density * seg)
    # Exclusive cumulative product of (1 - alpha): light surviving up to, but
    # not including, each sample. The epsilon keeps the product off exact zero.
    survive = torch.cat([torch.ones((num_rays, 1), device=alpha.device), 1.0 - alpha + 1e-10], -1)
    weights = alpha * torch.cumprod(survive, -1)[:, :-1]  # (N,S)

    rgb_map = (weights.unsqueeze(-1) * colour).sum(dim=1)  # (N,3)
    depth_map = (weights * z_vals).sum(dim=1)
    acc_map = weights.sum(dim=1)
    return rgb_map, depth_map, acc_map, weights

In [None]:
def gt_density_color(pts):
    """Analytic ground-truth field: two Gaussian blobs with tinted colour.

    Args:
        pts: ``(N, 3)`` query positions.

    Returns:
        rgb: ``(N, 3)`` colour; a tanh-of-position base colour scaled by
            ``0.4 + 0.8 * blob_tint``. Values are not clamped.
        sigma: ``(N,)`` density, ``10 * (blob_a + 0.7 * blob_b)``.
    """
    dev = pts.device

    def _gaussian(center, width):
        # Isotropic, unnormalised Gaussian falloff around `center`.
        mu = torch.tensor(center, device=dev)
        return torch.exp(-((pts - mu)**2).sum(-1)/(2*width**2))

    blob_a = _gaussian([0.0, 0.0, 0.6], 0.25)
    blob_b = _gaussian([0.2, -0.2, 0.3], 0.18)
    sigma = 10.0 * (blob_a + 0.7*blob_b)

    # Base colour: one tanh ramp per axis, (offset, amplitude) per channel.
    ramps = ((0.5, 0.5), (0.6, 0.4), (0.4, 0.6))
    base = torch.stack(
        [offset + amplitude*torch.tanh(pts[:, axis]*3.0)
         for axis, (offset, amplitude) in enumerate(ramps)],
        -1,
    )

    # Per-blob tint, weighted by how strongly each blob is felt at the point.
    tint_a = torch.tensor([1.0, 0.7, 0.5], device=dev)
    tint_b = torch.tensor([0.4, 0.8, 1.0], device=dev)
    tint = blob_a.unsqueeze(-1) * tint_a + blob_b.unsqueeze(-1) * tint_b

    return base * (0.4 + 0.8*tint), sigma

def render_gt_images(origins, lookats, ups, fov, H, W, near, far, n_samples, pe_pos, pe_dir):
    """Render the analytic ground-truth field from each camera.

    Uses evenly spaced (unperturbed) depths and the same alpha-compositing
    formula as the learned renderer. Output colours are not clamped.

    Args:
        origins, lookats, ups: Per-camera ``(3,)`` tensors, iterated together.
        fov, H, W: Camera field of view (degrees) and image size.
        near, far, n_samples: Depth range and samples per ray.
        pe_pos, pe_dir: Accepted for signature parity; not used here.

    Returns:
        imgs: List of ``(H*W, 3)`` colour tensors, one per camera.
        depths: List of ``(H*W,)`` weight-averaged depth tensors.
    """
    rendered = []
    for cam_origin, cam_target, cam_up in zip(origins, lookats, ups):
        rays_o, rays_d = get_rays_from_cam(cam_origin, cam_target, cam_up, fov, H, W)
        pts, z_vals = sample_along_rays(rays_o, rays_d, near, far, n_samples, perturb=False)
        num_rays, num_samples, _ = pts.shape

        colour, density = gt_density_color(pts.reshape(-1, 3))
        colour = colour.reshape(num_rays, num_samples, 3)
        density = density.reshape(num_rays, num_samples)

        # do volume rendering (same formula)
        seg = z_vals[..., 1:] - z_vals[..., :-1]
        seg = torch.cat([seg, 1e10 * torch.ones_like(seg[..., :1])], -1)
        alpha = 1.0 - torch.exp(-density * seg)
        survive = torch.cat([torch.ones((num_rays, 1), device=alpha.device), 1.0 - alpha + 1e-10], -1)
        weights = alpha * torch.cumprod(survive, -1)[:, :-1]

        rendered.append((
            (weights.unsqueeze(-1) * colour).sum(dim=1),
            (weights * z_vals).sum(dim=1),
        ))

    imgs = [rgb_map for rgb_map, _ in rendered]
    depths = [depth_map for _, depth_map in rendered]
    return imgs, depths

# Quick generate 64 training views around scene
def generate_cameras(n_views=64, radius=1.2, device=device):
    """Place cameras evenly on a circle around the scene.

    All cameras sit at a fixed 10 degree elevation on a sphere of the given
    radius, look at ``(0, 0, 0.3)`` and use ``+z`` as the up vector.

    Args:
        n_views: Number of cameras (azimuth steps of ``2*pi / n_views``).
        radius: Distance of every camera from the world origin.
        device: Device on which the returned tensors are created.

    Returns:
        Three lists of ``n_views`` ``(3,)`` tensors: origins, look-at targets
        and up vectors.
    """
    elevation = math.radians(10)  # slight elevation

    def _origin(k):
        azimuth = 2*math.pi * k / n_views
        return torch.tensor(
            [
                radius * math.cos(azimuth) * math.cos(elevation),
                radius * math.sin(azimuth) * math.cos(elevation),
                radius * math.sin(elevation),
            ],
            device=device,
        )

    origins = [_origin(k) for k in range(n_views)]
    # A fresh tensor per view, so entries never alias each other.
    lookats = [torch.tensor([0.0, 0.0, 0.3], device=device) for _ in range(n_views)]
    ups = [torch.tensor([0.0, 0.0, 1.0], device=device) for _ in range(n_views)]
    return origins, lookats, ups

# Test GT render at low resolution: generate 8 cameras but render only the
# first 4. H, W, origins/lookats/ups, imgs and depths are reused by later cells.
H, W = 64, 64
origins, lookats, ups = generate_cameras(8)
# render_gt_images accepts pe_pos/pe_dir but its body never reads them, so the
# small demo encoder `pe` is passed only to satisfy the signature.
imgs, depths = render_gt_images(origins[:4], lookats[:4], ups[:4], fov=40, H=H, W=W, near=0.1, far=2.0, n_samples=64, pe_pos=pe, pe_dir=pe)
print("GT imgs rendered:", len(imgs), imgs[0].shape)

GT imgs rendered: 4 torch.Size([4096, 3])


## Dataset returning ray bundles for training

In [None]:
from torch.utils.data import Dataset, DataLoader

class RaysDataset(Dataset):
    """Per-ray dataset over a set of pre-rendered views.

    Item ``idx`` addresses pixel ``idx % (H*W)`` of view
    ``view_indices[idx]``; views are laid out back to back, ``H*W`` items
    each. Rays for every camera are generated once in the constructor.

    Args:
        imgs: List of ``(H*W, 3)`` colour tensors, one per view.
        depths: Accepted but not stored.
        origins, lookats, ups: Per-view camera vectors.
        H, W, fov: Image size and field of view used to generate the rays.
        near, far, n_samples: Stored as attributes only.
    """

    def __init__(self, imgs, depths, origins, lookats, ups, H, W, fov, near, far, n_samples):
        self.H, self.W, self.fov = H, W, fov
        self.near, self.far, self.n_samples = near, far, n_samples
        self.origins, self.lookats, self.ups = origins, lookats, ups
        self.imgs = imgs  # list of tensors (H*W,3)
        self.views = len(imgs)

        # Cache one (origins, directions) pair per camera.
        per_view = [
            get_rays_from_cam(cam_o, cam_la, cam_up, fov, H, W)
            for cam_o, cam_la, cam_up in zip(origins, lookats, ups)
        ]
        self.rays_o = [ray_origins for ray_origins, _ in per_view]
        self.rays_d = [ray_dirs for _, ray_dirs in per_view]

        # Flat lookup: which view does global item k belong to?
        self.view_indices = [view for view in range(self.views) for _ in range(H * W)]

    def __len__(self):
        """Total number of rays across all views."""
        return len(self.view_indices)

    def __getitem__(self, idx):
        """Return the ray origin, direction and target pixel for item ``idx``."""
        view = self.view_indices[idx]
        pixel_in_view = idx % (self.H * self.W)
        return {
            'rays_o': self.rays_o[view][pixel_in_view],
            'rays_d': self.rays_d[view][pixel_in_view],
            'pixel': self.imgs[view][pixel_in_view],
        }

# collate to batch rays
def collate_rays(batch):
    """Stack a list of per-ray sample dicts into one batched dict.

    Args:
        batch: Sequence of dicts with keys ``'rays_o'``, ``'rays_d'`` and
            ``'pixel'``, each holding a tensor of identical shape across
            samples.

    Returns:
        Dict with keys ``'rays_o'``, ``'rays_d'`` and ``'pixels'`` (note the
        plural), each stacked along a new leading batch axis.
    """
    def _stacked(key):
        return torch.stack([sample[key] for sample in batch], dim=0)

    return {
        'rays_o': _stacked('rays_o'),
        'rays_d': _stacked('rays_d'),
        'pixels': _stacked('pixel'),
    }

# Build dataset from previously rendered GT images.
# Only as many cameras as there are rendered images are used, so the camera
# lists are sliced to the same length as `imgs`.
train_views = len(imgs)  # from previous GT generation; you can generate more (e.g., 64)
dataset = RaysDataset(imgs, depths, origins[:train_views], lookats[:train_views], ups[:train_views], H=H, W=W, fov=40, near=0.1, far=2.0, n_samples=64)
# Shuffled batches of 1024 rays, loaded in the main process (num_workers=0).
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True, collate_fn=collate_rays, num_workers=0)
print("Dataset size:", len(dataset))

Dataset size: 16384


## Precompute rays for ALL training views

In [None]:
# Per-camera ray caches, indexed by view: one (H*W, 3) origin tensor and one
# (H*W, 3) direction tensor for every entry of `origins`.
rays_o_all = []
rays_d_all = []

# fov is hard-coded to 40 degrees here, matching the value used for the GT
# renders elsewhere in this file.
for o, la, up in zip(origins, lookats, ups):
    ro, rd = get_rays_from_cam(o, la, up, 40, H, W)
    rays_o_all.append(ro.to(device))  # store as torch tensors
    rays_d_all.append(rd.to(device))

print("Precomputed rays for:", len(rays_o_all), "views.")
print("Each rays tensor shape:", rays_o_all[0].shape, rays_d_all[0].shape)

Precomputed rays for: 8 views.
Each rays tensor shape: torch.Size([4096, 3]) torch.Size([4096, 3])


## Training Loop

In [None]:
import os

# Adam on the MLP weights; the loss is the MSE between rendered and GT colours.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.MSELoss()

# Create the checkpoint directory before training: torch.save does not create
# missing parent directories, so a missing folder would otherwise only
# surface as a failure after all iterations have already run.
ckpt_path = "outputs_task4/nerf_small.pth"
os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)

model.train()
loss_log = []

# Only views that have a rendered GT image can supervise training.
num_views = len(imgs)

for it in range(1, 4001):   # 4000 iterations

    # Pick one training view uniformly at random.
    view_idx = torch.randint(low=0, high=num_views, size=(1,)).item()

    gt_img = imgs[view_idx].to(device)          # (H*W, 3)
    ro = rays_o_all[view_idx].to(device)        # rays_o_all: precomputed rays
    rd = rays_d_all[view_idx].to(device)

    # Random batch of 1024 pixels from that view (sampled with replacement).
    idx = torch.randint(low=0, high=H*W, size=(1024,), device=device)

    rays_o = ro[idx]        # (1024,3)
    rays_d = rd[idx]        # (1024,3)
    target = gt_img[idx]    # (1024,3)

    # Jittered depths during training, then composite the model's prediction.
    pts, z_vals = sample_along_rays(rays_o, rays_d, near=0.1, far=2.0, n_samples=64, perturb=True)
    rgb_pred, depth_pred, acc, weights = volume_render_rgb(model, pts, rays_d, z_vals, pe_pos, pe_dir)

    loss = criterion(rgb_pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_log.append(loss.item())

    if it % 200 == 0:
        print(f"[Iter {it}] Loss = {loss.item():.6f}")

torch.save(model.state_dict(), ckpt_path)
print("Training completed. Saved to nerf_small.pth.")

[Iter 200] Loss = 0.005196
[Iter 400] Loss = 0.000717
[Iter 600] Loss = 0.000394
[Iter 800] Loss = 0.001225
[Iter 1000] Loss = 0.000163
[Iter 1200] Loss = 0.000101
[Iter 1400] Loss = 0.000164
[Iter 1600] Loss = 0.000095
[Iter 1800] Loss = 0.000053
[Iter 2000] Loss = 0.000069
[Iter 2200] Loss = 0.000248
[Iter 2400] Loss = 0.000341
[Iter 2600] Loss = 0.000101
[Iter 2800] Loss = 0.000025
[Iter 3000] Loss = 0.000033
[Iter 3200] Loss = 0.000991
[Iter 3400] Loss = 0.000020
[Iter 3600] Loss = 0.000016
[Iter 3800] Loss = 0.000013
[Iter 4000] Loss = 0.000009
Training completed. Saved to nerf_small.pth.


## Rendering novel views and computing PSNR/SSIM and an LPIPS-like perceptual distance (VGG features)

In [None]:
def render_novel_view(model, origin, lookat, up, fov, H, W, near, far, n_samples, pe_pos, pe_dir, chunk=4096):
    """Render one full image from the learned field, without gradients.

    Rays are processed ``chunk`` at a time, using unperturbed depths. The
    model is switched to eval mode and is left in eval mode afterwards.

    Args:
        model: Radiance-field network passed through to the volume renderer.
        origin, lookat, up: ``(3,)`` camera vectors.
        fov, H, W: Field of view (degrees) and image size.
        near, far, n_samples: Depth range and samples per ray.
        pe_pos, pe_dir: Position and direction encoders.
        chunk: Maximum number of rays rendered per forward pass.

    Returns:
        ``(H*W, 3)`` tensor of rendered colours.
    """
    model.eval()
    with torch.no_grad():
        rays_o, rays_d = get_rays_from_cam(origin, lookat, up, fov, H, W)
        total_rays = rays_o.shape[0]
        pieces = []
        for start in range(0, total_rays, chunk):
            stop = start + chunk
            chunk_o = rays_o[start:stop].to(device)
            chunk_d = rays_d[start:stop].to(device)
            pts, z_vals = sample_along_rays(chunk_o, chunk_d, near, far, n_samples, perturb=False)
            # Only the colour map (first output) is kept.
            pieces.append(volume_render_rgb(model, pts, chunk_d, z_vals, pe_pos, pe_dir)[0])
        return torch.cat(pieces, dim=0)  # (H*W,3)


def psnr_torch(img1, img2, max_val=1.0):
    """Peak signal-to-noise ratio in dB between two same-shaped tensors.

    Computes ``10 * log10(max_val**2 / MSE)``, written as a difference of
    logs so the default ``max_val=1.0`` reduces to ``-10 * log10(MSE)``.

    Args:
        img1: First image tensor.
        img2: Second image tensor, broadcastable against ``img1``.
        max_val: Peak signal value of the image range (``1.0`` for images in
            ``[0, 1]``, ``255`` for 8-bit images).

    Returns:
        Scalar tensor holding the PSNR in dB.
    """
    mse = torch.mean((img1 - img2) ** 2)
    peak = torch.as_tensor(max_val, dtype=mse.dtype, device=mse.device)
    # The epsilon keeps identical images from producing log10(0) = -inf.
    return 20.0 * torch.log10(peak) - 10.0 * torch.log10(mse + 1e-12)


def ssim_torch(img1, img2, C1=0.01**2, C2=0.03**2, win_size=7):
    """Mean SSIM over the three colour channels of two flattened images.

    Local statistics use a uniform (box) window via average pooling with
    zero padding. The images are assumed square: the side length is taken as
    ``int(sqrt(number_of_pixels))``.

    Args:
        img1, img2: ``(H*W, 3)`` flattened images.
        C1, C2: SSIM stabilisation constants.
        win_size: Side of the square averaging window.

    Returns:
        Python float: per-channel mean SSIM, averaged over the 3 channels.
    """
    pad = win_size // 2

    def _box(t):
        # Local mean over a win_size x win_size neighbourhood, same-size output.
        return F.avg_pool2d(t, kernel_size=win_size, stride=1, padding=pad)

    def _channel_score(a, b):
        # a, b: (1,1,H,W)
        mean_a, mean_b = _box(a), _box(b)
        mean_aa = mean_a * mean_a
        mean_bb = mean_b * mean_b
        mean_ab = mean_a * mean_b
        var_a = _box(a * a) - mean_aa
        var_b = _box(b * b) - mean_bb
        cov_ab = _box(a * b) - mean_ab
        numerator = (2*mean_ab + C1) * (2*cov_ab + C2)
        denominator = (mean_aa + mean_bb + C1) * (var_a + var_b + C2) + 1e-12
        return (numerator / denominator).mean()

    side = int(math.sqrt(img1.shape[0]))
    planes_a = img1.reshape(side, side, 3).permute(2, 0, 1).unsqueeze(1)  # (3,1,H,W)
    planes_b = img2.reshape(side, side, 3).permute(2, 0, 1).unsqueeze(1)
    scores = [
        _channel_score(planes_a[c:c+1], planes_b[c:c+1]).item()
        for c in range(3)
    ]
    return sum(scores) / 3.0

# LPIPS-like — use VGG feature L2 distance normalized (fallback if torchvision unavailable)
# `vgg` stays None when torchvision is missing or the weights cannot be
# loaded; consumers must check for None before calling it.
vgg = None
if torchvision is not None:
    try:
        # Newer torchvision replaces the deprecated `pretrained=True` flag
        # with a weights enum; IMAGENET1K_V1 is the checkpoint that
        # `pretrained=True` selected. Older releases lack the enum, so fall
        # back to the legacy flag there.
        _vgg16_weights = getattr(torchvision.models, "VGG16_Weights", None)
        if _vgg16_weights is not None:
            _vgg16 = torchvision.models.vgg16(weights=_vgg16_weights.IMAGENET1K_V1)
        else:
            _vgg16 = torchvision.models.vgg16(pretrained=True)
        # Convolutional feature stack only, frozen and in eval mode.
        vgg = _vgg16.features.eval().to(device)
        for p in vgg.parameters():
            p.requires_grad = False
    except Exception as e:
        # Best effort: e.g. no network access to download the weights.
        print("Could not load vgg16:", e)
        vgg = None

def lpips_like(img1, img2):
    """Perceptual distance: mean squared difference of VGG feature maps.

    When the module-level ``vgg`` is None, falls back to the plain mean
    squared pixel difference. The images are assumed square: the side length
    is taken as ``int(sqrt(number_of_pixels))``.

    Args:
        img1, img2: ``(H*W, 3)`` flattened images with values in [0, 1].

    Returns:
        Python float distance (lower means more similar).
    """
    if vgg is None:
        # fallback: simple L2 normalized distance
        return torch.mean((img1 - img2).pow(2)).item()

    side = int(math.sqrt(img1.shape[0]))
    # normalize to ImageNet mean/std
    imagenet_mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
    imagenet_std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)

    def _features(flat):
        # (H*W, 3) -> (1, 3, H, W), normalise, run through the VGG stack.
        nchw = flat.reshape(side, side, 3).permute(2, 0, 1).unsqueeze(0)
        return vgg((nchw.to(device) - imagenet_mean) / imagenet_std)

    feats_a = _features(img1)
    feats_b = _features(img2)
    return torch.mean((feats_a - feats_b).pow(2)).item()

# Render one view with the trained model and compare it against the analytic GT.
# NOTE: camera 0 is reused here, and imgs[0] is the GT for that same camera, so
# this is not a held-out view.
novel_idx = 0
# prepare more GT views for training above; here we re-use GT for eval (but normally you'd hold out)
eval_origin = origins[0]; eval_lookat = lookats[0]; eval_up = ups[0]
# The model is rendered with 128 samples per ray.
rendered = render_novel_view(model, eval_origin, eval_lookat, eval_up, fov=40, H=H, W=W, near=0.1, far=2.0, n_samples=128, pe_pos=pe_pos, pe_dir=pe_dir)
gt = imgs[0].to(device)  # from GT earlier
psnr_val = psnr_torch(rendered, gt)
# SSIM is evaluated on CPU copies; it returns a Python float.
ssim_val = ssim_torch(rendered.cpu(), gt.cpu())
lpips_val = lpips_like(rendered, gt)
print(f"PSNR: {psnr_val.item():.3f} dB, SSIM: {ssim_val:.4f}, LPIPS-like: {lpips_val:.6f}")



PSNR: 49.135 dB, SSIM: 0.9967, LPIPS-like: 0.022604


## Save novel view frames as PNG

In [None]:
import os
def save_image_tensor(img_tensor, path):
    """
    Write a flattened square image to ``path`` with torchvision.

    img_tensor: (H*W, 3) in [0,1], torch float32; values are clamped to
    [0, 1] before saving and the side length is int(sqrt(H*W)).

    Returns True when torchvision wrote the image. On any failure the error
    is printed, the original (unclamped) tensor is saved to ``path + ".pt"``
    instead, and False is returned.
    """
    side = int(math.sqrt(img_tensor.shape[0]))
    # HWC -> CHW for torchvision, clamped to the displayable range.
    chw = torch.clamp(img_tensor.reshape(side, side, 3).permute(2, 0, 1), 0.0, 1.0)

    try:
        torchvision.utils.save_image(chw, path)  # Saves .png
    except Exception as e:
        print("PNG save failed:", e)
        torch.save(img_tensor.cpu(), path + ".pt")  # fallback
        return False
    return True


# Render a 32-frame orbit around the scene and write one PNG per frame.
frames = []
n_frames = 32
origins_eval, lookats_eval, ups_eval = generate_cameras(n_frames)

os.makedirs("novel_views_png", exist_ok=True)

for i in range(n_frames):
    # Move the frame to the CPU once and reuse it for both the in-memory list
    # and the file write.
    r = render_novel_view(
        model,
        origins_eval[i],
        lookats_eval[i],
        ups_eval[i],
        40, H, W,
        0.1, 2.0,
        128,
        pe_pos, pe_dir
    ).cpu()
    frames.append(r)

    save_path = f"novel_views_png/view_{i:03d}.png"
    # save_image_tensor returns False (and reports the error itself) when the
    # PNG could not be written, so only announce real successes.
    if save_image_tensor(r, save_path):
        print("Saved:", save_path)

print("All frames saved to folder: novel_views_png")
print("NOTE: Create video locally by stitching PNGs together.")

Saved: novel_views_png/view_000.png
Saved: novel_views_png/view_001.png
Saved: novel_views_png/view_002.png
Saved: novel_views_png/view_003.png
Saved: novel_views_png/view_004.png
Saved: novel_views_png/view_005.png
Saved: novel_views_png/view_006.png
Saved: novel_views_png/view_007.png
Saved: novel_views_png/view_008.png
Saved: novel_views_png/view_009.png
Saved: novel_views_png/view_010.png
Saved: novel_views_png/view_011.png
Saved: novel_views_png/view_012.png
Saved: novel_views_png/view_013.png
Saved: novel_views_png/view_014.png
Saved: novel_views_png/view_015.png
Saved: novel_views_png/view_016.png
Saved: novel_views_png/view_017.png
Saved: novel_views_png/view_018.png
Saved: novel_views_png/view_019.png
Saved: novel_views_png/view_020.png
Saved: novel_views_png/view_021.png
Saved: novel_views_png/view_022.png
Saved: novel_views_png/view_023.png
Saved: novel_views_png/view_024.png
Saved: novel_views_png/view_025.png
Saved: novel_views_png/view_026.png
Saved: novel_views_png/view_

## Report

Neural Radiance Fields (NeRF) provide a compact, differentiable representation of 3D appearance by mapping
continuous 3D coordinates and viewing directions to emitted color and volumetric density using a multi-layer
perceptron (MLP). This approach implicitly encodes geometry—rather than storing explicit surfaces—so geometry
emerges from volumetric density predicted across sampled points along camera rays.

This simplified implementation builds an MLP that receives positional encoded 3D coordinates and view
direction encodings. Positional encoding maps low-dimensional inputs into a higher-frequency space using
sine/cosine bases, enabling the network to represent high-frequency variation (sharp edges, fine geometry)
despite using smooth activation functions. The MLP uses a density head (sigma) and a color head; the density
controls opacity along sampled ray segments while the color head conditions on both spatial features and
view direction to reproduce view-dependent effects (specular highlights).

Volume rendering integrates density-weighted colors along a ray: points are sampled between near and far
bounds; network outputs color and density; weights are computed from density and distances between samples;
and the weighted sum yields the pixel color and depth estimate. Training supervises rendered colors against
ground-truth images for many camera poses. The optimization drives the MLP to assign high density where
rays consistently intersect an object's surface, thereby implicitly reconstructing the geometry, while
view-dependent color allows realistic appearance variations.

Trade-offs: sample count, model width, and training views govern the speed-vs-quality trade-off. More samples
and larger networks improve fidelity but increase compute and memory. Higher positional-encoding frequencies
add detail but can introduce high-frequency artifacts or slow convergence. Practical speedups include
coarse-to-fine (hierarchical) sampling, smaller inference resolutions, caching learned
features, or using smaller but efficient architectures (e.g., instant-ngp hash encodings or voxel-grid
priors) for real-time rendering. For quality, increasing training views and iterations yields better geometry
and view synthesis.

In this assignment, we used a synthetic scene (two soft Gaussian density blobs with position-dependent color)
to validate the whole pipeline quickly. Despite the dataset's simplicity compared with complex natural scenes,
the model demonstrates how continuous volumetric representations capture both geometry (density) and
appearance (view-conditioned color). The provided code is modular — replace the synthetic dataset with Blender
renders and poses for real scenes. Quantitative metrics (PSNR/SSIM, plus an LPIPS-like VGG feature distance
when torchvision is available) and saved renders are provided to evaluate and compare different speed/quality
configurations.
