Feature extraction and prediction from a single image using EfficientNetV2 and NLF head.

In [6]:
import os, math, sys
import torch
from PIL import Image
import torchvision.transforms as T

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)


Device: cpu


In [7]:
# Ensure DATA_ROOT and PROJDIR are set before importing nlf
import os
# Prefer a local `data/` folder inside the repo if it exists, otherwise use a sensible default
repo_data = os.path.abspath('data')
if os.path.exists(repo_data) and os.listdir(repo_data):
    os.environ.setdefault('DATA_ROOT', repo_data)
else:
    os.environ.setdefault('DATA_ROOT', os.path.expanduser('~/data/posepile'))
# Set PROJDIR relative to DATA_ROOT if not already set
os.environ.setdefault('PROJDIR', f"{os.environ['DATA_ROOT']}/projects/localizerfields")
print('DATA_ROOT =', os.environ['DATA_ROOT'])
print('PROJDIR   =', os.environ['PROJDIR'])

DATA_ROOT = /Users/lemon/Documents/TUD/Thesis/Code/nlf/data
PROJDIR   = /Users/lemon/Documents/TUD/Thesis/Code/nlf/data/projects/localizerfields


In [9]:
# Initialize FLAGS with EfficientNetV2-S backbone and reasonable defaults
from nlf.pt import init as nlf_init
nlf_init.initialize([
    '--backbone', 'efficientnetv2-s',
    '--proc-side', '256',
    '--stride-test', '32',
    '--stride-train', '32',
    '--no-batch-renorm'  # use plain BatchNorm2d to keep things simple here
])
from simplepyutils import FLAGS
print('Backbone flag:', FLAGS.backbone, 'proc-side:', FLAGS.proc_side)


Backbone flag: efficientnetv2-s proc-side: 256


In [12]:
# Build EfficientNetV2 backbone without downloading pretrained weights
import torch.nn as nn
from nlf.pt.backbones import efficientnet as effnet
from nlf.pt.backbones.builder import get_normalizer

bn = get_normalizer()  # normalization layer factory configured by FLAGS
# weights=None avoids any network download
bbone_raw = effnet.efficientnet_v2_s(norm_layer=bn, weights=None)
backbone = bbone_raw.features.to(device).eval()

# Preprocessing layer used by the repo for EfficientNetV2 (mean=std=0.5)
class Preproc(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer('mean', torch.tensor([0.5,0.5,0.5]).view(3,1,1))
        self.register_buffer('std', torch.tensor([0.5,0.5,0.5]).view(3,1,1))
    def forward(self, x):
        return (x - self.mean.to(x.dtype)) / self.std.to(x.dtype)

preproc = Preproc().to(device)
print('Backbone ready')


Backbone ready


In [13]:
# Load one image from data/ (or fall back to example_image.jpg)
img_path = 'data/dance.jpg'
if not os.path.exists(img_path):
    img_path = 'example_image.jpg'
print('Using image:', img_path)
img = Image.open(img_path).convert('RGB')
transform = T.Compose([T.Resize((FLAGS.proc_side, FLAGS.proc_side)), T.ToTensor()])
x = transform(img).unsqueeze(0).to(device)  # [1,3,H,W] in [0,1]

with torch.no_grad():
    x_in = preproc(x)
    feat = backbone(x_in)
print('Feature tensor:', tuple(feat.shape))


Using image: data/dance.jpg
Feature tensor: (1, 1280, 8, 8)


Predicting with NLF head from features.
If the project assets under PROJDIR are available on this machine (joint info and canonical files),
the next cell will also compute 2D/3D predictions from the extracted feature map.

In [16]:
# Reuse the pretrained TorchScript detector submodule to get person boxes, then run the NLF head on crops
# Steps:
# 1) load scripted multiperson model and reuse its `detector` attribute
# 2) get boxes for the notebook image
# 3) for each box crop the original PIL image, resize to FLAGS.proc_side and compute features with your local backbone
# 4) run the NLF head on each crop's feature map and print shapes

from pathlib import Path
try:
    ts_path = Path('models/nlf_l_multi.torchscript')
    if not ts_path.exists():
        raise FileNotFoundError(f"TorchScript model not found at {ts_path}")

    ts = torch.jit.load(str(ts_path), map_location=device)
    print('Loaded scripted multiperson model from', ts_path)

    # Reuse detector submodule
    if not hasattr(ts, 'detector'):
        raise AttributeError('Loaded scripted model has no `detector` attribute')
    detector = ts.detector
    print('Reusing detector submodule from the scripted model')

    # Read image as uint8 tensor for the detector (the detector expects padded/resized image internally)
    import torchvision
    img_tensor_u8 = torchvision.io.read_image(img_path).to(device)  # [3,H,W], uint8
    img_batch = img_tensor_u8.unsqueeze(0)

    # Call the detector (returns list-of-tensors per image)
    boxes_list = detector(img_batch, threshold=0.3, nms_iou_threshold=0.7, max_detections=50)
    boxes = boxes_list[0] if len(boxes_list) > 0 else torch.empty((0, 5), device=device)
    print('Detector returned', boxes.shape[0], 'boxes')

    if boxes.shape[0] == 0:
        print('No detections; nothing to crop/predict')
    else:
        # Build weight field + NLFModel once and reuse for all crops
        from nlf.pt.models import field as lf_field
        from nlf.pt.models.nlf_model import NLFModel
        weight_field = lf_field.build_field().to(device).eval()
        model = NLFModel(nn.Sequential(preproc, backbone), weight_field, get_normalizer(), 1280).to(device).eval()
        canonical = model.canonical_locs().to(device)

        results = []
        for i in range(boxes.shape[0]):
            b = boxes[i].cpu().numpy()  # [cx, cy, w, h, score]
            cx, cy, w_box, h_box, score = b.tolist()
            left = max(0, int(round(cx - w_box / 2)))
            top = max(0, int(round(cy - h_box / 2)))
            right = min(img.width, int(round(left + w_box)))
            bottom = min(img.height, int(round(top + h_box)))
            if right <= left or bottom <= top:
                print(f'Invalid box {i}, skipping')
                continue

            # Crop and resize using PIL to the backbone input size
            crop_pil = img.crop((left, top, right, bottom)).resize((FLAGS.proc_side, FLAGS.proc_side))
            x_crop = transform(crop_pil).unsqueeze(0).to(device)  # [1,3,H,W] in [0,1]

            with torch.no_grad():
                x_in = preproc(x_crop)
                feat_crop = backbone(x_in)  # feature map for this crop
                # run heatmap head on crop feature
                coords2d_img, coords3d_rel, uncert = model.heatmap_head.predict_same_canonicals(feat_crop, canonical)
                W = H = FLAGS.proc_side
                f = 0.5 * W / math.tan(math.radians(55.0 / 2))
                K = torch.tensor([[f, 0, W / 2], [0, f, H / 2], [0, 0, 1]], dtype=torch.float32, device=device)[None]
                coords3d_abs, uncert_out = model.heatmap_head.reconstruct_absolute(coords2d_img, coords3d_rel, uncert, K)

            print(f'Box {i}: score={score:.3f}, crop_pixel_box=[{left},{top},{right},{bottom}], coords3d_abs:', tuple(coords3d_abs.shape))
            results.append(dict(box=[left, top, right, bottom], score=float(score), coords3d_abs=coords3d_abs, coords2d_img=coords2d_img))

    # expose results to the notebook scope
    detector_boxes = boxes
    detector_results = results if 'results' in locals() else []

except Exception as e:
    print('Detector-based crop/predict skipped:', repr(e))

Loaded scripted multiperson model from models/nlf_l_multi.torchscript
Reusing detector submodule from the scripted model
Detector returned 1 boxes
Detector-based crop/predict skipped: FileNotFoundError(2, 'No such file or directory')
Detector returned 1 boxes
Detector-based crop/predict skipped: FileNotFoundError(2, 'No such file or directory')


In [17]:
# Fit SMPLX using the scripted multiperson model and detector boxes
# This reuses the multiperson fitting pipeline (crop_model + fitter) embedded in the TorchScript model.
from pathlib import Path
try:
    ts_path = Path('models/nlf_l_multi.torchscript')
    if not ts_path.exists():
        raise FileNotFoundError(f"TorchScript model not found at {ts_path}")

    # load scripted model if not already loaded
    if 'ts' not in globals():
        ts = torch.jit.load(str(ts_path), map_location=device)
    print('Using scripted model at', ts_path)

    # Ensure detector_boxes is available
    if 'detector_boxes' not in globals():
        raise RuntimeError('detector_boxes not found — run the detector cell first')

    boxes = detector_boxes
    # scripted API expects a list of boxes per image
    boxes_list = [boxes]

    # Read image tensor (uint8) for the estimator
    import torchvision
    img_tensor_u8 = torchvision.io.read_image(img_path).to(device)  # [3,H,W]
    img_batch = img_tensor_u8.unsqueeze(0)

    # Call the estimator that fits parametric model using provided boxes
    # There are two exported variants; try the parametric estimator with smplx
    try:
        result = ts.estimate_parametric_batched(img_batch, boxes_list, model_name='smplx')
    except Exception:
        # some scripted exports may use estimate_smpl_batched alias
        result = ts.estimate_smpl_batched(img_batch, boxes_list, model_name='smplx')

    print('Fitting result keys:', list(result.keys()))
    # Print common outputs and shapes for the first detected person per image
    def shape_of(x):
        try:
            if isinstance(x, (list, tuple)):
                return tuple(x[0].shape) if len(x) > 0 else (0,)
            return tuple(x.shape)
        except Exception:
            return str(type(x))

    for k in ['pose','betas','trans','vertices3d','joints3d','vertices2d','joints2d']:
        if k in result:
            print(k, '->', shape_of(result[k]))

    # Expose fitting result in notebook scope
    smplx_fit_result = result

except Exception as e:
    print('SMPLX fitting skipped:', repr(e))

Using scripted model at models/nlf_l_multi.torchscript
SMPLX fitting skipped: RuntimeError('detector_boxes not found — run the detector cell first')
