In [38]:
import os, glob, argparse, math, itertools
import torch, torch.nn.functional as F
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from compressai.zoo import bmshj2018_factorized, ssf2020

In [25]:
mot17_root = "BoostTrack/data/MOT17/test"
sequence = "MOT17-01-DPM"
img_folder = os.path.join(mot17_root, sequence, 'img1')
output_folder = os.path.join('BoostTrack/data/MOT17-compressed', sequence)
os.makedirs(output_folder, exist_ok=True)

In [44]:
def pad_to_multiple(x, m):
    """
    Reflect-pad so (H, W) is a multiple of m.
    SSF / other video codecs need m = 128; most image codecs work with 64.
    """
    B, C, H, W = x.shape
    Hp, Wp = (m - H % m) % m, (m - W % m) % m
    return F.pad(x, (0, Wp, 0, Hp), mode="reflect"), (H, W)

def bits_in(strings):
    return sum(len(s) * 8 for s in flatten(strings))

def flatten(l):
    for el in l:
        if isinstance(el, (list, tuple)):
            yield from flatten(el)
        else:
            yield el

In [27]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
quality = 6
model = ssf2020(quality=quality, metric='mse', pretrained=True).to(device) 
model.eval()

Downloading: "https://compressai.s3.amazonaws.com/models/v1/ssf2020-mse-6-59dfb6f9.pth.tar" to /home/jovyan/.cache/torch/hub/checkpoints/ssf2020-mse-6-59dfb6f9.pth.tar
100%|██████████| 133M/133M [00:24<00:00, 5.81MB/s] 


ScaleSpaceFlow(
  (img_encoder): Encoder(
    (0): Conv2d(3, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (3): ReLU(inplace=True)
    (4): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (5): ReLU(inplace=True)
    (6): Conv2d(128, 192, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  )
  (img_decoder): Decoder(
    (0): ConvTranspose2d(192, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), output_padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): ConvTranspose2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), output_padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): ConvTranspose2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), output_padding=(1, 1))
    (5): ReLU(inplace=True)
    (6): ConvTranspose2d(128, 3, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), output_padding=(1, 1))
  )
  (img_hyperprio

In [45]:
video_codec = True
PAD_M = 128
to_tensor = transforms.ToTensor()
frames = sorted(glob.glob(os.path.join(img_folder, "*.jpg"))
                + glob.glob(os.path.join(img_folder, "*.png")))

In [47]:
total_bits, orig_hws = 0, []
strings_list, shapes_list = [], []

clip = []
for fp in tqdm(frame_paths, desc="Loading frames"):
    img = Image.open(fp).convert("RGB")
    x   = to_tensor(img).unsqueeze(0).to(device)
    x, hw = pad_to_multiple(x, PAD_M)
    clip.append(x)
    orig_hws.append(hw)

with torch.no_grad():
    strings_list, shapes_list = model.compress(clip)

for i, (s, sh, hw) in enumerate(zip(strings_list, shapes_list, orig_hws)):
    torch.save({"strings": s, "shape": sh, "orig_hw": hw},
                os.path.join(output_folder, f"{i:06d}.pth"))
    total_bits += bits_in(s)

Loading frames: 100%|██████████| 450/450 [00:23<00:00, 19.22it/s]


In [48]:
print(f"Compressed → {output_folder}")
print(f"Total size : {total_bits/8/1024:.1f} kB")

Compressed → BoostTrack/data/MOT17-compressed/MOT17-01-DPM
Total size : 125.1 kB


In [49]:
psnr_sum = 0.0
n_pixels = 0

with torch.no_grad():
    if video_codec:
        recon_clip = model.decompress(strings_list, shapes_list)

    for i, fp in enumerate(tqdm(frame_paths, desc="Scoring")):
        if video_codec:
            x_hat = recon_clip[i]
        else:
            x_hat = model.decompress([strings_list[i]], [shapes_list[i]])[0]

        H, W  = orig_hws[i]
        x_hat = x_hat[..., :H, :W].clamp_(0, 1)

        x_ref = to_tensor(Image.open(fp).convert("RGB")).to(device)
        mse   = F.mse_loss(x_hat, x_ref)
        psnr  = -10 * torch.log10(mse)

        psnr_sum += psnr.item()
        n_pixels += H * W

avg_psnr = psnr_sum / len(frame_paths)
bpp      = total_bits / n_pixels

print(f"\nSequence average →  {bpp:.4f} bpp   |   {avg_psnr:.2f} dB PSNR")

  mse   = F.mse_loss(x_hat, x_ref)
Scoring: 100%|██████████| 450/450 [00:24<00:00, 18.15it/s]


Sequence average →  0.0011 bpp   |   40.12 dB PSNR





In [26]:
import os, glob, math, itertools, shutil, gc
import torch, torch.nn.functional as F
from PIL import Image
from tqdm.auto import tqdm

import numpy as _np
if not hasattr(_np, "object"):
    _np.object = object
    
from torchvision import transforms
from compressai.zoo import bmshj2018_factorized, ssf2020 

import cv2

In [19]:
mot17_root = "BoostTrack/data/MOT17_original/test"     
out_root   = "BoostTrack/data/MOT17/test"

codec_name = "ssf2020"        
quality    = 6

# Set GOP_SIZE=None to encode an entire sequence in one call.
GOP_SIZE   = 12 if codec_name.startswith("ssf") else None

device     = "cuda" if torch.cuda.is_available() else "cpu"
print("Using", device)

Using cuda


In [20]:
def pad_to_multiple(x, m):
    """Reflect-pad so (H, W) is divisible by *m*."""
    B, C, H, W = x.shape
    Hp, Wp = (m - H % m) % m, (m - W % m) % m
    return F.pad(x, (0, Wp, 0, Hp), mode="reflect"), (H, W)

def flatten(nested):
    for el in nested:
        if isinstance(el, (list, tuple)):
            yield from flatten(el)
        else:
            yield el

def bits_in(strings):
    return sum(len(s) * 8 for s in flatten(strings))

to_tensor = transforms.ToTensor()

In [21]:
if codec_name == "bmshj2018_factorized":
    model      = bmshj2018_factorized(quality=quality, pretrained=True).eval().to(device)
    video_code = False
    PAD_M      = 64
elif codec_name == "ssf2020":
    model      = ssf2020(quality=quality, pretrained=True).eval().to(device)
    video_code = True
    PAD_M      = 128         # SSF ↓16, hyper-prior ↑8 → needs /128
else:
    raise ValueError(f"Unknown codec {codec_name}")

print(f"Loaded {codec_name}-Q{quality}  (video={video_code})")

Loaded ssf2020-Q6  (video=True)


In [22]:
sequences = [d for d in os.listdir(mot17_root)
             if os.path.isdir(os.path.join(mot17_root, d, "img1"))]
sequences.sort()
print("Found", len(sequences), "sequences:")
for s in sequences: print(" •", s)

Found 21 sequences:
 • MOT17-01-DPM
 • MOT17-01-FRCNN
 • MOT17-01-SDP
 • MOT17-03-DPM
 • MOT17-03-FRCNN
 • MOT17-03-SDP
 • MOT17-06-DPM
 • MOT17-06-FRCNN
 • MOT17-06-SDP
 • MOT17-07-DPM
 • MOT17-07-FRCNN
 • MOT17-07-SDP
 • MOT17-08-DPM
 • MOT17-08-FRCNN
 • MOT17-08-SDP
 • MOT17-12-DPM
 • MOT17-12-FRCNN
 • MOT17-12-SDP
 • MOT17-14-DPM
 • MOT17-14-FRCNN
 • MOT17-14-SDP


In [None]:
summary = []          # will collect (seq, frames, bpp, psnr)
for seq in sequences:
    print(f"\n=== {seq} ===")
    img_dir = os.path.join(mot17_root, seq, "img1")
    out_dir = os.path.join(out_root,   seq)
    os.makedirs(out_dir, exist_ok=True)

    out_dir_img1 = os.path.join(out_root, seq, "img1")
    os.makedirs(out_dir_img1, exist_ok=True)

    det_src = os.path.join(mot17_root, seq, "det")
    det_dst = os.path.join(out_root,   seq, "det")
    if not os.path.exists(det_dst):
        shutil.copytree(det_src, det_dst)
    shutil.copy(os.path.join(mot17_root, seq, "seqinfo.ini"),
                os.path.join(out_root,   seq, "seqinfo.ini"))

    frame_paths = sorted(glob.glob(os.path.join(img_dir, "*.jpg")) +
                         glob.glob(os.path.join(img_dir, "*.png")))
    n_frames = len(frame_paths)
    print("Frames:", n_frames)

    total_bits, psnr_sum, n_pixels = 0, 0.0, 0

    if video_code:
        # ── process in chunks (=GOPs) to control memory
        gop = n_frames if GOP_SIZE is None else GOP_SIZE
        for g in range(0, n_frames, gop):
            end = min(g + gop, n_frames)
            clip, orig_hws = [], []
            for fp in frame_paths[g:end]:
                img = Image.open(fp).convert("RGB")
                x, hw = pad_to_multiple(to_tensor(img).unsqueeze(0).to(device), PAD_M)
                clip.append(x); orig_hws.append(hw)

            with torch.no_grad():
                strings, shapes = model.compress(clip)

            # save + metrics
            recon = model.decompress(strings, shapes)
            for i, (st, sh, hw, x_hat, fp) in enumerate(zip(
                    strings, shapes, orig_hws, recon, frame_paths[g:end])):
                idx = g + i
                # torch.save({"strings": st, "shape": sh, "orig_hw": hw},
                #            os.path.join(out_dir, f"{idx:06d}.pth"))
                total_bits += bits_in(st)

                H, W = hw
                x_hat = x_hat[..., :H, :W].clamp_(0, 1)
                x_ref = to_tensor(Image.open(fp).convert("RGB")).to(device)
                mse   = F.mse_loss(x_hat, x_ref)
                psnr  = -10 * torch.log10(mse)
                psnr_sum += psnr.item()
                n_pixels += H * W

                rgb8 = (x_hat.squeeze(0).permute(1, 2, 0).clamp_(0, 1).cpu().detach().numpy() * 255).round().astype('uint8')

                # write JPEG with MOT naming
                cv2.imwrite(os.path.join(out_dir_img1, f"{idx+1:06d}.jpg"),
                            cv2.cvtColor(rgb8, cv2.COLOR_RGB2BGR),
                            [cv2.IMWRITE_JPEG_QUALITY, 95])

            # free GPU mem each GOP
            del clip, recon, strings, shapes
            torch.cuda.empty_cache(); gc.collect()

    else:  # ── image codec
        for idx, fp in enumerate(tqdm(frame_paths, desc="Compressing")):
            img = Image.open(fp).convert("RGB")
            x, hw = pad_to_multiple(to_tensor(img).unsqueeze(0).to(device), PAD_M)
            with torch.no_grad():
                out = model.compress(x)
                x_hat = model.decompress(out["strings"], out["shape"])[0]

            torch.save({"strings": out["strings"], "shape": out["shape"], "orig_hw": hw},
                       os.path.join(out_dir, f"{idx:06d}.pth"))
            total_bits += bits_in(out["strings"])

            H, W  = hw
            x_hat = x_hat[..., :H, :W].clamp_(0, 1)
            mse   = F.mse_loss(x_hat, to_tensor(img).to(device))
            psnr  = -10 * torch.log10(mse)
            psnr_sum += psnr.item()
            n_pixels += H * W

    bpp  = total_bits / n_pixels
    psnr = psnr_sum / n_frames
    summary.append((seq, n_frames, bpp, psnr))
    print(f"→ {bpp:.4f} bpp   |   {psnr:.2f} dB PSNR")


=== MOT17-01-DPM ===
Frames: 450


  mse   = F.mse_loss(x_hat, x_ref)


→ 0.0384 bpp   |   40.24 dB PSNR

=== MOT17-01-FRCNN ===
Frames: 450
→ 0.0384 bpp   |   40.24 dB PSNR

=== MOT17-01-SDP ===
Frames: 450
→ 0.0384 bpp   |   40.24 dB PSNR

=== MOT17-03-DPM ===
Frames: 1500
→ 0.0288 bpp   |   41.14 dB PSNR

=== MOT17-03-FRCNN ===
Frames: 1500
→ 0.0288 bpp   |   41.14 dB PSNR

=== MOT17-03-SDP ===
Frames: 1500
→ 0.0288 bpp   |   41.14 dB PSNR

=== MOT17-06-DPM ===
Frames: 1194


  mse   = F.mse_loss(x_hat, x_ref)


→ 0.0341 bpp   |   35.63 dB PSNR

=== MOT17-06-FRCNN ===
Frames: 1194
→ 0.0341 bpp   |   36.94 dB PSNR

=== MOT17-06-SDP ===
Frames: 1194
→ 0.0341 bpp   |   35.35 dB PSNR

=== MOT17-07-DPM ===
Frames: 500


In [None]:
print("\n===========  SUMMARY  ===========")
print(f"{'Sequence':20}  Frames   BPP     PSNR")
for seq, n, bpp, psnr in summary:
    print(f"{seq:20}  {n:6d}   {bpp:5.4f}   {psnr:6.2f}")
overall_bpp  = sum(bpp*n for (_,n,bpp,_) in summary) / sum(n for (_,n,_,_) in summary)
overall_psnr = sum(psnr*n for (_,n,_,psnr) in summary) / sum(n for (_,n,_,_) in summary)
print("----------------------------------------------")
print(f"{'Overall':20}          {overall_bpp:5.4f}   {overall_psnr:6.2f}")

In [None]:
mot17_root = "BoostTrack/data/MOT17_original/train"     
out_root   = "BoostTrack/data/MOT17/train"

In [None]:
sequences = [d for d in os.listdir(mot17_root)
             if os.path.isdir(os.path.join(mot17_root, d, "img1"))]
sequences.sort()
print("Found", len(sequences), "sequences:")
for s in sequences: print(" •", s)

In [None]:
summary = []          # will collect (seq, frames, bpp, psnr)
for seq in sequences:
    print(f"\n=== {seq} ===")
    img_dir = os.path.join(mot17_root, seq, "img1")
    out_dir = os.path.join(out_root,   seq)
    os.makedirs(out_dir, exist_ok=True)

    out_dir_img1 = os.path.join(out_root, seq, "img1")
    os.makedirs(out_dir_img1, exist_ok=True)

    det_src = os.path.join(mot17_root, seq, "det")
    det_dst = os.path.join(out_root,   seq, "det")
    if not os.path.exists(det_dst):
        shutil.copytree(det_src, det_dst)
    shutil.copy(os.path.join(mot17_root, seq, "seqinfo.ini"),
                os.path.join(out_root,   seq, "seqinfo.ini"))

    frame_paths = sorted(glob.glob(os.path.join(img_dir, "*.jpg")) +
                         glob.glob(os.path.join(img_dir, "*.png")))
    n_frames = len(frame_paths)
    print("Frames:", n_frames)

    total_bits, psnr_sum, n_pixels = 0, 0.0, 0

    if video_code:
        # ── process in chunks (=GOPs) to control memory
        gop = n_frames if GOP_SIZE is None else GOP_SIZE
        for g in range(0, n_frames, gop):
            end = min(g + gop, n_frames)
            clip, orig_hws = [], []
            for fp in frame_paths[g:end]:
                img = Image.open(fp).convert("RGB")
                x, hw = pad_to_multiple(to_tensor(img).unsqueeze(0).to(device), PAD_M)
                clip.append(x); orig_hws.append(hw)

            with torch.no_grad():
                strings, shapes = model.compress(clip)

            # save + metrics
            recon = model.decompress(strings, shapes)
            for i, (st, sh, hw, x_hat, fp) in enumerate(zip(
                    strings, shapes, orig_hws, recon, frame_paths[g:end])):
                idx = g + i
                # torch.save({"strings": st, "shape": sh, "orig_hw": hw},
                #            os.path.join(out_dir, f"{idx:06d}.pth"))
                total_bits += bits_in(st)

                H, W = hw
                x_hat = x_hat[..., :H, :W].clamp_(0, 1)
                x_ref = to_tensor(Image.open(fp).convert("RGB")).to(device)
                mse   = F.mse_loss(x_hat, x_ref)
                psnr  = -10 * torch.log10(mse)
                psnr_sum += psnr.item()
                n_pixels += H * W

                rgb8 = (x_hat.squeeze(0).permute(1, 2, 0).clamp_(0, 1).cpu().detach().numpy() * 255).round().astype('uint8')

                # write JPEG with MOT naming
                cv2.imwrite(os.path.join(out_dir_img1, f"{idx+1:06d}.jpg"),
                            cv2.cvtColor(rgb8, cv2.COLOR_RGB2BGR),
                            [cv2.IMWRITE_JPEG_QUALITY, 95])

            # free GPU mem each GOP
            del clip, recon, strings, shapes
            torch.cuda.empty_cache(); gc.collect()

    else:  # ── image codec
        for idx, fp in enumerate(tqdm(frame_paths, desc="Compressing")):
            img = Image.open(fp).convert("RGB")
            x, hw = pad_to_multiple(to_tensor(img).unsqueeze(0).to(device), PAD_M)
            with torch.no_grad():
                out = model.compress(x)
                x_hat = model.decompress(out["strings"], out["shape"])[0]

            torch.save({"strings": out["strings"], "shape": out["shape"], "orig_hw": hw},
                       os.path.join(out_dir, f"{idx:06d}.pth"))
            total_bits += bits_in(out["strings"])

            H, W  = hw
            x_hat = x_hat[..., :H, :W].clamp_(0, 1)
            mse   = F.mse_loss(x_hat, to_tensor(img).to(device))
            psnr  = -10 * torch.log10(mse)
            psnr_sum += psnr.item()
            n_pixels += H * W

    bpp  = total_bits / n_pixels
    psnr = psnr_sum / n_frames
    summary.append((seq, n_frames, bpp, psnr))
    print(f"→ {bpp:.4f} bpp   |   {psnr:.2f} dB PSNR")

In [None]:
print("\n===========  SUMMARY  ===========")
print(f"{'Sequence':20}  Frames   BPP     PSNR")
for seq, n, bpp, psnr in summary:
    print(f"{seq:20}  {n:6d}   {bpp:5.4f}   {psnr:6.2f}")
overall_bpp  = sum(bpp*n for (_,n,bpp,_) in summary) / sum(n for (_,n,_,_) in summary)
overall_psnr = sum(psnr*n for (_,n,_,psnr) in summary) / sum(n for (_,n,_,_) in summary)
print("----------------------------------------------")
print(f"{'Overall':20}          {overall_bpp:5.4f}   {overall_psnr:6.2f}")