In [29]:
# --- Cell 1: Imports & Config ---
import os, glob, math, random, json
from pathlib import Path
import numpy as np
import cv2
from collections import Counter

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

DATA_DIR = Path("data")
CLASS_NAMES = ["non_shop_lifters", "shop_lifters"]
MODELS_DIR = Path("models"); MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Training knobs (start here; tune later)
T = 8                 # frames per clip (try 8–12)
IMG_SIZE = 224       # 224 is good for MobileNet
BATCH_SIZE = 4       # clips are heavy; 4–8 on 8GB VRAM
EPOCHS_WARMUP = 6
EPOCHS_FT = 8
LR_WARMUP = 3e-4
LR_FT = 1e-4

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]


Device: cpu


In [30]:
# --- Cell 2: Index dataset & split by video ---
video_records = []
for ci, cname in enumerate(CLASS_NAMES):
    vids = sorted(glob.glob(str((DATA_DIR / cname / "*.mp4").resolve())))
    for v in vids:
        video_records.append((v, ci))
print(f"Found videos: {len(video_records)} | per class:",
      Counter([c for _, c in video_records]))

# Stratified split by class
from sklearn.model_selection import train_test_split
train_rec, tmp = train_test_split(
    video_records, test_size=0.30, random_state=SEED,
    stratify=[c for _, c in video_records]
)
val_rec, test_rec = train_test_split(
    tmp, test_size=0.50, random_state=SEED,
    stratify=[c for _, c in tmp]
)

print("Split | train:", len(train_rec), "val:", len(val_rec), "test:", len(test_rec))


Found videos: 855 | per class: Counter({0: 531, 1: 324})
Split | train: 598 val: 128 test: 129


In [31]:
# --- Canonical motion sampler (shared by train + app) ---
def sample_motion_frames(video_path, k=T, img_size=IMG_SIZE):
    import cv2, numpy as np
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened(): return None
    n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    ok, prev = cap.read()
    if not ok or n <= 1:
        cap.release(); return None
    prev_g = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
    frames = [prev]; diffs = []

    for i in range(1, n):
        ok, f = cap.read()
        if not ok: break
        frames.append(f)
        g = cv2.cvtColor(f, cv2.COLOR_BGR2GRAY)
        diffs.append((i, float(cv2.absdiff(g, prev_g).mean())))
        prev_g = g
    cap.release()
    if not diffs: return None

    k = min(k, len(diffs))
    top = sorted(sorted(diffs, key=lambda x: x[1], reverse=True)[:k], key=lambda x: x[0])
    idxs = [i for i,_ in top]

    out = []
    for i in idxs:
        f = frames[i]
        f = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
        f = cv2.resize(f, (img_size, img_size), interpolation=cv2.INTER_AREA)
        out.append(f.astype("float32"))
    clip = np.stack(out, axis=0) if out else None
    if clip is None: return None
    if clip.shape[0] < T:
        pad = np.repeat(clip[-1:], T - clip.shape[0], axis=0)
        clip = np.concatenate([clip, pad], axis=0)
    return clip  # (T,H,W,3) float32 0..255

# --- Canonical frames->tensor normalization (shared) ---
import torch
from torchvision import transforms
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

def clip_to_tensor(clip_hwc_uint8_or_float):
    import numpy as np
    frames = []
    for f in clip_hwc_uint8_or_float:
        x = to_tensor(f.astype(np.uint8))      # [0,1], CHW
        x = normalize(x)                       # ImageNet norm
        frames.append(x)
    return torch.stack(frames, dim=0)          # (T,3,H,W)



In [32]:
# --- Cell 4: Dataset that yields (T,3,H,W) normalized clips ---
class ClipDataset(Dataset):
    def __init__(self, records, t=T, img_size=IMG_SIZE, use_motion=True, aug=True):
        self.records = records
        self.t = t
        self.img_size = img_size
        self.use_motion = use_motion

        # frame-level transforms (ImageNet)
        self.to_tensor = transforms.ToTensor()
        self.normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

        # mild augmentation applied per frame
        if aug:
            self.aug = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.ColorJitter(0.1,0.1,0.1,0.05),
            ])
        else:
            self.aug = None

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        vpath, label = self.records[idx]
        clip = sample_motion_frames(vpath, k=self.t, img_size=self.img_size)
        if clip is None:
            # fallback: try re-read by uniform sampling of T indices
            cap = cv2.VideoCapture(str(vpath))
            n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            idxs = np.linspace(0, max(n-1,0), num=self.t).astype(int)
            frames=[]
            for i in idxs:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                ok, f = cap.read()
                if not ok: continue
                f = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                f = cv2.resize(f, (self.img_size, self.img_size), interpolation=cv2.INTER_AREA)
                frames.append(f.astype("float32"))
            cap.release()
            if not frames:
                raise RuntimeError(f"Could not read frames from {vpath}")
            clip = np.stack(frames, axis=0)
            if clip.shape[0] < self.t:
                pad = np.repeat(clip[-1:], self.t-clip.shape[0], axis=0)
                clip = np.concatenate([clip, pad], axis=0)

        # HWC -> CHW per frame; stack to (T,3,H,W)
        tensors = []
        for f in clip:
            img = f.astype(np.uint8)
            if self.aug:  # aug expects PIL / Tensor; we’ll apply on tensor
                x = self.to_tensor(img)
                x = self.aug(x)
            else:
                x = self.to_tensor(img)
            x = self.normalize(x)   # (3,H,W)
            tensors.append(x)
        xclip = torch.stack(tensors, dim=0)   # (T,3,H,W)
        y = torch.tensor(label, dtype=torch.long)
        return xclip, y

ds_train = ClipDataset(train_rec, aug=True)
ds_val   = ClipDataset(val_rec, aug=False)

dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
dl_val   = DataLoader(ds_val,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
len(ds_train), len(ds_val)


(598, 128)

In [33]:
# --- Cell 5 (GRU version): Temporal model ---
import torch
from torch import nn
import torchvision

class TemporalClassifier(nn.Module):
    def __init__(self, num_classes=2, backbone_name="mobilenet_v3_small",
                 hidden=128, bidirectional=True):
        super().__init__()
        # Frame encoder (shared)
        if backbone_name == "mobilenet_v3_small":
            m = torchvision.models.mobilenet_v3_small(
                weights=torchvision.models.MobileNet_V3_Small_Weights.IMAGENET1K_V1
            )
            self.backbone = m.features              # conv body
            feat_dim = 576                          # last conv channels
        else:
            raise ValueError("Backbone not supported here.")

        self.gap = nn.AdaptiveAvgPool2d(1)

        # Temporal module: BiGRU over per-frame features
        self.gru = nn.GRU(
            input_size=feat_dim,
            hidden_size=hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=bidirectional,
        )
        fc_in = hidden * (2 if bidirectional else 1)

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(fc_in, num_classes)

    def encode_frames(self, x):
        # x: (B,T,3,H,W) -> (B,T,Cfeat)
        B, T, C, H, W = x.shape
        x = x.view(B*T, C, H, W)
        f = self.backbone(x)                # (B*T, C', h, w)
        f = self.gap(f).squeeze(-1).squeeze(-1)  # (B*T, C')
        f = f.view(B, T, -1)                # (B, T, C')
        return f

    def forward(self, x):
        # Encode frames, then GRU, then temporal mean-pooling over time
        f = self.encode_frames(x)           # (B,T,Cfeat)
        out, _ = self.gru(f)                # (B,T,hidden*dir)
        f = out.mean(dim=1)                 # mean over time (robust + simple)
        f = self.dropout(f)
        return self.fc(f)

model = TemporalClassifier(num_classes=len(CLASS_NAMES), hidden=128, bidirectional=True).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR_WARMUP)
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))


In [34]:
# --- Cell 6 (GRU): Tiny overfit sanity on few clips ---
tiny = min(24, len(ds_train))
Xb, yb = [], []
for i in range(tiny):
    x,y = ds_train[i]
    Xb.append(x); yb.append(y)
Xb = torch.stack(Xb, dim=0).to(DEVICE)  # (tiny,T,3,H,W)
yb = torch.stack(yb, dim=0).to(DEVICE)

probe = TemporalClassifier(num_classes=2, hidden=128, bidirectional=True).to(DEVICE)
opt = torch.optim.Adam(probe.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss()

probe.train()
for ep in range(15):
    opt.zero_grad()
    with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
        logits = probe(Xb)
        loss = crit(logits, yb)
    loss.backward()
    nn.utils.clip_grad_norm_(probe.parameters(), max_norm=5.0)  # helps GRU stability
    opt.step()

acc = (logits.argmax(1) == yb).float().mean().item()
print("Tiny overfit acc:", acc)


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Tiny overfit acc: 1.0


In [35]:
# === CONSOLIDATED DATA PIPELINE (Windows-safe, motion clips, robust) ===
# Drop this in once. It replaces your sampler, dataset, and dataloaders.

import os, glob, random
from pathlib import Path
import numpy as np
import cv2
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms

# ---------- knobs ----------
T = 8                 # frames per clip (try 8–12 for more temporal context)
IMG_SIZE = 224        # 224 for MobileNet; use 192 for speed if needed
BATCH_SIZE = 4        # clips are heavy; 3–6 is typical on 8GB VRAM
NUM_WORKERS = 0       # Windows/notebook-safe. Increase only in a .py script with if __name__ == "__main__"
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

# ---------- helpers ----------
def is_video_ok(path):
    cap = cv2.VideoCapture(str(path))
    ok = cap.isOpened()
    n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) if ok else 0
    cap.release()
    return ok and n >= 2

def filter_records(records):
    good, bad = [], []
    for vpath, cid in records:
        if is_video_ok(vpath):
            good.append((vpath, cid))
        else:
            bad.append(vpath)
    print(f"[filter] kept: {len(good)} | dropped: {len(bad)}")
    if bad: print("  dropped examples:", bad[:5])
    return good

def sample_motion_frames(video_path, k=T, img_size=IMG_SIZE):
    """Pick the top-k frames by average frame-difference; return (T,H,W,3) float32 0..255."""
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened(): return None
    n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    ok, prev = cap.read()
    if not ok or n <= 1:
        cap.release(); return None
    prev_g = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
    frames = [prev]; diffs = []

    for i in range(1, n):
        ok, f = cap.read()
        if not ok: break
        frames.append(f)
        g = cv2.cvtColor(f, cv2.COLOR_BGR2GRAY)
        diffs.append((i, float(cv2.absdiff(g, prev_g).mean())))
        prev_g = g
    cap.release()
    if not diffs: return None

    k = min(k, len(diffs)) if diffs else 1
    top = sorted(sorted(diffs, key=lambda x: x[1], reverse=True)[:k], key=lambda x: x[0])
    idxs = [i for i, _ in top]

    out = []
    for i in idxs:
        f = frames[i]
        f = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
        f = cv2.resize(f, (img_size, img_size), interpolation=cv2.INTER_AREA)
        out.append(f.astype("float32"))
    if not out: return None
    clip = np.stack(out, axis=0)  # (t,h,w,3)
    if clip.shape[0] < T:  # pad to T
        pad = np.repeat(clip[-1:], T - clip.shape[0], axis=0)
        clip = np.concatenate([clip, pad], axis=0)
    return clip

class ClipDataset(Dataset):
    """Yields (T,3,H,W) normalized clips with robust loading/retries."""
    def __init__(self, records, t=T, img_size=IMG_SIZE, use_motion=True, aug=True, max_retries=3):
        self.records = records
        self.t = t
        self.img_size = img_size
        self.use_motion = use_motion
        self.max_retries = max_retries

        self.to_tensor = transforms.ToTensor()
        self.normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
        self.aug = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ColorJitter(0.1,0.1,0.1,0.05),
        ]) if aug else None

    def __len__(self): return len(self.records)

    def _read_clip(self, vpath):
        clip = sample_motion_frames(vpath, k=self.t, img_size=self.img_size) if self.use_motion else None
        if clip is None:
            cap = cv2.VideoCapture(str(vpath))
            if not cap.isOpened(): return None
            n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if n <= 0:
                cap.release(); return None
            idxs = np.linspace(0, n - 1, num=self.t).astype(int)
            frames = []
            for i in idxs:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                ok, f = cap.read()
                if not ok or f is None: continue
                f = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                f = cv2.resize(f, (self.img_size, self.img_size), interpolation=cv2.INTER_AREA)
                frames.append(f.astype("float32"))
            cap.release()
            if not frames: return None
            clip = np.stack(frames, axis=0)
        if clip.shape[0] < self.t:
            pad = np.repeat(clip[-1:], self.t - clip.shape[0], axis=0)
            clip = np.concatenate([clip, pad], axis=0)
        return clip

    def __getitem__(self, idx):
        tries = 0
        while tries < self.max_retries:
            vpath, label = self.records[idx]
            try:
                clip = self._read_clip(vpath)
                if clip is None: raise RuntimeError("empty clip")
                frames = []
                for f in clip:
                    x = self.to_tensor(f.astype(np.uint8))  # [0,1]
                    if self.aug: x = self.aug(x)
                    x = self.normalize(x)                   # ImageNet norm
                    frames.append(x)
                xclip = torch.stack(frames, dim=0)          # (T,3,H,W)
                return xclip, torch.tensor(label, dtype=torch.long)
            except Exception:
                tries += 1
                idx = random.randrange(len(self.records))  # try different sample
        # final fallback: zero clip
        xclip = torch.zeros((self.t, 3, self.img_size, self.img_size), dtype=torch.float32)
        return xclip, torch.tensor(self.records[0][1], dtype=torch.long)

def build_dataloaders(train_rec, val_rec):
    # filter bad videos first
    train_rec = filter_records(train_rec)
    val_rec   = filter_records(val_rec)

    ds_train = ClipDataset(train_rec, aug=True, use_motion=True)
    ds_val   = ClipDataset(val_rec,   aug=False, use_motion=True)

    dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    dl_val   = DataLoader(ds_val,   batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available())
    return ds_train, ds_val, dl_train, dl_val

# Build dataloaders (expects you already defined train_rec / val_rec earlier)
ds_train, ds_val, dl_train, dl_val = build_dataloaders(train_rec, val_rec)

print(f"[ready] train clips: {len(ds_train)} | val clips: {len(ds_val)} | "
      f"batch={BATCH_SIZE} | workers={NUM_WORKERS}")

# quick smoke test: load one batch
try:
    xb, yb = next(iter(dl_train))
    print("batch shapes:", tuple(xb.shape), tuple(yb.shape))  # expect (B,T,3,H,W), (B,)
except Exception as e:
    print("loader smoke test failed:", e)


[filter] kept: 598 | dropped: 0
[filter] kept: 128 | dropped: 0
[ready] train clips: 598 | val clips: 128 | batch=4 | workers=0
batch shapes: (4, 8, 3, 224, 224) (4,)


In [36]:
# --- Sanity A: basic counts and imbalance checks ---
from collections import Counter
from pathlib import Path
import os, glob

def class_counts(recs):
    return Counter([c for _, c in recs])

def peek_paths(recs, k=3):
    return [p for p,_ in recs[:k]]

print("DATA DIR:", DATA_DIR.resolve())
for cname in CLASS_NAMES:
    n = len(glob.glob(str(DATA_DIR / cname / "*.mp4")))
    print(f"class '{cname}' videos: {n}")

print("SPLIT sizes | train/val/test:", len(train_rec), len(val_rec), len(test_rec))
print("train counts:", class_counts(train_rec))
print("val   counts:", class_counts(val_rec))
print("test  counts:", class_counts(test_rec))

assert len(train_rec) and len(val_rec), "Empty split!"
assert set([c for _,c in train_rec]) == {0,1}, "Train split missing a class!"
assert set([c for _,c in val_rec])   == {0,1}, "Val split missing a class!"


DATA DIR: C:\Users\Apple\Desktop\CV_last\data
class 'non_shop_lifters' videos: 531
class 'shop_lifters' videos: 324
SPLIT sizes | train/val/test: 598 128 129
train counts: Counter({0: 371, 1: 227})
val   counts: Counter({0: 80, 1: 48})
test  counts: Counter({0: 80, 1: 49})


In [37]:
# --- Sanity B: save & reload the CLASS_NAMES immediately ---
import json
(MODELS_DIR).mkdir(exist_ok=True, parents=True)
with open(MODELS_DIR / "class_names.json", "w", encoding="utf-8") as f:
    json.dump(CLASS_NAMES, f)

with open(MODELS_DIR / "class_names.json", "r", encoding="utf-8") as f:
    loaded = json.load(f)
assert loaded == CLASS_NAMES, f"Class name mismatch! {loaded} vs {CLASS_NAMES}"
print("Class mapping OK:", loaded)


Class mapping OK: ['non_shop_lifters', 'shop_lifters']


In [38]:
# --- Sanity C: loader smoke & clip stats ---
xb, yb = next(iter(dl_train))
print("batch:", tuple(xb.shape), "labels:", yb.tolist())
print("per-batch std:", float(xb.std()))
assert float(xb.std()) > 0.01, "Clips look constant/blank; check sampling/paths/codecs."


batch: (4, 8, 3, 224, 224) labels: [1, 1, 1, 1]
per-batch std: 1.0036391019821167


In [39]:
# --- Training with requested prints, proper warmup (frozen backbone) + finetune (last 3 blocks) ---
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from torch import nn

# ===== 0) Class weights (helps if imbalanced) =====
y_train = np.array([c for _, c in train_rec])
cls_w = compute_class_weight(class_weight="balanced", classes=np.array([0,1]), y=y_train)
cls_w = torch.tensor(cls_w, dtype=torch.float32).to(DEVICE)
print("class weights:", cls_w.tolist())

criterion = nn.CrossEntropyLoss(weight=cls_w)
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))

def set_trainable(module, flag: bool):
    for p in module.parameters():
        p.requires_grad = flag

def run_epoch(dl, model, optimizer=None):
    train = optimizer is not None
    model.train(train)
    total_loss, correct, total = 0.0, 0, 0

    for xb, yb in dl:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            logits = model(xb)
            loss = criterion(logits, yb)

        if train:
            optimizer.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            # GRU stability + general safety
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            scaler.step(optimizer)
            scaler.update()

        total_loss += loss.item() * xb.size(0)
        correct += (logits.argmax(1) == yb).sum().item()
        total += xb.size(0)

    return total_loss/total, correct/total

# ===== 1) WARMUP: freeze ALL backbone, train only GRU + FC =====
set_trainable(model.backbone, False)     # <- freeze CNN
set_trainable(model.gru, True)
set_trainable(model.fc, True)
set_trainable(model.gap, True)           # GAP has no params but harmless

# Optimizer: only head params
head_params = list(model.gru.parameters()) + list(model.fc.parameters())
optimizer = torch.optim.Adam(head_params, lr=LR_WARMUP)

best_val = 0.0
for epoch in range(1, EPOCHS_WARMUP+1):
    tr_loss, tr_acc = run_epoch(dl_train, model, optimizer)
    va_loss, va_acc = run_epoch(dl_val,   model, optimizer=None)

    if va_acc > best_val:
        best_val = va_acc
        torch.save(model.state_dict(), MODELS_DIR / "shoplift_torch_state.pth")

    # EXACT format you requested
    print(f"loss = {va_loss:.6f}")
    print(f"val = {va_acc:.6f}")
    print(f"train = {tr_acc:.6f}")

# ===== 2) FINETUNE: unfreeze last 3 MobileNet blocks (keep early layers frozen) =====
# Freeze all first
set_trainable(model.backbone, False)
# Then unfreeze last 3 blocks
for i in range(len(model.backbone) - 3, len(model.backbone)):
    set_trainable(model.backbone[i], True)

# Keep GRU/FC trainable
set_trainable(model.gru, True)
set_trainable(model.fc, True)

# Discriminative learning rates:
#   - lower LR for (newly) unfrozen CNN blocks
#   - higher LR for temporal head (GRU/FC)
backbone_ft_params = []
for i in range(len(model.backbone) - 3, len(model.backbone)):
    backbone_ft_params += list(model.backbone[i].parameters())

optimizer = torch.optim.Adam([
    {"params": backbone_ft_params, "lr": LR_FT * 0.5},   # smaller LR for conv
    {"params": model.gru.parameters(), "lr": LR_FT},     # higher LR for head
    {"params": model.fc.parameters(), "lr": LR_FT},
])

for epoch in range(1, EPOCHS_FT+1):
    tr_loss, tr_acc = run_epoch(dl_train, model, optimizer)
    va_loss, va_acc = run_epoch(dl_val,   model, optimizer=None)

    if va_acc > best_val:
        best_val = va_acc
        torch.save(model.state_dict(), MODELS_DIR / "shoplift_torch_state.pth")

    # EXACT format again
    print(f"loss = {va_loss:.6f}")
    print(f"val = {va_acc:.6f}")
    print(f"train = {tr_acc:.6f}")

print("Best val acc:", best_val)


class weights: [0.8059298992156982, 1.3171806335449219]


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))
  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


loss = 0.532479
val = 0.882812
train = 0.652174
loss = 0.365017
val = 0.875000
train = 0.759197
loss = 0.258715
val = 0.914062
train = 0.794314
loss = 0.237759
val = 0.906250
train = 0.806020
loss = 0.206483
val = 0.921875
train = 0.814381
loss = 0.221573
val = 0.906250
train = 0.795987
loss = 0.116287
val = 0.960938
train = 0.891304
loss = 0.047598
val = 0.992188
train = 0.921405
loss = 0.008955
val = 1.000000
train = 0.963211
loss = 0.003672
val = 1.000000
train = 0.976589
loss = 0.002440
val = 1.000000
train = 0.973244
loss = 0.003858
val = 1.000000
train = 0.976589
loss = 0.001004
val = 1.000000
train = 0.969900
loss = 0.000572
val = 1.000000
train = 0.979933
Best val acc: 1.0


In [40]:
# --- Robust video-level evaluation on test set ---
from sklearn.metrics import classification_report, confusion_matrix
model.load_state_dict(torch.load(MODELS_DIR / "shoplift_torch_state.pth", map_location=DEVICE))
model.eval()

def predict_video(video_path):
    clip = sample_motion_frames(video_path, k=T, img_size=IMG_SIZE)
    if clip is None: return None
    xb = clip_to_tensor(clip).unsqueeze(0).to(DEVICE)  # (1,T,3,H,W)
    with torch.no_grad():
        logits = model(xb)
        prob = torch.softmax(logits, dim=1)[0].cpu().numpy()
    return int(prob.argmax()), prob

y_true, y_pred = [], []
for vpath, cid in test_rec:
    res = predict_video(vpath)
    if res is None: continue
    pid, _ = res
    y_true.append(cid); y_pred.append(pid)

print(classification_report(y_true, y_pred, target_names=CLASS_NAMES))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


                  precision    recall  f1-score   support

non_shop_lifters       0.99      1.00      0.99        80
    shop_lifters       1.00      0.98      0.99        49

        accuracy                           0.99       129
       macro avg       0.99      0.99      0.99       129
    weighted avg       0.99      0.99      0.99       129

Confusion matrix:
 [[80  0]
 [ 1 48]]


In [41]:
# --- Cell 8: Video-level evaluation ---
softmax = nn.Softmax(dim=1).to(DEVICE)
model.load_state_dict(torch.load(MODELS_DIR / "shoplift_torch_state.pth", map_location=DEVICE))
model.eval()

def predict_video(video_path):
    clip = sample_motion_frames(video_path, k=T, img_size=IMG_SIZE)
    if clip is None: return None
    # (T,H,W,3) -> (1,T,3,H,W)
    tlist = []
    for f in clip:
        x = transforms.functional.to_tensor(f.astype(np.uint8))
        x = transforms.functional.normalize(x, IMAGENET_MEAN, IMAGENET_STD)
        tlist.append(x)
    xb = torch.stack(tlist, dim=0).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        logits = model(xb)
        prob = softmax(logits)[0].cpu().numpy()  # (2,)
    pred = int(np.argmax(prob))
    return pred, prob

y_true, y_pred = [], []
for vpath, cid in test_rec:
    res = predict_video(vpath)
    if res is None: continue
    pid, _ = res
    y_true.append(cid); y_pred.append(pid)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred, target_names=CLASS_NAMES))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


                  precision    recall  f1-score   support

non_shop_lifters       0.99      1.00      0.99        80
    shop_lifters       1.00      0.98      0.99        49

        accuracy                           0.99       129
       macro avg       0.99      0.99      0.99       129
    weighted avg       0.99      0.99      0.99       129

Confusion matrix:
 [[80  0]
 [ 1 48]]


In [42]:
# --- Cell 9: Save class names (PyTorch) ---
with open(MODELS_DIR / "class_names.json", "w", encoding="utf-8") as f:
    json.dump(CLASS_NAMES, f)
print("Saved:", MODELS_DIR / "shoplift_torch_state.pth")
print("Saved:", MODELS_DIR / "class_names.json")


Saved: models\shoplift_torch_state.pth
Saved: models\class_names.json
