# check 1

In [1]:
import time
import cv2
import os
from camera import CameraManager, create_camera_configs_from_ips
import torch
import torch.nn as nn
from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights
import mlflow
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop
from decord import VideoReader, cpu
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np

# Clear GPU memory
torch.cuda.empty_cache()

# ============================
# MLflow Setup
# ============================
mlflow.set_tracking_uri("file:///home/smartan5070/Downloads/SlowfastTrainer-main/mlruns")


def clamp_bbox(bbox, h, w):
    """Clamp bounding box coordinates to ensure they are within image boundaries."""
    x1, y1, x2, y2 = bbox
    x1 = max(0, min(w - 1, int(x1)))
    y1 = max(0, min(h - 1, int(y1)))
    x2 = max(0, min(w, int(x2)))
    y2 = max(0, min(h, int(y2)))
    
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


class NormalizeVideo(nn.Module):
    def __init__(self, mean, std):
        super().__init__()
        self.mean = torch.tensor(mean).view(3, 1, 1, 1)
        self.std = torch.tensor(std).view(3, 1, 1, 1)
    
    def forward(self, tensor):
        return (tensor - self.mean) / self.std


def load_model_from_mlflow(run_id, device):
    """Try to load model from MLflow"""
    try:
        artifact_names = ["model", "pytorch_model", "mvit_model", "best_model"]
        
        for artifact_name in artifact_names:
            try:
                model_uri = f"runs:/{run_id}/{artifact_name}"
                print(f"Trying to load from: {model_uri}")
                model = mlflow.pytorch.load_model(model_uri)
                model.to(device)
                model.eval()
                print(f"✓ Model loaded successfully from MLflow artifact '{artifact_name}'!")
                return model
            except Exception as e:
                print(f"  Failed with artifact name '{artifact_name}': {str(e)[:100]}")
                continue
        
        return None
        
    except Exception as e:
        print(f"MLflow loading failed: {e}")
        return None


def load_model_from_local(model_path, num_classes, K, device):
    """Load model from local .pt file"""
    try:
        print(f"Loading model from local file: {model_path}")
        
        weights = MViT_V1_B_Weights.DEFAULT
        model = mvit_v1_b(weights=weights)
        
        for param in model.parameters():
            param.requires_grad = False
        
        last_fc_layer = model.head[-1]
        in_features = last_fc_layer.in_features
        model.head[-1] = nn.Linear(in_features, num_classes)
        
        blocks = list(model.blocks)
        for block in blocks[-K:]:
            for p in block.parameters():
                p.requires_grad = True
        
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        
        model.to(device)
        model.eval()
        print("✓ Model loaded successfully from local file!")
        return model
        
    except Exception as e:
        print(f"Failed to load from local file: {e}")
        return None


def run_mvit_inference(video_path, model, device):
    """Performs inference on the given video using a pre-loaded model."""
    transform = Compose([
        Resize((256, 256)),
        CenterCrop(224),
        NormalizeVideo([0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    ])
    
    frames_per_clip = 16
    test_dataset = FlatVideoDataset(video_path, transform=transform, frames_per_clip=frames_per_clip)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0, 
                            pin_memory=(device.type == 'cuda'))
    
    video_predictions = run_inference(test_loader, model, device)
    
    train_root = "/home/smartan5070/Downloads/SlowfastTrainer-main/Dataset_30Classes_Cam107-18_SPLIT/train"
    class_names = sorted(os.listdir(train_root))
    idx_to_class_name = {i: name for i, name in enumerate(class_names)}
    
    for path, prediction_idx in video_predictions.items():
        predicted_class = idx_to_class_name.get(prediction_idx, f"UNKNOWN_INDEX_{prediction_idx}")
        print(f"File: {os.path.basename(path):<50} -> Predicted Class: {predicted_class}")


class FlatVideoDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None, frames_per_clip=None):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.video_paths = []
        self._build_index()

    def _build_index(self):
        if os.path.isfile(self.root_dir) and self.root_dir.lower().endswith(".mp4"):
            self.video_paths.append(self.root_dir)
        elif os.path.isdir(self.root_dir):
            for fname in os.listdir(self.root_dir):
                if fname.lower().endswith(".mp4"):
                    self.video_paths.append(os.path.join(self.root_dir, fname))
        else:
            raise ValueError(f"{self.root_dir} is neither a .mp4 file nor a directory")

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = -1
        
        try:
            vr = VideoReader(path, ctx=cpu(0))
            total_frames = len(vr)

            if total_frames < self.frames_per_clip:
                base = np.linspace(0, total_frames - 1, total_frames).astype(int)
                pad = self.frames_per_clip - total_frames
                frame_indices = np.concatenate([base, np.full((pad,), base[-1], dtype=int)])
            else:
                frame_indices = np.linspace(0, total_frames - 1, self.frames_per_clip).astype(int)

            frames = vr.get_batch(frame_indices).asnumpy()

            if frames.shape[-1] == 1:
                frames = np.repeat(frames, 3, axis=-1)
            elif frames.shape[-1] != 3:
                raise ValueError(f"Unsupported channel count: {frames.shape[-1]} in video {path}")

            frames = torch.from_numpy(frames).permute(3, 0, 1, 2).float() / 255.0

            if self.transform:
                frames = self.transform(frames)

            return frames, label, path

        except Exception as e:
            print(f"Failed to load video: {path}\nError: {e}")
            return self.__getitem__((idx + 1) % len(self))


def run_inference(test_loader, model, device):
    all_predictions = []
    all_video_paths = []
    
    model.eval()
    with torch.no_grad():
        for inputs, _, paths in tqdm(test_loader, desc="Inference"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            print(f"Predicted: {predicted}")
            all_predictions.extend(predicted.cpu().numpy().tolist())
            all_video_paths.extend(paths)

    return dict(zip(all_video_paths, all_predictions))


# Camera IPs and configurations
camera_ips = ["192.168.0.101"]
camera_configs = create_camera_configs_from_ips(camera_ips)

# Initialize camera manager
manager = CameraManager(display_width=640, display_height=480)

# Add cameras
for i, config in enumerate(camera_configs):
    camera_id = f"cam_{i+1}"
    manager.add_camera(camera_id, config)

print(f"Created {manager.get_camera_count()} cameras")
for camera_id in manager.get_camera_ids():
    camera = manager.get_camera(camera_id)
    print(f"  {camera_id}: {camera.get_name()} - {camera.get_config()['url']}")

# Define output folder path
output_folder = '/home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames'
os.makedirs(output_folder, exist_ok=True)

# Start cameras
manager.start_all_cameras()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# LOAD MODEL WITH FALLBACK
# ============================
print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)

run_id = "840f3d39813c41ba9880859c83a82b01"
local_model_path = "/home/smartan5070/Downloads/SlowfastTrainer-main/Models/Testing_30Classes_Cam10718/Testing_30Classes_Cam10718.pt"
num_classes = 30
K = 3

print("\n1. Trying to load from MLflow...")
mvit_model = load_model_from_mlflow(run_id, device)

if mvit_model is None:
    print("\n2. MLflow failed, trying local .pt file...")
    mvit_model = load_model_from_local(local_model_path, num_classes, K, device)

if mvit_model is None:
    print("\n✗ FAILED TO LOAD MODEL!")
    print("Please check:")
    print("1. MLflow run ID is correct")
    print("2. Local .pt file exists at:", local_model_path)
    print("3. num_classes and K parameters match your training setup")
    exit(1)

print("\n✓ Model loaded and ready!")
print("="*60 + "\n")

try:
    frame_batch = []
    BATCH_SIZE = 64
    fixed_crop_area = None
    model_yolo = YOLO("yolov8n.pt").to(device)
    
    # Frame rate control
    TARGET_FPS = 25
    frame_interval = 1.0 / TARGET_FPS  # Time between frames in seconds
    last_frame_hash = None
    
    print(f"Starting frame collection (Target: {BATCH_SIZE} frames at {TARGET_FPS} FPS)")
    print(f"Estimated collection time: {BATCH_SIZE/TARGET_FPS:.1f} seconds")
    print("-" * 60)

    while True:
        # Get current frame from camera
        frames = manager.get_frames()
        
        frame_added = False
        for cam_id, frame in frames.items():
            if frame is not None:
                # Create a simple hash to check if frame is different
                current_hash = hash(frame.tobytes())
                
                # Only add if it's a different frame
                if current_hash != last_frame_hash:
                    frame_batch.append(frame.copy())
                    last_frame_hash = current_hash
                    frame_added = True
                    print(f"✓ Frame {len(frame_batch)}/{BATCH_SIZE} collected", end='\r')
                
                break  # Only one camera
        
        # Control frame rate - wait for next frame
        if frame_added:
            time.sleep(frame_interval)
        else:
            # If same frame, wait a bit before checking again
            time.sleep(0.001)

        # Process batch when we have enough frames
        if len(frame_batch) >= BATCH_SIZE:
            print(f"\n\n{'='*60}")
            print(f"Processing batch of {len(frame_batch)} frames...")
            print('='*60)

            # STEP 1 — DETECT PERSON IN FIRST FRAME (RESET EACH TIME)
            fixed_crop_area = None  # Reset crop area for each new batch
            first_frame = frame_batch[0]
            results = model_yolo(first_frame, conf=0.5, verbose=False)
            r = results[0]

            best_box = None
            max_area = 0
            frame_h, frame_w = first_frame.shape[:2]

            for box in r.boxes:
                if int(box.cls.item()) == 0:  # person class
                    xyxy = box.xyxy[0].cpu().numpy()
                    x1, y1, x2, y2 = xyxy

                    clamped = clamp_bbox((x1, y1, x2, y2), frame_h, frame_w)
                    if not clamped:
                        continue

                    x1, y1, x2, y2 = clamped
                    area = (x2 - x1) * (y2 - y1)

                    if area > max_area:
                        max_area = area
                        best_box = (int(x1), int(y1), int(x2), int(y2))

            # STEP 2 — IF PERSON FOUND, SET CROP BOX
            if best_box is not None:
                x1, y1, x2, y2 = best_box
                PADDING = 15

                padded_box = (
                    max(0, x1 - PADDING),
                    max(0, y1 - PADDING),
                    min(frame_w, x2 + PADDING),
                    min(frame_h, y2 + PADDING)
                )
                fixed_crop_area = padded_box
                print(f"✓ Person detected at: {best_box}")
            else:
                print("✗ No person detected in first frame")

            # STEP 3 — If no crop area → skip this batch
            if fixed_crop_area is None:
                print("✗ No person detected in this batch. Skipping and resetting.")
                frame_batch = []
                last_frame_hash = None
                continue

            # STEP 4 — CROP ALL FRAMES
            x1, y1, x2, y2 = fixed_crop_area
            cropped_batch = []

            print(f"Cropping {len(frame_batch)} frames to region: ({x1}, {y1}) -> ({x2}, {y2})")
            for frame in frame_batch:
                crop = frame[y1:y2, x1:x2]
                cropped_batch.append(crop)

            # STEP 5 — SAVE VIDEO PROPERLY
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            video_filename = f"cropped_video_{timestamp}.mp4"
            video_path = os.path.join(output_folder, video_filename)

            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            H, W = cropped_batch[0].shape[:2]
            
            print(f"Saving video: {video_filename} ({W}x{H} @ {TARGET_FPS} FPS)")
            writer = cv2.VideoWriter(video_path, fourcc, TARGET_FPS, (W, H))

            for f in cropped_batch:
                writer.write(f)

            writer.release()
            print(f"✓ Video saved successfully!")

            # Verify video was saved correctly
            if os.path.exists(video_path):
                file_size = os.path.getsize(video_path) / 1024  # KB
                print(f"  File size: {file_size:.2f} KB")
                
                # Quick verification
                cap = cv2.VideoCapture(video_path)
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                cap.release()
                print(f"  Frames in video: {total_frames}")

            # STEP 6 — RUN ACTION RECOGNITION
            print("\nRunning action recognition...")
            run_mvit_inference(video_path, mvit_model, device)

            # RESET FOR NEXT BATCH
            print(f"\n{'='*60}")
            print("Batch complete! Waiting for next batch...")
            print(f"{'='*60}\n")
            
            frame_batch = []
            last_frame_hash = None

        # Quit with 'q' key
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("\nQuitting...")
            break

finally:
    manager.stop_all_cameras()
    cv2.destroyAllWindows()
    print("Cleanup complete.")

Added camera cam_1: Camera 1
Created 1 cameras
  cam_1: Camera 1 - rtsp://admin:admin%40123@192.168.0.101:554/stream1


  return FileStore(store_uri, store_uri)
  from .autonotebook import tqdm as notebook_tqdm


Started camera cam_1
Started 1/1 cameras

LOADING MODEL

1. Trying to load from MLflow...
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'model': Failed to download artifacts from path 'model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/pytorch_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'pytorch_model': Failed to download artifacts from path 'pytorch_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/mvit_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'mvit_model': Failed to download artifacts from path 'mvit_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/best_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 186.64it/s]  


✓ Model loaded successfully from MLflow artifact 'best_model'!

✓ Model loaded and ready!

Starting frame collection (Target: 64 frames at 25 FPS)
Estimated collection time: 2.6 seconds
------------------------------------------------------------
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skippin

Inference: 100%|██████████| 1/1 [00:00<00:00, 22.19it/s]


Predicted: tensor([13], device='cuda:0')
File: cropped_video_20251205_113407.mp4                  -> Predicted Class: db_seated_hammercurl

Batch complete! Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (551, 122, 622, 233)
Cropping 64 frames to region: (536, 107) -> (637, 248)
Saving video: cropped_video_20251205_113409.mp4 (101x141 @ 25 FPS)
✓ Video saved successfully!
  File size: 78.88 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 35.13it/s]

Predicted: tensor([2], device='cuda:0')
File: cropped_video_20251205_113409.mp4                  -> Predicted Class: HammerCurls

Batch complete! Waiting for next batch...

✓ Frame 3/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (559, 124, 623, 233)
Cropping 64 frames to region: (544, 109) -> (638, 248)
Saving video: cropped_video_20251205_113412.mp4 (94x139 @ 25 FPS)
✓ Video saved successfully!
  File size: 66.38 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 36.51it/s]

Predicted: tensor([13], device='cuda:0')
File: cropped_video_20251205_113412.mp4                  -> Predicted Class: db_seated_hammercurl

Batch complete! Waiting for next batch...

✓ Frame 2/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detect

Inference: 100%|██████████| 1/1 [00:00<00:00, 27.27it/s]

Predicted: tensor([2], device='cuda:0')
File: cropped_video_20251205_113530.mp4                  -> Predicted Class: HammerCurls

Batch complete! Waiting for next batch...

✓ Frame 2/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (243, 92, 353, 464)
Cropping 64 frames to region: (228, 77) -> (368, 479)
Saving video: cropped_video_20251205_113533.mp4 (140x402 @ 25 FPS)
✓ Video saved successfully!
  File size: 165.50 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.71it/s]

Predicted: tensor([2], device='cuda:0')
File: cropped_video_20251205_113533.mp4                  -> Predicted Class: HammerCurls

Batch complete! Waiting for next batch...

✓ Frame 2/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (529, 114, 575, 204)
Cropping 64 frames to region: (514, 99) -> (590, 219)
Saving video: cropped_video_20251205_113546.mp4 (76x120 @ 25 FPS)
✓ Video saved successfully!
  File size: 28.38 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 36.78it/s]

Predicted: tensor([1], device='cuda:0')
File: cropped_video_20251205_113546.mp4                  -> Predicted Class: FrontRaises

Batch complete! Waiting for next batch...

✓ Frame 4/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (526, 115, 574, 203)
Cropping 64 frames to region: (511, 100) -> (589, 218)
Saving video: cropped_video_20251205_113552.mp4 (78x118 @ 25 FPS)
✓ Video saved successfully!
  File size: 31.49 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 36.69it/s]

Predicted: tensor([15], device='cuda:0')
File: cropped_video_20251205_113552.mp4                  -> Predicted Class: dumbbell_incline_chest_press

Batch complete! Waiting for next batch...

✓ Frame 4/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (526, 115, 574, 205)
Cropping 64 frames to region: (511, 100) -> (589, 220)
Saving video: cropped_video_20251205_113557.mp4 (78x120 @ 25 FPS)
✓ Video saved successfully!
  File size: 30.92 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 36.82it/s]

Predicted: tensor([1], device='cuda:0')
File: cropped_video_20251205_113557.mp4                  -> Predicted Class: FrontRaises

Batch complete! Waiting for next batch...

✓ Frame 5/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (526, 115, 574, 203)
Cropping 64 frames to region: (511, 100) -> (589, 218)
Saving video: cropped_video_20251205_113605.mp4 (78x118 @ 25 FPS)
✓ Video saved successfully!
  File size: 33.02 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 36.59it/s]

Predicted: tensor([15], device='cuda:0')
File: cropped_video_20251205_113605.mp4                  -> Predicted Class: dumbbell_incline_chest_press

Batch complete! Waiting for next batch...

✓ Frame 2/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.
✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (275, 76, 371, 417)
Cropping 64 frames to region: (260, 61) -> (386, 432)
Saving video: cropped_video_20251205_113613.mp4 (126x371 @ 25 FPS)
✓ Video saved successfully!
  File size: 240.55 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 28.62it/s]

Predicted: tensor([9], device='cuda:0')
File: cropped_video_20251205_113613.mp4                  -> Predicted Class: chest_rows

Batch complete! Waiting for next batch...

✓ Frame 3/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (301, 71, 343, 245)
Cropping 64 frames to region: (286, 56) -> (358, 260)
Saving video: cropped_video_20251205_113616.mp4 (72x204 @ 25 FPS)
✓ Video saved successfully!
  File size: 70.04 KB
  Frames in video: 64

Running action recognition...


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Failed to load video: /home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames/cropped_video_20251205_113616.mp4
Error: [11:36:16] /github/workspace/src/runtime/ndarray.cc:171: Check failed: from_size == to_size (58752 vs. 44064) DECORDArrayCopyFromTo: The size must exactly match
Failed to load video: /home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames/cropped_video_20251205_113616.mp4
Error: [11:36:16] /github/workspace/src/runtime/ndarray.cc:171: Check failed: from_size == to_size (58752 vs. 44064) DECORDArrayCopyFromTo: The size must exactly match
Failed to load video: /home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames/cropped_video_20251205_113616.mp4
Error: [11:36:16] /github/workspace/src/runtime/ndarray.cc:171: Check failed: from_size == to_size (58752 vs. 44064) DECORDArrayCopyFromTo: The size must exactly match
Failed to load video: /home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames/cr

: 

In [None]:
import time
import cv2
import os
from camera import CameraManager, create_camera_configs_from_ips
import torch
import torch.nn as nn
from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights
import mlflow
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop
from decord import VideoReader, cpu
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np
import gc

# Clear GPU memory
torch.cuda.empty_cache()

# ============================
# MLflow Setup
# ============================
mlflow.set_tracking_uri("file:///home/smartan5070/Downloads/SlowfastTrainer-main/mlruns")


def clamp_bbox(bbox, h, w):
    """Clamp bounding box coordinates to ensure they are within image boundaries."""
    x1, y1, x2, y2 = bbox
    x1 = max(0, min(w - 1, int(x1)))
    y1 = max(0, min(h - 1, int(y1)))
    x2 = max(0, min(w, int(x2)))
    y2 = max(0, min(h, int(y2)))
    
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


class NormalizeVideo(nn.Module):
    def __init__(self, mean, std):
        super().__init__()
        self.mean = torch.tensor(mean).view(3, 1, 1, 1)
        self.std = torch.tensor(std).view(3, 1, 1, 1)
    
    def forward(self, tensor):
        return (tensor - self.mean) / self.std


def load_model_from_mlflow(run_id, device):
    """Try to load model from MLflow"""
    try:
        artifact_names = ["model", "pytorch_model", "mvit_model", "best_model"]
        
        for artifact_name in artifact_names:
            try:
                model_uri = f"runs:/{run_id}/{artifact_name}"
                print(f"Trying to load from: {model_uri}")
                model = mlflow.pytorch.load_model(model_uri)
                model.to(device)
                model.eval()
                print(f"✓ Model loaded successfully from MLflow artifact '{artifact_name}'!")
                return model
            except Exception as e:
                print(f"  Failed with artifact name '{artifact_name}': {str(e)[:100]}")
                continue
        
        return None
        
    except Exception as e:
        print(f"MLflow loading failed: {e}")
        return None


def load_model_from_local(model_path, num_classes, K, device):
    """Load model from local .pt file"""
    try:
        print(f"Loading model from local file: {model_path}")
        
        weights = MViT_V1_B_Weights.DEFAULT
        model = mvit_v1_b(weights=weights)
        
        for param in model.parameters():
            param.requires_grad = False
        
        last_fc_layer = model.head[-1]
        in_features = last_fc_layer.in_features
        model.head[-1] = nn.Linear(in_features, num_classes)
        
        blocks = list(model.blocks)
        for block in blocks[-K:]:
            for p in block.parameters():
                p.requires_grad = True
        
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        
        model.to(device)
        model.eval()
        print("✓ Model loaded successfully from local file!")
        return model
        
    except Exception as e:
        print(f"Failed to load from local file: {e}")
        return None


def run_mvit_inference(video_path, model, device):
    """Performs inference on the given video using a pre-loaded model."""
    transform = Compose([
        Resize((256, 256)),
        CenterCrop(224),
        NormalizeVideo([0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    ])
    
    frames_per_clip = 16
    test_dataset = FlatVideoDataset(video_path, transform=transform, frames_per_clip=frames_per_clip)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0, 
                            pin_memory=(device.type == 'cuda'))
    
    video_predictions = run_inference(test_loader, model, device)
    
    train_root = "/home/smartan5070/Downloads/SlowfastTrainer-main/Dataset_30Classes_Cam107-18_SPLIT/train"
    class_names = sorted(os.listdir(train_root))
    idx_to_class_name = {i: name for i, name in enumerate(class_names)}
    
    for path, prediction_idx in video_predictions.items():
        predicted_class = idx_to_class_name.get(prediction_idx, f"UNKNOWN_INDEX_{prediction_idx}")
        print(f"File: {os.path.basename(path):<50} -> Predicted Class: {predicted_class}")
    
    # Clear memory after inference
    del test_dataset, test_loader, video_predictions
    torch.cuda.empty_cache()


class FlatVideoDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None, frames_per_clip=None):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.video_paths = []
        self._build_index()

    def _build_index(self):
        if os.path.isfile(self.root_dir) and self.root_dir.lower().endswith(".mp4"):
            self.video_paths.append(self.root_dir)
        elif os.path.isdir(self.root_dir):
            for fname in os.listdir(self.root_dir):
                if fname.lower().endswith(".mp4"):
                    self.video_paths.append(os.path.join(self.root_dir, fname))
        else:
            raise ValueError(f"{self.root_dir} is neither a .mp4 file nor a directory")

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = -1
        
        try:
            vr = VideoReader(path, ctx=cpu(0))
            total_frames = len(vr)

            if total_frames < self.frames_per_clip:
                base = np.linspace(0, total_frames - 1, total_frames).astype(int)
                pad = self.frames_per_clip - total_frames
                frame_indices = np.concatenate([base, np.full((pad,), base[-1], dtype=int)])
            else:
                frame_indices = np.linspace(0, total_frames - 1, self.frames_per_clip).astype(int)

            frames = vr.get_batch(frame_indices).asnumpy()

            if frames.shape[-1] == 1:
                frames = np.repeat(frames, 3, axis=-1)
            elif frames.shape[-1] != 3:
                raise ValueError(f"Unsupported channel count: {frames.shape[-1]} in video {path}")

            frames = torch.from_numpy(frames).permute(3, 0, 1, 2).float() / 255.0

            if self.transform:
                frames = self.transform(frames)

            return frames, label, path

        except Exception as e:
            print(f"Failed to load video: {path}\nError: {e}")
            return self.__getitem__((idx + 1) % len(self))


def run_inference(test_loader, model, device):
    all_predictions = []
    all_video_paths = []
    
    model.eval()
    with torch.no_grad():
        for inputs, _, paths in tqdm(test_loader, desc="Inference"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            print(f"Predicted: {predicted}")
            all_predictions.extend(predicted.cpu().numpy().tolist())
            all_video_paths.extend(paths)
            
            # Clear batch from GPU
            del inputs, outputs
    
    return dict(zip(all_video_paths, all_predictions))


def clear_memory():
    """Explicitly clear memory"""
    gc.collect()
    torch.cuda.empty_cache()


# Camera IPs and configurations
camera_ips = ["192.168.0.101"]
camera_configs = create_camera_configs_from_ips(camera_ips)

# Initialize camera manager
manager = CameraManager(display_width=640, display_height=480)

# Add cameras
for i, config in enumerate(camera_configs):
    camera_id = f"cam_{i+1}"
    manager.add_camera(camera_id, config)

print(f"Created {manager.get_camera_count()} cameras")
for camera_id in manager.get_camera_ids():
    camera = manager.get_camera(camera_id)
    print(f"  {camera_id}: {camera.get_name()} - {camera.get_config()['url']}")

# Define output folder path
output_folder = '/home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames'
os.makedirs(output_folder, exist_ok=True)

# Start cameras
manager.start_all_cameras()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# LOAD MODEL WITH FALLBACK
# ============================
print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)

run_id = "840f3d39813c41ba9880859c83a82b01"
local_model_path = "/home/smartan5070/Downloads/SlowfastTrainer-main/Models/Testing_2Classes_Cam10718/Testing_21_acc_98_MViT.pt"
num_classes = 21
K = 3

print("\n1. Trying to load from MLflow...")
mvit_model = load_model_from_mlflow(run_id, device)

if mvit_model is None:
    print("\n2. MLflow failed, trying local .pt file...")
    mvit_model = load_model_from_local(local_model_path, num_classes, K, device)

if mvit_model is None:
    print("\n✗ FAILED TO LOAD MODEL!")
    print("Please check:")
    print("1. MLflow run ID is correct")
    print("2. Local .pt file exists at:", local_model_path)
    print("3. num_classes and K parameters match your training setup")
    exit(1)

print("\n✓ Model loaded and ready!")
print("="*60 + "\n")

try:
    frame_batch = []
    BATCH_SIZE = 64
    model_yolo = YOLO("yolov8n.pt").to(device)
    
    # Frame rate control
    TARGET_FPS = 25
    frame_interval = 1.0 / TARGET_FPS
    last_frame_hash = None
    
    print(f"Starting frame collection (Target: {BATCH_SIZE} frames at {TARGET_FPS} FPS)")
    print(f"Estimated collection time: {BATCH_SIZE/TARGET_FPS:.1f} seconds")
    print("-" * 60)

    while True:
        # Get current frame from camera
        frames = manager.get_frames()
        
        frame_added = False
        for cam_id, frame in frames.items():
            if frame is not None:
                # Create a simple hash to check if frame is different
                current_hash = hash(frame.tobytes())
                
                # Only add if it's a different frame
                if current_hash != last_frame_hash:
                    frame_batch.append(frame.copy())
                    last_frame_hash = current_hash
                    frame_added = True
                    print(f"✓ Frame {len(frame_batch)}/{BATCH_SIZE} collected", end='\r')
                
                break  # Only one camera
        
        # Control frame rate - wait for next frame
        if frame_added:
            time.sleep(frame_interval)
        else:
            time.sleep(0.001)

        # Process batch when we have enough frames
        if len(frame_batch) >= BATCH_SIZE:
            print(f"\n\n{'='*60}")
            print(f"Processing batch of {len(frame_batch)} frames...")
            print('='*60)

            # Variables to track for cleanup
            cropped_batch = None
            writer = None
            cap = None
            
            try:
                # STEP 1 — DETECT PERSON IN FIRST FRAME
                first_frame = frame_batch[0]
                results = model_yolo(first_frame, conf=0.5, verbose=False)
                r = results[0]

                best_box = None
                max_area = 0
                frame_h, frame_w = first_frame.shape[:2]

                for box in r.boxes:
                    if int(box.cls.item()) == 0:  # person class
                        xyxy = box.xyxy[0].cpu().numpy()
                        x1, y1, x2, y2 = xyxy

                        clamped = clamp_bbox((x1, y1, x2, y2), frame_h, frame_w)
                        if not clamped:
                            continue

                        x1, y1, x2, y2 = clamped
                        area = (x2 - x1) * (y2 - y1)

                        if area > max_area:
                            max_area = area
                            best_box = (int(x1), int(y1), int(x2), int(y2))

                # Clear YOLO results from GPU
                del results, r
                torch.cuda.empty_cache()

                # STEP 2 — IF PERSON FOUND, SET CROP BOX
                fixed_crop_area = None
                if best_box is not None:
                    x1, y1, x2, y2 = best_box
                    PADDING = 15

                    padded_box = (
                        max(0, x1 - PADDING),
                        max(0, y1 - PADDING),
                        min(frame_w, x2 + PADDING),
                        min(frame_h, y2 + PADDING)
                    )
                    fixed_crop_area = padded_box
                    print(f"✓ Person detected at: {best_box}")
                else:
                    print("✗ No person detected in first frame")

                # STEP 3 — If no crop area → skip this batch
                if fixed_crop_area is None:
                    print("✗ No person detected in this batch. Skipping and resetting.")
                    frame_batch.clear()
                    last_frame_hash = None
                    clear_memory()
                    continue

                # STEP 4 — CROP ALL FRAMES
                x1, y1, x2, y2 = fixed_crop_area
                cropped_batch = []

                print(f"Cropping {len(frame_batch)} frames to region: ({x1}, {y1}) -> ({x2}, {y2})")
                for frame in frame_batch:
                    crop = frame[y1:y2, x1:x2]
                    cropped_batch.append(crop)

                # STEP 5 — SAVE VIDEO PROPERLY
                timestamp = time.strftime("%Y%m%d_%H%M%S")
                video_filename = f"cropped_video_{timestamp}.mp4"
                video_path = os.path.join(output_folder, video_filename)

                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                H, W = cropped_batch[0].shape[:2]
                
                print(f"Saving video: {video_filename} ({W}x{H} @ {TARGET_FPS} FPS)")
                writer = cv2.VideoWriter(video_path, fourcc, TARGET_FPS, (W, H))

                for f in cropped_batch:
                    writer.write(f)

                writer.release()
                writer = None
                print(f"✓ Video saved successfully!")

                # Verify video was saved correctly
                if os.path.exists(video_path):
                    file_size = os.path.getsize(video_path) / 1024
                    print(f"  File size: {file_size:.2f} KB")
                    
                    cap = cv2.VideoCapture(video_path)
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    cap.release()
                    cap = None
                    print(f"  Frames in video: {total_frames}")

                # STEP 6 — RUN ACTION RECOGNITION
                print("\nRunning action recognition...")
                run_mvit_inference(video_path, mvit_model, device)

            except Exception as e:
                print(f"\n✗ Error during processing: {e}")
                import traceback
                traceback.print_exc()
            
            finally:
                # CRITICAL: Always clean up, even if error occurs
                if writer is not None:
                    writer.release()
                if cap is not None:
                    cap.release()
                
                # Clear all batch data
                frame_batch.clear()
                if cropped_batch is not None:
                    cropped_batch.clear()
                    del cropped_batch
                
                last_frame_hash = None
                
                # Force garbage collection and GPU memory clear
                clear_memory()
                
                print(f"\n{'='*60}")
                print("Batch complete! Memory cleared. Waiting for next batch...")
                print(f"{'='*60}\n")

        # Quit with 'q' key
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("\nQuitting...")
            break

finally:
    manager.stop_all_cameras()
    cv2.destroyAllWindows()
    clear_memory()
    print("Cleanup complete.")

Added camera cam_1: Camera 1
Created 1 cameras
  cam_1: Camera 1 - rtsp://admin:admin%40123@192.168.0.101:554/stream1


  return FileStore(store_uri, store_uri)
  from .autonotebook import tqdm as notebook_tqdm


Started camera cam_1
Started 1/1 cameras

LOADING MODEL

1. Trying to load from MLflow...
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'model': Failed to download artifacts from path 'model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/pytorch_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'pytorch_model': Failed to download artifacts from path 'pytorch_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/mvit_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'mvit_model': Failed to download artifacts from path 'mvit_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/best_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 180.18it/s]  


✓ Model loaded successfully from MLflow artifact 'best_model'!

✓ Model loaded and ready!

Starting frame collection (Target: 64 frames at 25 FPS)
Estimated collection time: 2.6 seconds
------------------------------------------------------------
✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detec

Inference: 100%|██████████| 1/1 [00:00<00:00, 19.66it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_124001.mp4                  -> Predicted Class: BicepsCurls






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (209, 83, 429, 441)
Cropping 64 frames to region: (194, 68) -> (444, 456)
Saving video: cropped_video_20251205_124004.mp4 (250x388 @ 25 FPS)
✓ Video saved successfully!
  File size: 384.13 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 26.49it/s]

Predicted: tensor([3], device='cuda:0')
File: cropped_video_20251205_124004.mp4                  -> Predicted Class: LateralRaise






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (263, 84, 377, 443)
Cropping 64 frames to region: (248, 69) -> (392, 458)
Saving video: cropped_video_20251205_124007.mp4 (144x389 @ 25 FPS)
✓ Video saved successfully!
  File size: 246.14 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 31.64it/s]


Predicted: tensor([3], device='cuda:0')
File: cropped_video_20251205_124007.mp4                  -> Predicted Class: LateralRaise

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (273, 86, 383, 439)
Cropping 64 frames to region: (258, 71) -> (398, 454)
Saving video: cropped_video_20251205_124010.mp4 (140x383 @ 25 FPS)
✓ Video saved successfully!
  File size: 311.44 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 31.09it/s]


Predicted: tensor([2], device='cuda:0')
File: cropped_video_20251205_124010.mp4                  -> Predicted Class: HammerCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (271, 212, 392, 478)
Cropping 64 frames to region: (256, 197) -> (407, 480)
Saving video: cropped_video_20251205_124013.mp4 (151x283 @ 25 FPS)
✓ Video saved successfully!
  File size: 255.98 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 33.56it/s]

Predicted: tensor([9], device='cuda:0')
File: cropped_video_20251205_124013.mp4                  -> Predicted Class: chest_rows






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (275, 116, 403, 477)
Cropping 64 frames to region: (260, 101) -> (418, 480)
Saving video: cropped_video_20251205_124016.mp4 (158x379 @ 25 FPS)
✓ Video saved successfully!
  File size: 303.26 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 29.42it/s]

Predicted: tensor([20], device='cuda:0')
File: cropped_video_20251205_124016.mp4                  -> Predicted Class: kb_ohpress






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (270, 160, 395, 480)
Cropping 64 frames to region: (255, 145) -> (410, 480)
Saving video: cropped_video_20251205_124019.mp4 (155x335 @ 25 FPS)
✓ Video saved successfully!
  File size: 255.14 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 29.72it/s]

Predicted: tensor([10], device='cuda:0')
File: cropped_video_20251205_124019.mp4                  -> Predicted Class: concentration_curls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✓ Person detected at: (553, 146, 605, 231)
Cropping 64 frames to region: (538, 131) -> (620, 246)
Saving video: cropped_video_20251205_124022.mp4 (82x115 @ 25 FPS)
✓ Video saved successfully!
  File size: 36.81 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 37.42it/s]

Predicted: tensor([17], device='cuda:0')
File: cropped_video_20251205_124022.mp4                  -> Predicted Class: dumbbell_reverse_flys

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
✗ No person detected in first frame
✗ No person detected in this batch. Skipping and resetting.

Batch complete! Memory cleared. Waiting for next batch...

All cameras stoppedcted
Cleanup complete.


KeyboardInterrupt: 

In [1]:
import time
import cv2
import os
from camera import CameraManager, create_camera_configs_from_ips
import torch
import torch.nn as nn
from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights
import mlflow
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop
from decord import VideoReader, cpu
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np
import gc

# Clear GPU memory
torch.cuda.empty_cache()

# ============================
# MLflow Setup
# ============================
mlflow.set_tracking_uri("file:///home/smartan5070/Downloads/SlowfastTrainer-main/mlruns")


def clamp_bbox(bbox, h, w):
    """Clamp bounding box coordinates to ensure they are within image boundaries."""
    x1, y1, x2, y2 = bbox
    x1 = max(0, min(w - 1, int(x1)))
    y1 = max(0, min(h - 1, int(y1)))
    x2 = max(0, min(w, int(x2)))
    y2 = max(0, min(h, int(y2)))
    
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


class NormalizeVideo(nn.Module):
    def __init__(self, mean, std):
        super().__init__()
        self.mean = torch.tensor(mean).view(3, 1, 1, 1)
        self.std = torch.tensor(std).view(3, 1, 1, 1)
    
    def forward(self, tensor):
        return (tensor - self.mean) / self.std


def load_model_from_mlflow(run_id, device):
    """Try to load model from MLflow"""
    try:
        artifact_names = ["model", "pytorch_model", "mvit_model", "best_model"]
        
        for artifact_name in artifact_names:
            try:
                model_uri = f"runs:/{run_id}/{artifact_name}"
                print(f"Trying to load from: {model_uri}")
                model = mlflow.pytorch.load_model(model_uri)
                model.to(device)
                model.eval()
                print(f"✓ Model loaded successfully from MLflow artifact '{artifact_name}'!")
                return model
            except Exception as e:
                print(f"  Failed with artifact name '{artifact_name}': {str(e)[:100]}")
                continue
        
        return None
        
    except Exception as e:
        print(f"MLflow loading failed: {e}")
        return None


def load_model_from_local(model_path, num_classes, K, device):
    """Load model from local .pt file"""
    try:
        print(f"Loading model from local file: {model_path}")
        
        weights = MViT_V1_B_Weights.DEFAULT
        model = mvit_v1_b(weights=weights)
        
        for param in model.parameters():
            param.requires_grad = False
        
        last_fc_layer = model.head[-1]
        in_features = last_fc_layer.in_features
        model.head[-1] = nn.Linear(in_features, num_classes)
        
        blocks = list(model.blocks)
        for block in blocks[-K:]:
            for p in block.parameters():
                p.requires_grad = True
        
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        
        model.to(device)
        model.eval()
        print("✓ Model loaded successfully from local file!")
        return model
        
    except Exception as e:
        print(f"Failed to load from local file: {e}")
        return None


def run_mvit_inference(video_path, model, device):
    """Performs inference on the given video using a pre-loaded model."""
    transform = Compose([
        Resize((256, 256)),
        CenterCrop(224),
        NormalizeVideo([0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    ])
    
    frames_per_clip = 16
    test_dataset = FlatVideoDataset(video_path, transform=transform, frames_per_clip=frames_per_clip)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0, 
                            pin_memory=(device.type == 'cuda'))
    
    video_predictions = run_inference(test_loader, model, device)
    
    train_root = "/home/smartan5070/Downloads/SlowfastTrainer-main/Dataset_30Classes_Cam107-18_SPLIT/train"
    class_names = sorted(os.listdir(train_root))
    idx_to_class_name = {i: name for i, name in enumerate(class_names)}
    
    for path, prediction_idx in video_predictions.items():
        predicted_class = idx_to_class_name.get(prediction_idx, f"UNKNOWN_INDEX_{prediction_idx}")
        print(f"File: {os.path.basename(path):<50} -> Predicted Class: {predicted_class}")
    
    # Clear memory after inference
    del test_dataset, test_loader, video_predictions
    torch.cuda.empty_cache()


class FlatVideoDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None, frames_per_clip=None):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.video_paths = []
        self._build_index()

    def _build_index(self):
        if os.path.isfile(self.root_dir) and self.root_dir.lower().endswith(".mp4"):
            self.video_paths.append(self.root_dir)
        elif os.path.isdir(self.root_dir):
            for fname in os.listdir(self.root_dir):
                if fname.lower().endswith(".mp4"):
                    self.video_paths.append(os.path.join(self.root_dir, fname))
        else:
            raise ValueError(f"{self.root_dir} is neither a .mp4 file nor a directory")

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = -1
        
        try:
            vr = VideoReader(path, ctx=cpu(0))
            total_frames = len(vr)

            if total_frames < self.frames_per_clip:
                base = np.linspace(0, total_frames - 1, total_frames).astype(int)
                pad = self.frames_per_clip - total_frames
                frame_indices = np.concatenate([base, np.full((pad,), base[-1], dtype=int)])
            else:
                frame_indices = np.linspace(0, total_frames - 1, self.frames_per_clip).astype(int)

            frames = vr.get_batch(frame_indices).asnumpy()

            if frames.shape[-1] == 1:
                frames = np.repeat(frames, 3, axis=-1)
            elif frames.shape[-1] != 3:
                raise ValueError(f"Unsupported channel count: {frames.shape[-1]} in video {path}")

            frames = torch.from_numpy(frames).permute(3, 0, 1, 2).float() / 255.0

            if self.transform:
                frames = self.transform(frames)

            return frames, label, path

        except Exception as e:
            print(f"Failed to load video: {path}\nError: {e}")
            return self.__getitem__((idx + 1) % len(self))


def run_inference(test_loader, model, device):
    all_predictions = []
    all_video_paths = []
    
    model.eval()
    with torch.no_grad():
        for inputs, _, paths in tqdm(test_loader, desc="Inference"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            print(f"Predicted: {predicted}")
            all_predictions.extend(predicted.cpu().numpy().tolist())
            all_video_paths.extend(paths)
            
            # Clear batch from GPU
            del inputs, outputs
    
    return dict(zip(all_video_paths, all_predictions))


def clear_memory():
    """Explicitly clear memory"""
    gc.collect()
    torch.cuda.empty_cache()


# Camera IPs and configurations
camera_ips = ["192.168.0.101"]
camera_configs = create_camera_configs_from_ips(camera_ips)

# Initialize camera manager
manager = CameraManager(display_width=640, display_height=480)

# Add cameras
for i, config in enumerate(camera_configs):
    camera_id = f"cam_{i+1}"
    manager.add_camera(camera_id, config)

print(f"Created {manager.get_camera_count()} cameras")
for camera_id in manager.get_camera_ids():
    camera = manager.get_camera(camera_id)
    print(f"  {camera_id}: {camera.get_name()} - {camera.get_config()['url']}")

# Define output folder path
output_folder = '/home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames'
os.makedirs(output_folder, exist_ok=True)

# Start cameras
manager.start_all_cameras()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# LOAD MODEL WITH FALLBACK
# ============================
print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)

run_id = "840f3d39813c41ba9880859c83a82b01"
local_model_path = "/home/smartan5070/Downloads/SlowfastTrainer-main/Models/Testing_2Classes_Cam10718/Testing_21_acc_98_MViT.pt"
num_classes = 21
K = 3

print("\n1. Trying to load from MLflow...")
mvit_model = load_model_from_mlflow(run_id, device)

if mvit_model is None:
    print("\n2. MLflow failed, trying local .pt file...")
    mvit_model = load_model_from_local(local_model_path, num_classes, K, device)

if mvit_model is None:
    print("\n✗ FAILED TO LOAD MODEL!")
    print("Please check:")
    print("1. MLflow run ID is correct")
    print("2. Local .pt file exists at:", local_model_path)
    print("3. num_classes and K parameters match your training setup")
    exit(1)

print("\n✓ Model loaded and ready!")
print("="*60 + "\n")

try:
    frame_batch = []
    BATCH_SIZE = 64
    model_yolo = YOLO("yolov8n.pt").to(device)
    
    # Frame rate control
    TARGET_FPS = 25
    frame_interval = 1.0 / TARGET_FPS
    last_frame_hash = None
    
    print(f"Starting frame collection (Target: {BATCH_SIZE} frames at {TARGET_FPS} FPS)")
    print(f"Estimated collection time: {BATCH_SIZE/TARGET_FPS:.1f} seconds")
    print("-" * 60)

    while True:
        # Get current frame from camera
        frames = manager.get_frames()
        
        frame_added = False
        for cam_id, frame in frames.items():
            if frame is not None:
                # Create a simple hash to check if frame is different
                current_hash = hash(frame.tobytes())
                
                # Only add if it's a different frame
                if current_hash != last_frame_hash:
                    frame_batch.append(frame.copy())
                    last_frame_hash = current_hash
                    frame_added = True
                    print(f"✓ Frame {len(frame_batch)}/{BATCH_SIZE} collected", end='\r')
                
                break  # Only one camera
        
        # Control frame rate - wait for next frame
        if frame_added:
            time.sleep(frame_interval)
        else:
            time.sleep(0.001)

        # Process batch when we have enough frames
        if len(frame_batch) >= BATCH_SIZE:
            print(f"\n\n{'='*60}")
            print(f"Processing batch of {len(frame_batch)} frames...")
            print('='*60)

            # Variables to track for cleanup
            cropped_batch = None
            writer = None
            cap = None
            all_boxes = None
            
            try:
                # STEP 1 — DETECT PERSON IN ALL FRAMES
                all_boxes = []
                print("Detecting person in all frames...")

                for idx, frame in enumerate(frame_batch):
                    results = model_yolo(frame, conf=0.5, verbose=False)
                    r = results[0]
                    best_box = None
                    max_area = 0
                    h, w = frame.shape[:2]

                    for box in r.boxes:
                        if int(box.cls.item()) == 0:  # person class
                            xyxy = box.xyxy[0].cpu().numpy()
                            x1, y1, x2, y2 = xyxy
                            clamped = clamp_bbox((x1, y1, x2, y2), h, w)
                            if not clamped:
                                continue 
                            x1, y1, x2, y2 = clamped

                            area = (x2 - x1) * (y2 - y1)

                            if area > max_area:
                                max_area = area
                                best_box = (int(x1), int(y1), int(x2), int(y2))
                    
                    all_boxes.append(best_box)
                    
                    # Clear results for this frame
                    del results, r
                
                print(f"✓ Detection complete: {sum(1 for b in all_boxes if b is not None)}/{len(all_boxes)} frames with person detected")
                torch.cuda.empty_cache()

                # STEP 2 — CROP EACH FRAME WITH ITS OWN BOUNDING BOX
                cropped_batch = []
                PADDING = 15
                
                print("Cropping frames with individual bounding boxes...")
                for frame, box in zip(frame_batch, all_boxes):
                    if box is None:
                        # No detection: reuse last cropped frame if available
                        if len(cropped_batch) > 0:
                            cropped_batch.append(cropped_batch[-1].copy())
                        else:
                            # Skip this frame if no previous crop exists
                            continue
                    else:
                        # Crop with detected box
                        x1, y1, x2, y2 = box
                        H, W = frame.shape[:2]

                        x1 = max(0, x1 - PADDING)
                        y1 = max(0, y1 - PADDING)
                        x2 = min(W, x2 + PADDING)
                        y2 = min(H, y2 + PADDING)
                        
                        crop = frame[y1:y2, x1:x2]
                        cropped_batch.append(crop)

                # STEP 3 — Check if we have any cropped frames
                if len(cropped_batch) == 0:
                    print("✗ No person detected in any frame. Skipping batch.")
                    frame_batch.clear()
                    last_frame_hash = None
                    clear_memory()
                    continue

                print(f"✓ Cropped {len(cropped_batch)} frames")

                # STEP 4 — SAVE VIDEO
                # Find most common crop size to standardize
                crop_sizes = {}
                for crop in cropped_batch:
                    h, w = crop.shape[:2]
                    size = (w, h)
                    crop_sizes[size] = crop_sizes.get(size, 0) + 1
                
                # Use most common size
                target_size = max(crop_sizes.items(), key=lambda x: x[1])[0]
                W, H = target_size
                
                # Resize all crops to target size
                resized_batch = []
                for crop in cropped_batch:
                    if crop.shape[:2] != (H, W):
                        resized = cv2.resize(crop, (W, H))
                        resized_batch.append(resized)
                    else:
                        resized_batch.append(crop)

                timestamp = time.strftime("%Y%m%d_%H%M%S")
                video_filename = f"cropped_video_{timestamp}.mp4"
                video_path = os.path.join(output_folder, video_filename)

                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                
                print(f"Saving video: {video_filename} ({W}x{H} @ {TARGET_FPS} FPS)")
                writer = cv2.VideoWriter(video_path, fourcc, TARGET_FPS, (W, H))

                for f in resized_batch:
                    writer.write(f)

                writer.release()
                writer = None
                print(f"✓ Video saved successfully!")

                # Verify video was saved correctly
                if os.path.exists(video_path):
                    file_size = os.path.getsize(video_path) / 1024
                    print(f"  File size: {file_size:.2f} KB")
                    
                    cap = cv2.VideoCapture(video_path)
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    cap.release()
                    cap = None
                    print(f"  Frames in video: {total_frames}")

                # STEP 5 — RUN ACTION RECOGNITION
                print("\nRunning action recognition...")
                run_mvit_inference(video_path, mvit_model, device)

            except Exception as e:
                print(f"\n✗ Error during processing: {e}")
                import traceback
                traceback.print_exc()
            
            finally:
                # CRITICAL: Always clean up, even if error occurs
                if writer is not None:
                    writer.release()
                if cap is not None:
                    cap.release()
                
                # Clear all batch data
                frame_batch.clear()
                if cropped_batch is not None:
                    cropped_batch.clear()
                    del cropped_batch
                if all_boxes is not None:
                    all_boxes.clear()
                    del all_boxes
                
                last_frame_hash = None
                
                # Force garbage collection and GPU memory clear
                clear_memory()
                
                print(f"\n{'='*60}")
                print("Batch complete! Memory cleared. Waiting for next batch...")
                print(f"{'='*60}\n")

        # Quit with 'q' key
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("\nQuitting...")
            break

finally:
    manager.stop_all_cameras()
    cv2.destroyAllWindows()
    clear_memory()
    print("Cleanup complete.")

Added camera cam_1: Camera 1
Created 1 cameras
  cam_1: Camera 1 - rtsp://admin:admin%40123@192.168.0.101:554/stream1


  return FileStore(store_uri, store_uri)
  from .autonotebook import tqdm as notebook_tqdm


Started camera cam_1
Started 1/1 cameras

LOADING MODEL

1. Trying to load from MLflow...
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'model': Failed to download artifacts from path 'model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/pytorch_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'pytorch_model': Failed to download artifacts from path 'pytorch_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/mvit_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'mvit_model': Failed to download artifacts from path 'mvit_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/best_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 170.76it/s]  


✓ Model loaded successfully from MLflow artifact 'best_model'!

✓ Model loaded and ready!

Starting frame collection (Target: 64 frames at 25 FPS)
Estimated collection time: 2.6 seconds
------------------------------------------------------------
✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 0/64 frames with person detected
Cropping frames with individual bounding boxes...
✗ No person detected in any frame. Skipping batch.

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 0/64 frames with person detected
Cropping frames with individual bounding boxes...
✗ No person detected in any frame. Skipping batch.

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 0/64 frames with person

Inference: 100%|██████████| 1/1 [00:00<00:00, 23.36it/s]

Predicted: tensor([2], device='cuda:0')
File: cropped_video_20251205_161130.mp4                  -> Predicted Class: HammerCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161133.mp4 (159x383 @ 25 FPS)
✓ Video saved successfully!
  File size: 403.17 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 26.41it/s]

Predicted: tensor([4], device='cuda:0')
File: cropped_video_20251205_161133.mp4                  -> Predicted Class: UprightRows






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161136.mp4 (132x328 @ 25 FPS)
✓ Video saved successfully!
  File size: 320.62 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.55it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161136.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161139.mp4 (133x326 @ 25 FPS)
✓ Video saved successfully!
  File size: 307.09 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 29.25it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161139.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161142.mp4 (216x327 @ 25 FPS)
✓ Video saved successfully!
  File size: 401.02 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.46it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161142.mp4                  -> Predicted Class: BicepsCurls






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161145.mp4 (224x328 @ 25 FPS)
✓ Video saved successfully!
  File size: 401.06 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.73it/s]

Predicted: tensor([3], device='cuda:0')
File: cropped_video_20251205_161145.mp4                  -> Predicted Class: LateralRaise






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161148.mp4 (114x330 @ 25 FPS)
✓ Video saved successfully!
  File size: 319.70 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 32.07it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161148.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161151.mp4 (116x321 @ 25 FPS)
✓ Video saved successfully!
  File size: 283.64 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 34.21it/s]

Predicted: tensor([4], device='cuda:0')
File: cropped_video_20251205_161151.mp4                  -> Predicted Class: UprightRows

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161155.mp4 (151x324 @ 25 FPS)
✓ Video saved successfully!
  File size: 315.65 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.08it/s]

Predicted: tensor([4], device='cuda:0')
File: cropped_video_20251205_161155.mp4                  -> Predicted Class: UprightRows

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161158.mp4 (114x328 @ 25 FPS)
✓ Video saved successfully!
  File size: 265.07 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.12it/s]

Predicted: tensor([10], device='cuda:0')
File: cropped_video_20251205_161158.mp4                  -> Predicted Class: concentration_curls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161201.mp4 (103x318 @ 25 FPS)
✓ Video saved successfully!
  File size: 219.36 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 32.48it/s]

Predicted: tensor([10], device='cuda:0')
File: cropped_video_20251205_161201.mp4                  -> Predicted Class: concentration_curls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161204.mp4 (109x322 @ 25 FPS)
✓ Video saved successfully!
  File size: 260.86 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 32.24it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161204.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 63/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161207.mp4 (111x324 @ 25 FPS)
✓ Video saved successfully!
  File size: 181.99 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 31.10it/s]

Predicted: tensor([15], device='cuda:0')
File: cropped_video_20251205_161207.mp4                  -> Predicted Class: dumbbell_incline_chest_press

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 56/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161210.mp4 (107x214 @ 25 FPS)
✓ Video saved successfully!
  File size: 159.51 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 33.49it/s]

Predicted: tensor([15], device='cuda:0')
File: cropped_video_20251205_161210.mp4                  -> Predicted Class: dumbbell_incline_chest_press

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161213.mp4 (112x323 @ 25 FPS)
✓ Video saved successfully!
  File size: 175.42 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 31.74it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161213.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161216.mp4 (114x320 @ 25 FPS)
✓ Video saved successfully!
  File size: 245.33 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 32.17it/s]

Predicted: tensor([2], device='cuda:0')
File: cropped_video_20251205_161216.mp4                  -> Predicted Class: HammerCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161219.mp4 (116x330 @ 25 FPS)
✓ Video saved successfully!
  File size: 308.91 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.34it/s]

Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161219.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161222.mp4 (182x320 @ 25 FPS)
✓ Video saved successfully!
  File size: 350.01 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 29.72it/s]

Predicted: tensor([13], device='cuda:0')
File: cropped_video_20251205_161222.mp4                  -> Predicted Class: db_seated_hammercurl






Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161226.mp4 (182x321 @ 25 FPS)
✓ Video saved successfully!
  File size: 341.67 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 28.75it/s]

Predicted: tensor([16], device='cuda:0')
File: cropped_video_20251205_161226.mp4                  -> Predicted Class: dumbbell_lunges

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161229.mp4 (177x321 @ 25 FPS)
✓ Video saved successfully!
  File size: 315.15 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 27.94it/s]

Predicted: tensor([6], device='cuda:0')
File: cropped_video_20251205_161229.mp4                  -> Predicted Class: bb_military_press

Batch complete! Memory cleared. Waiting for next batch...

✓ Frame 1/64 collected




✓ Frame 64/64 collected

Processing batch of 64 frames...
Detecting person in all frames...
✓ Detection complete: 64/64 frames with person detected
Cropping frames with individual bounding boxes...
✓ Cropped 64 frames
Saving video: cropped_video_20251205_161232.mp4 (182x322 @ 25 FPS)
✓ Video saved successfully!
  File size: 402.45 KB
  Frames in video: 64

Running action recognition...


Inference: 100%|██████████| 1/1 [00:00<00:00, 30.14it/s]


Predicted: tensor([0], device='cuda:0')
File: cropped_video_20251205_161232.mp4                  -> Predicted Class: BicepsCurls

Batch complete! Memory cleared. Waiting for next batch...

All cameras stoppedcted
Cleanup complete.


KeyboardInterrupt: 

In [None]:
import time
import cv2
import os
from camera import CameraManager, create_camera_configs_from_ips
import torch
import torch.nn as nn
from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights
import mlflow
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop
from decord import VideoReader, cpu
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np
import gc

# Clear GPU memory
torch.cuda.empty_cache()

# ============================
# MLflow Setup
# ============================
mlflow.set_tracking_uri("file:///home/smartan5070/Downloads/SlowfastTrainer-main/mlruns")


def clamp_bbox(bbox, h, w):
    """Clamp bounding box coordinates to ensure they are within image boundaries."""
    x1, y1, x2, y2 = bbox
    x1 = max(0, min(w - 1, int(x1)))
    y1 = max(0, min(h - 1, int(y1)))
    x2 = max(0, min(w, int(x2)))
    y2 = max(0, min(h, int(y2)))
    
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


class NormalizeVideo(nn.Module):
    def __init__(self, mean, std):
        super().__init__()
        self.mean = torch.tensor(mean).view(3, 1, 1, 1)
        self.std = torch.tensor(std).view(3, 1, 1, 1)
    
    def forward(self, tensor):
        return (tensor - self.mean) / self.std


def load_model_from_mlflow(run_id, device):
    """Try to load model from MLflow"""
    try:
        artifact_names = ["model", "pytorch_model", "mvit_model", "best_model"]
        
        for artifact_name in artifact_names:
            try:
                model_uri = f"runs:/{run_id}/{artifact_name}"
                print(f"Trying to load from: {model_uri}")
                model = mlflow.pytorch.load_model(model_uri)
                model.to(device)
                model.eval()
                print(f"✓ Model loaded successfully from MLflow artifact '{artifact_name}'!")
                return model
            except Exception as e:
                print(f"  Failed with artifact name '{artifact_name}': {str(e)[:100]}")
                continue
        
        return None
        
    except Exception as e:
        print(f"MLflow loading failed: {e}")
        return None


def load_model_from_local(model_path, num_classes, K, device):
    """Load model from local .pt file"""
    try:
        print(f"Loading model from local file: {model_path}")
        
        weights = MViT_V1_B_Weights.DEFAULT
        model = mvit_v1_b(weights=weights)
        
        for param in model.parameters():
            param.requires_grad = False
        
        last_fc_layer = model.head[-1]
        in_features = last_fc_layer.in_features
        model.head[-1] = nn.Linear(in_features, num_classes)
        
        blocks = list(model.blocks)
        for block in blocks[-K:]:
            for p in block.parameters():
                p.requires_grad = True
        
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        
        model.to(device)
        model.eval()
        print("✓ Model loaded successfully from local file!")
        return model
        
    except Exception as e:
        print(f"Failed to load from local file: {e}")
        return None


def run_mvit_inference(video_path, model, device):
    """Performs inference on the given video using a pre-loaded model."""
    transform = Compose([
        Resize((256, 256)),
        CenterCrop(224),
        NormalizeVideo([0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    ])
    
    frames_per_clip = 16
    test_dataset = FlatVideoDataset(video_path, transform=transform, frames_per_clip=frames_per_clip)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0, 
                            pin_memory=(device.type == 'cuda'))
    
    video_predictions = run_inference(test_loader, model, device)
    
    train_root = "/home/smartan5070/Downloads/SlowfastTrainer-main/Dataset_30Classes_Cam107-18_SPLIT/train"
    class_names = sorted(os.listdir(train_root))
    idx_to_class_name = {i: name for i, name in enumerate(class_names)}
    
    for path, prediction_idx in video_predictions.items():
        predicted_class = idx_to_class_name.get(prediction_idx, f"UNKNOWN_INDEX_{prediction_idx}")
        print(f"File: {os.path.basename(path):<50} -> Predicted Class: {predicted_class}")
    
    # Clear memory after inference
    del test_dataset, test_loader, video_predictions
    torch.cuda.empty_cache()


class FlatVideoDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None, frames_per_clip=None):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.video_paths = []
        self._build_index()

    def _build_index(self):
        if os.path.isfile(self.root_dir) and self.root_dir.lower().endswith(".mp4"):
            self.video_paths.append(self.root_dir)
        elif os.path.isdir(self.root_dir):
            for fname in os.listdir(self.root_dir):
                if fname.lower().endswith(".mp4"):
                    self.video_paths.append(os.path.join(self.root_dir, fname))
        else:
            raise ValueError(f"{self.root_dir} is neither a .mp4 file nor a directory")

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = -1
        
        try:
            vr = VideoReader(path, ctx=cpu(0))
            total_frames = len(vr)

            if total_frames < self.frames_per_clip:
                base = np.linspace(0, total_frames - 1, total_frames).astype(int)
                pad = self.frames_per_clip - total_frames
                frame_indices = np.concatenate([base, np.full((pad,), base[-1], dtype=int)])
            else:
                frame_indices = np.linspace(0, total_frames - 1, self.frames_per_clip).astype(int)

            frames = vr.get_batch(frame_indices).asnumpy()

            if frames.shape[-1] == 1:
                frames = np.repeat(frames, 3, axis=-1)
            elif frames.shape[-1] != 3:
                raise ValueError(f"Unsupported channel count: {frames.shape[-1]} in video {path}")

            frames = torch.from_numpy(frames).permute(3, 0, 1, 2).float() / 255.0

            if self.transform:
                frames = self.transform(frames)

            return frames, label, path

        except Exception as e:
            print(f"Failed to load video: {path}\nError: {e}")
            return self.__getitem__((idx + 1) % len(self))


def run_inference(test_loader, model, device):
    all_predictions = []
    all_video_paths = []
    
    model.eval()
    with torch.no_grad():
        for inputs, _, paths in tqdm(test_loader, desc="Inference"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            print(f"Predicted: {predicted}")
            all_predictions.extend(predicted.cpu().numpy().tolist())
            all_video_paths.extend(paths)
            
            # Clear batch from GPU
            del inputs, outputs
    
    return dict(zip(all_video_paths, all_predictions))


def clear_memory():
    """Explicitly clear memory"""
    gc.collect()
    torch.cuda.empty_cache()


# Camera IPs and configurations
camera_ips = ["192.168.0.101"]
camera_configs = create_camera_configs_from_ips(camera_ips)

# Initialize camera manager
manager = CameraManager(display_width=640, display_height=480)

# Add cameras
for i, config in enumerate(camera_configs):
    camera_id = f"cam_{i+1}"
    manager.add_camera(camera_id, config)

print(f"Created {manager.get_camera_count()} cameras")
for camera_id in manager.get_camera_ids():
    camera = manager.get_camera(camera_id)
    print(f"  {camera_id}: {camera.get_name()} - {camera.get_config()['url']}")

# Define output folder path
output_folder = '/home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames'
os.makedirs(output_folder, exist_ok=True)

# Start cameras
manager.start_all_cameras()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# LOAD MODEL WITH FALLBACK
# ============================
print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)

run_id = "840f3d39813c41ba9880859c83a82b01"
local_model_path = "/home/smartan5070/Downloads/SlowfastTrainer-main/Models/Testing_2Classes_Cam10718/Testing_21_acc_98_MViT.pt"
num_classes = 21
K = 3

print("\n1. Trying to load from MLflow...")
mvit_model = load_model_from_mlflow(run_id, device)

if mvit_model is None:
    print("\n2. MLflow failed, trying local .pt file...")
    mvit_model = load_model_from_local(local_model_path, num_classes, K, device)

if mvit_model is None:
    print("\n✗ FAILED TO LOAD MODEL!")
    print("Please check:")
    print("1. MLflow run ID is correct")
    print("2. Local .pt file exists at:", local_model_path)
    print("3. num_classes and K parameters match your training setup")
    exit(1)

print("\n✓ Model loaded and ready!")
print("="*60 + "\n")

try:
    frame_batch = []
    BATCH_SIZE = 64
    model_yolo = YOLO("yolov8n.pt").to(device)
    
    # Frame rate control
    TARGET_FPS = 25
    frame_interval = 1.0 / TARGET_FPS
    last_frame_hash = None
    
    print(f"Starting frame collection (Target: {BATCH_SIZE} frames at {TARGET_FPS} FPS)")
    print(f"Estimated collection time: {BATCH_SIZE/TARGET_FPS:.1f} seconds")
    print("-" * 60)

    while True:
        # Get current frame from camera
        frames = manager.get_frames()
        
        frame_added = False
        for cam_id, frame in frames.items():
            if frame is not None:
                # Create a simple hash to check if frame is different
                current_hash = hash(frame.tobytes())
                
                # Only add if it's a different frame
                if current_hash != last_frame_hash:
                    frame_batch.append(frame.copy())
                    last_frame_hash = current_hash
                    frame_added = True
                    print(f"✓ Frame {len(frame_batch)}/{BATCH_SIZE} collected", end='\r')
                
                break  # Only one camera
        
        # Control frame rate - wait for next frame
        if frame_added:
            time.sleep(frame_interval)
        else:
            time.sleep(0.001)

        # Process batch when we have enough frames
        if len(frame_batch) >= BATCH_SIZE:
            print(f"\n\n{'='*60}")
            print(f"Processing batch of {len(frame_batch)} frames...")
            print('='*60)

            # Variables to track for cleanup
            cropped_batch = None
            writer = None
            cap = None
            all_boxes = None
            
            try:
                # STEP 1 — DETECT PERSON IN ALL FRAMES
                all_boxes = []
                print("Detecting person in all frames...")

                for idx, frame in enumerate(frame_batch):
                    results = model_yolo(frame, conf=0.5, verbose=False)
                    r = results[0]
                    best_box = None
                    max_area = 0
                    h, w = frame.shape[:2]

                    for box in r.boxes:
                        if int(box.cls.item()) == 0:  # person class
                            xyxy = box.xyxy[0].cpu().numpy()
                            x1, y1, x2, y2 = xyxy
                            clamped = clamp_bbox((x1, y1, x2, y2), h, w)
                            if not clamped:
                                continue 
                            x1, y1, x2, y2 = clamped

                            area = (x2 - x1) * (y2 - y1)

                            if area > max_area:
                                max_area = area
                                best_box = (int(x1), int(y1), int(x2), int(y2))
                    
                    all_boxes.append(best_box)
                    
                    # Clear results for this frame
                    del results, r
                
                print(f"✓ Detection complete: {sum(1 for b in all_boxes if b is not None)}/{len(all_boxes)} frames with person detected")
                torch.cuda.empty_cache()

                # STEP 2 — CROP EACH FRAME WITH ITS OWN BOUNDING BOX
                cropped_batch = []
                PADDING = 15
                
                print("Cropping frames with individual bounding boxes...")
                for frame, box in zip(frame_batch, all_boxes):
                    if box is None:
                        # No detection: reuse last cropped frame if available
                        if len(cropped_batch) > 0:
                            cropped_batch.append(cropped_batch[-1].copy())
                        else:
                            # Skip this frame if no previous crop exists
                            continue
                    else:
                        # Crop with detected box
                        x1, y1, x2, y2 = box
                        H, W = frame.shape[:2]

                        x1 = max(0, x1 - PADDING)
                        y1 = max(0, y1 - PADDING)
                        x2 = min(W, x2 + PADDING)
                        y2 = min(H, y2 + PADDING)
                        
                        crop = frame[y1:y2, x1:x2]
                        cropped_batch.append(crop)

                # STEP 3 — Check if we have any cropped frames
                if len(cropped_batch) == 0:
                    print("✗ No person detected in any frame. Skipping batch.")
                    frame_batch.clear()
                    last_frame_hash = None
                    clear_memory()
                    continue

                print(f"✓ Cropped {len(cropped_batch)} frames")

                # STEP 4 — SAVE VIDEO
                # Find most common crop size to standardize
                crop_sizes = {}
                for crop in cropped_batch:
                    h, w = crop.shape[:2]
                    size = (w, h)
                    crop_sizes[size] = crop_sizes.get(size, 0) + 1
                
                # Use most common size
                target_size = max(crop_sizes.items(), key=lambda x: x[1])[0]
                W, H = target_size
                
                # Resize all crops to target size
                resized_batch = []
                for crop in cropped_batch:
                    if crop.shape[:2] != (H, W):
                        resized = cv2.resize(crop, (W, H))
                        resized_batch.append(resized)
                    else:
                        resized_batch.append(crop)

                timestamp = time.strftime("%Y%m%d_%H%M%S")
                video_filename = f"cropped_video_{timestamp}.mp4"
                video_path = os.path.join(output_folder, video_filename)

                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                
                print(f"Saving video: {video_filename} ({W}x{H} @ {TARGET_FPS} FPS)")
                writer = cv2.VideoWriter(video_path, fourcc, TARGET_FPS, (W, H))

                for f in resized_batch:
                    writer.write(f)

                writer.release()
                writer = None
                print(f"✓ Video saved successfully!")

                # Verify video was saved correctly
                if os.path.exists(video_path):
                    file_size = os.path.getsize(video_path) / 1024
                    print(f"  File size: {file_size:.2f} KB")
                    
                    cap = cv2.VideoCapture(video_path)
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    cap.release()
                    cap = None
                    print(f"  Frames in video: {total_frames}")

                # STEP 5 — RUN ACTION RECOGNITION
                print("\nRunning action recognition...")
                run_mvit_inference(video_path, mvit_model, device)

            except Exception as e:
                print(f"\n✗ Error during processing: {e}")
                import traceback
                traceback.print_exc()
            
            finally:
                # CRITICAL: Always clean up, even if error occurs
                if writer is not None:
                    writer.release()
                if cap is not None:
                    cap.release()
                
                # Clear all batch data
                frame_batch.clear()
                if cropped_batch is not None:
                    cropped_batch.clear()
                    del cropped_batch
                if all_boxes is not None:
                    all_boxes.clear()
                    del all_boxes
                
                last_frame_hash = None
                
                # Force garbage collection and GPU memory clear
                clear_memory()
                
                print(f"\n{'='*60}")
                print("Batch complete! Memory cleared. Waiting for next batch...")
                print(f"{'='*60}\n")

        # Quit with 'q' key
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("\nQuitting...")
            break

finally:
    manager.stop_all_cameras()
    cv2.destroyAllWindows()
    clear_memory()
    print("Cleanup complete.")

In [1]:
import time
import cv2
import os
from camera import CameraManager, create_camera_configs_from_ips
import torch
import torch.nn as nn
from torchvision.models.video import mvit_v1_b, MViT_V1_B_Weights
import mlflow
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop
from decord import VideoReader, cpu
from tqdm import tqdm
from ultralytics import YOLO
import numpy as np
import gc

# Clear GPU memory
torch.cuda.empty_cache()

# ============================
# MLflow Setup
# ============================
mlflow.set_tracking_uri("file:///home/smartan5070/Downloads/SlowfastTrainer-main/mlruns")


def clamp_bbox(bbox, h, w):
    """Clamp bounding box coordinates to ensure they are within image boundaries."""
    x1, y1, x2, y2 = bbox
    x1 = max(0, min(w - 1, int(x1)))
    y1 = max(0, min(h - 1, int(y1)))
    x2 = max(0, min(w, int(x2)))
    y2 = max(0, min(h, int(y2)))
    
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


class NormalizeVideo(nn.Module):
    def __init__(self, mean, std):
        super().__init__()
        self.mean = torch.tensor(mean).view(3, 1, 1, 1)
        self.std = torch.tensor(std).view(3, 1, 1, 1)
    
    def forward(self, tensor):
        return (tensor - self.mean) / self.std


def load_model_from_mlflow(run_id, device):
    """Try to load model from MLflow"""
    try:
        artifact_names = ["model", "pytorch_model", "mvit_model", "best_model"]
        
        for artifact_name in artifact_names:
            try:
                model_uri = f"runs:/{run_id}/{artifact_name}"
                print(f"Trying to load from: {model_uri}")
                model = mlflow.pytorch.load_model(model_uri)
                model.to(device)
                model.eval()
                print(f"✓ Model loaded successfully from MLflow artifact '{artifact_name}'!")
                return model
            except Exception as e:
                print(f"  Failed with artifact name '{artifact_name}': {str(e)[:100]}")
                continue
        
        return None
        
    except Exception as e:
        print(f"MLflow loading failed: {e}")
        return None


def load_model_from_local(model_path, num_classes, K, device):
    """Load model from local .pt file"""
    try:
        print(f"Loading model from local file: {model_path}")
        
        weights = MViT_V1_B_Weights.DEFAULT
        model = mvit_v1_b(weights=weights)
        
        for param in model.parameters():
            param.requires_grad = False
        
        last_fc_layer = model.head[-1]
        in_features = last_fc_layer.in_features
        model.head[-1] = nn.Linear(in_features, num_classes)
        
        blocks = list(model.blocks)
        for block in blocks[-K:]:
            for p in block.parameters():
                p.requires_grad = True
        
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        
        model.to(device)
        model.eval()
        print("✓ Model loaded successfully from local file!")
        return model
        
    except Exception as e:
        print(f"Failed to load from local file: {e}")
        return None


def run_mvit_inference(video_path, model, device):
    """Performs inference on the given video using a pre-loaded model."""
    transform = Compose([
        Resize((256, 256)),
        CenterCrop(224),
        NormalizeVideo([0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
    ])
    
    frames_per_clip = 16
    test_dataset = FlatVideoDataset(video_path, transform=transform, frames_per_clip=frames_per_clip)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0, 
                            pin_memory=(device.type == 'cuda'))
    
    video_predictions = run_inference(test_loader, model, device)
    
    train_root = "/home/smartan5070/Downloads/SlowfastTrainer-main/Dataset_30Classes_Cam107-18_SPLIT/train"
    class_names = sorted(os.listdir(train_root))
    idx_to_class_name = {i: name for i, name in enumerate(class_names)}
    
    predicted_class = None
    for path, prediction_idx in video_predictions.items():
        predicted_class = idx_to_class_name.get(prediction_idx, f"UNKNOWN_INDEX_{prediction_idx}")
        print(f"File: {os.path.basename(path):<50} -> Predicted Class: {predicted_class}")
    
    # Clear memory after inference
    del test_dataset, test_loader, video_predictions
    torch.cuda.empty_cache()
    
    return predicted_class


class FlatVideoDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None, frames_per_clip=None):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.video_paths = []
        self._build_index()

    def _build_index(self):
        if os.path.isfile(self.root_dir) and self.root_dir.lower().endswith(".mp4"):
            self.video_paths.append(self.root_dir)
        elif os.path.isdir(self.root_dir):
            for fname in os.listdir(self.root_dir):
                if fname.lower().endswith(".mp4"):
                    self.video_paths.append(os.path.join(self.root_dir, fname))
        else:
            raise ValueError(f"{self.root_dir} is neither a .mp4 file nor a directory")

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = -1
        
        try:
            vr = VideoReader(path, ctx=cpu(0))
            total_frames = len(vr)

            if total_frames < self.frames_per_clip:
                base = np.linspace(0, total_frames - 1, total_frames).astype(int)
                pad = self.frames_per_clip - total_frames
                frame_indices = np.concatenate([base, np.full((pad,), base[-1], dtype=int)])
            else:
                frame_indices = np.linspace(0, total_frames - 1, self.frames_per_clip).astype(int)

            frames = vr.get_batch(frame_indices).asnumpy()

            if frames.shape[-1] == 1:
                frames = np.repeat(frames, 3, axis=-1)
            elif frames.shape[-1] != 3:
                raise ValueError(f"Unsupported channel count: {frames.shape[-1]} in video {path}")

            frames = torch.from_numpy(frames).permute(3, 0, 1, 2).float() / 255.0

            if self.transform:
                frames = self.transform(frames)

            return frames, label, path

        except Exception as e:
            print(f"Failed to load video: {path}\nError: {e}")
            return self.__getitem__((idx + 1) % len(self))


def run_inference(test_loader, model, device):
    all_predictions = []
    all_video_paths = []
    
    model.eval()
    with torch.no_grad():
        for inputs, _, paths in tqdm(test_loader, desc="Inference"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            print(f"Predicted: {predicted}")
            all_predictions.extend(predicted.cpu().numpy().tolist())
            all_video_paths.extend(paths)
            
            # Clear batch from GPU
            del inputs, outputs
    
    return dict(zip(all_video_paths, all_predictions))


def clear_memory():
    """Explicitly clear memory"""
    gc.collect()
    torch.cuda.empty_cache()


# Camera IPs and configurations
camera_ips = ["192.168.0.101"]
camera_configs = create_camera_configs_from_ips(camera_ips)

# Initialize camera manager
manager = CameraManager(display_width=640, display_height=480)

# Add cameras
for i, config in enumerate(camera_configs):
    camera_id = f"cam_{i+1}"
    manager.add_camera(camera_id, config)

print(f"Created {manager.get_camera_count()} cameras")
for camera_id in manager.get_camera_ids():
    camera = manager.get_camera(camera_id)
    print(f"  {camera_id}: {camera.get_name()} - {camera.get_config()['url']}")

# Define output folder path
output_folder = '/home/smartan5070/Downloads/SlowfastTrainer-main/unseen_test/cropped_frames'
os.makedirs(output_folder, exist_ok=True)

# Start cameras
manager.start_all_cameras()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# LOAD MODEL WITH FALLBACK
# ============================
print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)

run_id = "840f3d39813c41ba9880859c83a82b01"
local_model_path = "/home/smartan5070/Downloads/SlowfastTrainer-main/Models/Testing_2Classes_Cam10718/Testing_21_acc_98_MViT.pt"
num_classes = 21
K = 3

print("\n1. Trying to load from MLflow...")
mvit_model = load_model_from_mlflow(run_id, device)

if mvit_model is None:
    print("\n2. MLflow failed, trying local .pt file...")
    mvit_model = load_model_from_local(local_model_path, num_classes, K, device)

if mvit_model is None:
    print("\n✗ FAILED TO LOAD MODEL!")
    print("Please check:")
    print("1. MLflow run ID is correct")
    print("2. Local .pt file exists at:", local_model_path)
    print("3. num_classes and K parameters match your training setup")
    exit(1)

print("\n✓ Model loaded and ready!")
print("="*60 + "\n")

# Create display window
cv2.namedWindow('Action Recognition', cv2.WINDOW_NORMAL)
cv2.resizeWindow('Action Recognition', 800, 600)

try:
    frame_batch = []
    BATCH_SIZE = 64
    model_yolo = YOLO("yolov8n.pt").to(device)
    
    # Frame rate control
    TARGET_FPS = 25
    frame_interval = 1.0 / TARGET_FPS
    last_frame_hash = None
    
    # Variables to store detection and prediction results
    current_avg_bbox = None
    current_prediction = "Collecting frames..."
    
    print(f"Starting frame collection (Target: {BATCH_SIZE} frames at {TARGET_FPS} FPS)")
    print(f"Estimated collection time: {BATCH_SIZE/TARGET_FPS:.1f} seconds")
    print("Press 'q' to quit")
    print("-" * 60)

    while True:
        # Get current frame from camera
        frames = manager.get_frames()
        
        frame_added = False
        display_frame = None
        
        for cam_id, frame in frames.items():
            if frame is not None:
                # Store for display
                display_frame = frame.copy()
                
                # Create a simple hash to check if frame is different
                current_hash = hash(frame.tobytes())
                
                # Only add if it's a different frame
                if current_hash != last_frame_hash:
                    frame_batch.append(frame.copy())
                    last_frame_hash = current_hash
                    frame_added = True
                    print(f"✓ Frame {len(frame_batch)}/{BATCH_SIZE} collected", end='\r')
                
                break  # Only one camera
        
        # Draw on display frame
        if display_frame is not None:
            # Draw bounding box if available
            if current_avg_bbox is not None:
                x1, y1, x2, y2 = current_avg_bbox
                cv2.rectangle(display_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                
                # Draw prediction text above bounding box
                text_y = max(y1 - 10, 30)
                cv2.putText(display_frame, current_prediction, (x1, text_y), 
                           cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
            
            # Draw collection progress
            progress_text = f"Frames: {len(frame_batch)}/{BATCH_SIZE}"
            cv2.putText(display_frame, progress_text, (10, 30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
            
            # Show the frame
            cv2.imshow('Action Recognition', display_frame)
        
        # Control frame rate - wait for next frame
        if frame_added:
            time.sleep(frame_interval)
        else:
            time.sleep(0.001)

        # Process batch when we have enough frames
        if len(frame_batch) >= BATCH_SIZE:
            print(f"\n\n{'='*60}")
            print(f"Processing batch of {len(frame_batch)} frames...")
            print('='*60)

            # Variables to track for cleanup
            cropped_batch = None
            writer = None
            cap = None
            all_boxes = None
            
            try:
                # STEP 1 — DETECT PERSON IN ALL FRAMES
                all_boxes = []
                print("Detecting person in all frames...")

                for idx, frame in enumerate(frame_batch):
                    results = model_yolo(frame, conf=0.5, verbose=False)
                    r = results[0]
                    best_box = None
                    max_area = 0
                    h, w = frame.shape[:2]

                    for box in r.boxes:
                        if int(box.cls.item()) == 0:  # person class
                            xyxy = box.xyxy[0].cpu().numpy()
                            x1, y1, x2, y2 = xyxy
                            clamped = clamp_bbox((x1, y1, x2, y2), h, w)
                            if not clamped:
                                continue 
                            x1, y1, x2, y2 = clamped

                            area = (x2 - x1) * (y2 - y1)

                            if area > max_area:
                                max_area = area
                                best_box = (int(x1), int(y1), int(x2), int(y2))
                    
                    all_boxes.append(best_box)
                    
                    # Clear results for this frame
                    del results, r
                
                print(f"✓ Detection complete: {sum(1 for b in all_boxes if b is not None)}/{len(all_boxes)} frames with person detected")
                torch.cuda.empty_cache()

                # STEP 2 — COMPUTE AVERAGE BOUNDING BOX
                valid_boxes = [box for box in all_boxes if box is not None]
                
                if len(valid_boxes) == 0:
                    print("✗ No person detected in any frame. Skipping batch.")
                    current_prediction = "No person detected"
                    current_avg_bbox = None
                    frame_batch.clear()
                    last_frame_hash = None
                    clear_memory()
                    continue
                
                # Calculate average bounding box coordinates
                avg_x1 = int(np.mean([box[0] for box in valid_boxes]))
                avg_y1 = int(np.mean([box[1] for box in valid_boxes]))
                avg_x2 = int(np.mean([box[2] for box in valid_boxes]))
                avg_y2 = int(np.mean([box[3] for box in valid_boxes]))
                
                print(f"✓ Average bounding box computed from {len(valid_boxes)} detections")
                print(f"  Average box: ({avg_x1}, {avg_y1}) -> ({avg_x2}, {avg_y2})")
                
                # STEP 3 — APPLY PADDING TO AVERAGE BOX
                PADDING = 15
                H, W = frame_batch[0].shape[:2]
                
                avg_x1 = max(0, avg_x1 - PADDING)
                avg_y1 = max(0, avg_y1 - PADDING)
                avg_x2 = min(W, avg_x2 + PADDING)
                avg_y2 = min(H, avg_y2 + PADDING)
                
                # Store for display
                current_avg_bbox = (avg_x1, avg_y1, avg_x2, avg_y2)
                current_prediction = "Processing..."
                
                print(f"  With padding: ({avg_x1}, {avg_y1}) -> ({avg_x2}, {avg_y2})")
                
                # STEP 4 — CROP ALL FRAMES WITH THE SAME AVERAGE BOX
                cropped_batch = []
                print(f"Cropping all {len(frame_batch)} frames with average bounding box...")
                
                for frame in frame_batch:
                    crop = frame[avg_y1:avg_y2, avg_x1:avg_x2]
                    cropped_batch.append(crop)

                print(f"✓ Cropped {len(cropped_batch)} frames")

                # STEP 5 — SAVE VIDEO
                timestamp = time.strftime("%Y%m%d_%H%M%S")
                video_filename = f"cropped_video_{timestamp}.mp4"
                video_path = os.path.join(output_folder, video_filename)

                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                crop_H, crop_W = cropped_batch[0].shape[:2]
                
                print(f"Saving video: {video_filename} ({crop_W}x{crop_H} @ {TARGET_FPS} FPS)")
                writer = cv2.VideoWriter(video_path, fourcc, TARGET_FPS, (crop_W, crop_H))

                for f in cropped_batch:
                    writer.write(f)

                writer.release()
                writer = None
                print(f"✓ Video saved successfully!")

                # Verify video was saved correctly
                if os.path.exists(video_path):
                    file_size = os.path.getsize(video_path) / 1024
                    print(f"  File size: {file_size:.2f} KB")
                    
                    cap = cv2.VideoCapture(video_path)
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    cap.release()
                    cap = None
                    print(f"  Frames in video: {total_frames}")

                # STEP 6 — RUN ACTION RECOGNITION
                print("\nRunning action recognition...")
                predicted_class = run_mvit_inference(video_path, mvit_model, device)
                
                # Update display with prediction
                if predicted_class:
                    current_prediction = predicted_class
                    print(f"\n*** PREDICTION: {predicted_class} ***\n")

            except Exception as e:
                print(f"\n✗ Error during processing: {e}")
                import traceback
                traceback.print_exc()
                current_prediction = "Error during processing"
            
            finally:
                # CRITICAL: Always clean up, even if error occurs
                if writer is not None:
                    writer.release()
                if cap is not None:
                    cap.release()
                
                # Clear all batch data
                frame_batch.clear()
                if cropped_batch is not None:
                    cropped_batch.clear()
                    del cropped_batch
                if all_boxes is not None:
                    all_boxes.clear()
                    del all_boxes
                
                last_frame_hash = None
                
                # Force garbage collection and GPU memory clear
                clear_memory()
                
                print(f"\n{'='*60}")
                print("Batch complete! Memory cleared. Waiting for next batch...")
                print(f"{'='*60}\n")

        # Check for quit key
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("\nQuitting...")
            break

finally:
    manager.stop_all_cameras()
    cv2.destroyAllWindows()
    clear_memory()
    print("Cleanup complete.")

Added camera cam_1: Camera 1
Created 1 cameras
  cam_1: Camera 1 - rtsp://admin:admin%40123@192.168.0.101:554/stream1


  return FileStore(store_uri, store_uri)
  from .autonotebook import tqdm as notebook_tqdm


Started camera cam_1
Started 1/1 cameras

LOADING MODEL

1. Trying to load from MLflow...
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'model': Failed to download artifacts from path 'model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/pytorch_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'pytorch_model': Failed to download artifacts from path 'pytorch_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/mvit_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


  Failed with artifact name 'mvit_model': Failed to download artifacts from path 'mvit_model', please ensure that the path is correct.
Trying to load from: runs:/840f3d39813c41ba9880859c83a82b01/best_model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 179.04it/s]  


✓ Model loaded successfully from MLflow artifact 'best_model'!

✓ Model loaded and ready!



: 