# Data Preparation: Video Preprocessing Pipeline

This notebook implements the complete preprocessing pipeline that prepares the input data for the task 1 and task 2 models.

## Pipeline Overview

The goal is to process full-length surgical videos and convert them into compact, information-rich numpy files, each representing a single video with:
- A sequence of 300 key frame embeddings
- The corresponding skill label for task1 (GRS 0-3) and task2 (OSATS)

## Step-by-Step

1. **Load videos** from `OSS_dataset/Train/videos`.
2. **Subsample frames** at 3 frames per second to reduce temporal redundancy.
3. **Apply transformations** to each frame:
   - Center crop
   - Resize to 224x224
   - Normalize
4. **Extract CNN features** using a pretrained ResNet50.
5. **Select 300 representative frames** using KMeans clustering on the extracted embeddings.
6. **Export each video’s feature array and labels** into a `.npz` file.

This preprocessing approach aims to reduce video size while maintaining semantic diversity.


### 0. Initialization

In [1]:
#Imports
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

import torch
import torchvision.transforms as transforms
import torchvision.models as models

from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

In [2]:
# GPU: prefer CUDA when PyTorch reports it as available, otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

def to_device(data, device):
    """Move ``data`` to ``device``, descending into lists and tuples.

    Args:
        data: An object exposing ``.to(device, non_blocking=...)``, or a
            list/tuple (possibly nested) of such objects.
        device: Target device passed through to ``.to``.

    Returns:
        The moved object. Every list or tuple in the input comes back as a
        list holding the moved elements.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Wrap an iterable of batches so that each batch is moved to a device when yielded."""

    def __init__(self, dl, device):
        # `dl` is the wrapped iterable of batches; `device` is where batches are sent.
        self.dl, self.device = dl, device

    def __len__(self):
        # Same length as the wrapped iterable.
        return len(self.dl)

    def __iter__(self):
        # Lazily move one batch at a time through the module-level `to_device`.
        yield from (to_device(batch, self.device) for batch in self.dl)

# Resolve the compute device through the helper (this rebinds the `device`
# global used by the later cells) and print which one was picked.
device = get_default_device()   
print(device)

cuda


In [3]:
# Paths and global variables
VIDEO_FOLDER = '../../OSS_dataset/Train/videos/'   # folder scanned for .mp4 files
LABELS_PATH = '../../OSS_dataset/Train/OSATS.csv'  # ';'-separated ratings table

OUTPUT_FOLDER = './data_processed/'  # one .npz file per video is written here

FPS = 3           # target sampling rate when subsampling frames
CROP = 896        # side (pixels) of the centre crop applied to each frame
RESIZE = 224      # side (pixels) of the frame after resizing
BATCH_SIZE = 128  # number of frames per ResNet forward pass

### 1. Collection of Videos and labels

In [4]:
# Label collection: one task-1 value and one 8-item task-2 vector per video.
labels_task_1 = {}  # video_name -> int
labels_task_2 = {}  # video_name -> list[int]

# OSATS item columns, in the order they are stored in each task-2 label vector.
osats_cols = [
    "OSATS_RESPECT", "OSATS_MOTION", "OSATS_INSTRUMENT", "OSATS_SUTURE",
    "OSATS_FLOW", "OSATS_KNOWLEDGE", "OSATS_PERFORMANCE", "OSATS_FINAL_QUALITY"
]

df_labels = pd.read_csv(LABELS_PATH, sep=';').set_index('VIDEO')
grouped = df_labels.groupby(df_labels.index)

for video_name, group in grouped:
    # Rows sharing the same VIDEO value are averaged column by column, and each
    # mean is rounded with the built-in round() before being stored as an int.
    # NOTE(review): the notebook intro describes task 1 as "GRS 0-3"; this cell
    # stores the rounded mean of the raw column as-is - confirm whether any
    # binning into classes happens downstream.
    labels_task_1[video_name] = int(round(group["GLOBA_RATING_SCORE"].mean()))
    labels_task_2[video_name] = [int(round(group[col].mean())) for col in osats_cols]

print(f"Label_1: {len(labels_task_1)}")
print(f"Label_2: {len(labels_task_2)}")


Label_1: 314
Label_2: 314


In [5]:
# Video path collection: keep only .mp4 files that have labels for both tasks.
video_paths = {}

for fname in os.listdir(VIDEO_FOLDER):
    if not fname.endswith('.mp4'):
        continue

    stem = os.path.splitext(fname)[0]  # file name without the .mp4 extension
    if stem not in labels_task_1 or stem not in labels_task_2:
        continue

    video_paths[stem] = os.path.join(VIDEO_FOLDER, fname)

print(f"Videos: {len(video_paths)}")

Videos: 314


### 2. Video Sampler (3 fps)

In [6]:
# Video Sampler
def extract_frames(video_path, fps=FPS):
    """Decode a video and keep roughly ``fps`` frames per second as RGB PIL images.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        fps: Target sampling rate in frames per second.

    Returns:
        list[PIL.Image.Image]: The sampled frames in decoding order. The list
        is empty when the video cannot be opened (an error line is printed).
    """
    frames = []
    cap = cv2.VideoCapture(video_path)

    try:
        if not cap.isOpened():
            print(f"Error: {video_path}")
            return frames

        video_fps = cap.get(cv2.CAP_PROP_FPS)
        # Keep one frame out of every `frame_interval`. The interval is clamped
        # to 1 because the floor division yields 0 whenever the reported FPS is
        # lower than `fps` (or is 0), and `count % 0` would raise
        # ZeroDivisionError; in that case every decoded frame is kept.
        frame_interval = max(1, int(video_fps // fps))

        count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if count % frame_interval == 0:
                # Convert BGR (OpenCV) to RGB (PIL)
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))

            count += 1
    finally:
        # Always free the decoder, even if a frame conversion fails midway.
        cap.release()

    return frames
    

### 3. Crop + Resize + Normalize (using ImageNet statistics)

In [7]:
# ImageNet channel statistics, used because the features come from a pretrained ResNet.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Per-frame preprocessing, applied in this order to each PIL frame.
# NOTE(review): assumes frames are at least CROP pixels on each side - confirm
# against the actual video resolution.
_preprocess_steps = [
    transforms.CenterCrop(CROP),
    transforms.Resize((RESIZE, RESIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
]
transform = transforms.Compose(_preprocess_steps)

### 4. Video Processing
- Feature extraction using ResNet50 (T, 2048)
- KMeans to select the most representative frames (300)

In [8]:
# Load a pretrained ResNet50 and keep every child module except the last one
# (the fully connected layer), so the network is used as a feature extractor.
resnet = torch.nn.Sequential(
    *list(models.resnet50(pretrained=True).children())[:-1]
)
resnet.to(device)
# Inference mode; as the cell's last expression this also displays the module.
resnet.eval()

Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [9]:
def _select_keyframe_indices(features, n_frames):
    """Pick ``n_frames`` distinct row indices of ``features`` with KMeans, sorted ascending."""
    kmeans = KMeans(n_clusters=n_frames, random_state=0).fit(features)

    chosen = []
    for cluster_id, center in enumerate(kmeans.cluster_centers_):
        # Search only among the frames assigned to this cluster. Cluster
        # memberships are disjoint, so two centroids can never pick the same
        # frame (a global nearest-frame search can, which shrinks the result).
        members = np.flatnonzero(kmeans.labels_ == cluster_id)
        if members.size == 0:
            continue
        nearest = np.argmin(np.linalg.norm(features[members] - center, axis=1))
        chosen.append(int(members[nearest]))

    # Fallback for clusters that ended up empty: top up with evenly strided
    # frames that were not selected yet, so the count is always `n_frames`.
    missing = n_frames - len(chosen)
    if missing > 0:
        leftover = np.setdiff1d(np.arange(len(features)), chosen)
        step = max(1, len(leftover) // missing)
        chosen.extend(int(i) for i in leftover[::step][:missing])

    return sorted(chosen)


@torch.no_grad()
def process_video(video_path, n_frames=300):
    """Turn one video into a fixed-size array of key-frame embeddings.

    Frames are sampled with ``extract_frames``, preprocessed with the
    module-level ``transform``, embedded in batches of ``BATCH_SIZE`` with the
    module-level ``resnet``, and reduced to ``n_frames`` representative frames
    with KMeans (one frame per cluster, kept in temporal order).

    Args:
        video_path: Path of the video file.
        n_frames: Number of key frames to keep (default 300).

    Returns:
        np.ndarray of dtype float32 with ``n_frames`` rows (one embedding per
        selected frame), or ``None`` when the video yields fewer than
        ``n_frames`` sampled frames (including none at all), since KMeans
        cannot form more clusters than there are samples.
    """
    pil_frames = extract_frames(video_path, fps=FPS)
    if len(pil_frames) < n_frames:
        return None

    features = []
    for start in range(0, len(pil_frames), BATCH_SIZE):
        # Transform one batch at a time so the full list of frame tensors is
        # never held in memory alongside the PIL frames.
        batch = torch.stack(
            [transform(frame) for frame in pil_frames[start:start + BATCH_SIZE]]
        ).to(device)  # (B, 3, 224, 224)
        output = resnet(batch).squeeze(-1).squeeze(-1)  # (B, 2048)
        features.append(output.cpu())

    features = torch.cat(features, dim=0).numpy()  # (T, 2048)

    selected_indices = _select_keyframe_indices(features, n_frames)
    return features[selected_indices].astype(np.float32)  # (n_frames, 2048)

### 5. Saving to NumPy `.npz` files
- features: (300,2048) float32
- label_task_1: int32
- label_task_2: (8,) int32

In [10]:
# Create the destination folder up front: np.savez does not create missing
# directories, so without this every video would fail at save time, after its
# (expensive) feature extraction had already run.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

for video_name, video_path in tqdm(video_paths.items(), desc="Processing"):
    try:
        features = process_video(video_path)
        # Skip videos that did not produce the expected fixed-size feature array.
        if features is None or features.shape != (300, 2048):
            print(f"Ignored: {video_name}")
            continue

        # Output path
        output_path = os.path.join(OUTPUT_FOLDER, f"{video_name}.npz")

        # One archive per video: the features plus both task labels.
        np.savez(
            output_path,
            features=features,
            label_task_1=np.array(labels_task_1[video_name], dtype=np.int32),
            label_task_2=np.array(labels_task_2[video_name], dtype=np.int32)
        )

    except Exception as e:
        # Best effort: report the failing video and continue with the next one.
        print(f" Error {video_name}: {e}")

Processing: 100%|██████████| 314/314 [2:41:58<00:00, 30.95s/it]  
