In [1]:
import sys, os
try:
    from google.colab import drive, userdata
    IS_COLAB = True
except ImportError:
    IS_COLAB = False

REPO_NAME = 'MistakeDetection'

if IS_COLAB:
    print("☁️ Colab rilevato.")
    if not os.path.exists('/content/drive'): drive.mount('/content/drive')

    GITHUB_USER = 'MarcoPernoVDP'
    try:
        TOKEN = userdata.get('GITHUB_TOKEN')
        REPO_URL = f'https://{TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git'
    except:
        REPO_URL = f'https://github.com/{GITHUB_USER}/{REPO_NAME}.git'

    ROOT_DIR = f'/content/{REPO_NAME}'
    if not os.path.exists(ROOT_DIR):
        !git clone {REPO_URL}
    else:
        %cd {ROOT_DIR}
        !git pull
        %cd /content


else:
    print("Ambiente locale rilevato.")
    ROOT_DIR = os.getcwd()
    while not os.path.exists(os.path.join(ROOT_DIR, '.gitignore')) and ROOT_DIR != os.path.dirname(ROOT_DIR):
        ROOT_DIR = os.path.dirname(ROOT_DIR)

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)


Ambiente locale rilevato.


In [None]:
import cv2
import torch
import numpy as np
import torchvision.transforms as T
from egovlp_model import EgoVLP  # importa il modello dal repo EgoVLP

# -----------------------------
# Configuration
# -----------------------------
VIDEO_PATH = "video.mp4"
CLIP_LEN = 16           # frames per clip
OVERLAP = 8             # frames shared by consecutive clips (stride = CLIP_LEN - OVERLAP)
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing at `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
FEATURE_SAVE_PATH = "video_features.pt"

# Per-frame preprocessing: array -> PIL image -> 224x224 -> float tensor -> normalized.
# NOTE(review): the mean/std values are the usual ImageNet channel statistics;
# confirm they match the preprocessing the EgoVLP checkpoint was trained with.
transform = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])

# -----------------------------
# Split the video into clips
# -----------------------------
def split_video(video_path, clip_len=16, overlap=0):
    """Read a video and cut it into fixed-length clips of RGB frames.

    A clip is emitted every time ``clip_len`` frames have been buffered; the last
    ``overlap`` frames of that clip are carried over as the start of the next one.
    Whatever is left in the buffer when the video ends (carried-over frames
    included) is emitted as one final clip, padded by repeating its last frame.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        clip_len: Number of frames per clip; must be at least 1.
        overlap: Number of frames shared by consecutive clips; must be smaller
            than ``clip_len``. Negative values behave like 0.

    Returns:
        A list of clips, each a list of ``clip_len`` frames converted with
        ``cv2.COLOR_BGR2RGB``. The list is empty when no frame could be read.

    Raises:
        ValueError: If ``clip_len < 1`` or ``overlap >= clip_len``.
    """
    if clip_len < 1:
        raise ValueError(f"clip_len must be >= 1, got {clip_len}")
    if overlap >= clip_len:
        # Without this check the buffer would never shrink below clip_len after the
        # first clip, so every later frame would be silently discarded.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than clip_len ({clip_len})"
        )

    stride = clip_len - overlap  # number of frames dropped from the buffer per clip
    cap = cv2.VideoCapture(video_path)
    frames = []
    clips = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            if len(frames) == clip_len:
                clips.append(frames.copy())
                frames = frames[stride:]  # keep the overlapping tail
    finally:
        # Release the capture even if decoding/conversion raised.
        cap.release()

    # Final clip: pad with the last frame up to clip_len. After the validation
    # above the leftover buffer is always shorter than clip_len.
    if frames:
        frames.extend([frames[-1]] * (clip_len - len(frames)))
        clips.append(frames)

    return clips  # list of clips, each clip is a list of frames

# -----------------------------
# Clip preprocessing
# -----------------------------
def preprocess_clip(clip):
    """Run the module-level ``transform`` on every frame and stack the results.

    Args:
        clip: Iterable of frames accepted by ``transform``.

    Returns:
        One tensor with the per-frame outputs stacked along a new dimension 1,
        i.e. [C, T, H, W] when each transformed frame is [C, H, W].
    """
    per_frame = tuple(map(transform, clip))  # one transformed tensor per frame
    return torch.stack(per_frame, dim=1)

# -----------------------------
# Main extraction
# -----------------------------
def extract_features(video_path, checkpoint_path="path/to/egovlp_checkpoint"):
    """Encode every clip of a video with EgoVLP and save the stacked features.

    The video is split with ``split_video`` using the module-level ``CLIP_LEN`` and
    ``OVERLAP``; each clip is preprocessed, passed through ``model.video_encoder``
    on ``DEVICE`` and the per-clip outputs are concatenated along dimension 0 and
    written to ``FEATURE_SAVE_PATH`` with ``torch.save``.

    Args:
        video_path: Path of the video to process.
        checkpoint_path: Argument forwarded to ``EgoVLP.from_pretrained``. The
            default is the placeholder the notebook shipped with and must be
            replaced by a real checkpoint location.

    Returns:
        The concatenated feature tensor (expected shape [num_clips, D]).

    Raises:
        ValueError: If no clip could be extracted from ``video_path`` (e.g. the
            file is missing or unreadable).
    """
    # Split first so an unreadable video fails fast with a clear message instead of
    # loading the model and then crashing inside torch.cat on an empty list.
    clips = split_video(video_path, clip_len=CLIP_LEN, overlap=OVERLAP)
    if not clips:
        raise ValueError(f"No frames could be read from video: {video_path!r}")

    # Load the pretrained EgoVLP model in inference mode on the target device
    model = EgoVLP.from_pretrained(checkpoint_path)
    model.eval().to(DEVICE)

    all_features = []
    with torch.no_grad():
        for clip in clips:
            clip_tensor = preprocess_clip(clip).unsqueeze(0).to(DEVICE)  # [1,C,T,H,W]
            feat = model.video_encoder(clip_tensor)  # expected [1, D] — TODO confirm
            all_features.append(feat.cpu())  # keep results on CPU to free device memory

    # Concatenate the per-clip features
    all_features = torch.cat(all_features, dim=0)  # [num_clips, D]
    torch.save(all_features, FEATURE_SAVE_PATH)
    print(f"Feature salvate in {FEATURE_SAVE_PATH}, shape: {all_features.shape}")
    return all_features

# -----------------------------
# Entry point
# -----------------------------
# Run the extraction on the configured video when this code is executed as the
# main module (which is also the case inside a notebook kernel).
if __name__ == "__main__":
    extract_features(video_path=VIDEO_PATH)


[array([[[[  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],
 
         [[  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],
 
         [[  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],
 
         ...,
 
         [[199, 172, 134],
          [199, 172, 134],
          [199, 172, 133],
          ...,
          [166,  65,  56],
          [164,  63,  54],
          [162,  61,  52]],
 
         [[199, 172, 134],
          [199, 172, 134],
          [199, 172, 133],
          ...,
          [163,  62,  52],
          [162,  61,  51],
          [161,  60,  50]],
 
         [[199, 172, 134],
          [199, 172, 134],
          [199, 172, 133],
   