In [2]:
import os
import re
import io
import time
import math
import glob
import json
import shutil
import datetime
from typing import List, Tuple, Dict

from PIL import Image
import numpy as np

# Detector: ultralytics YOLO
try:
    from ultralytics import YOLO
except Exception as e:
    raise RuntimeError("Please install 'ultralytics' (pip install ultralytics).") from e

# BLIP captioning from Hugging Face / transformers
try:
    from transformers import BlipForConditionalGeneration, BlipProcessor
except Exception as e:
    raise RuntimeError("Please install 'transformers' and 'torch' (pip install transformers torch).") from e

# DB
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, Text, TIMESTAMP
from sqlalchemy.dialects.postgresql import JSONB
from urllib.parse import quote_plus

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# ============== CONFIG ==============
# Input frames folder (each frame is an image file)
INPUT_FRAMES_FOLDER = "/home/gotoxico/BD2-Trabalho/projetoBD2/Frames"           # folder with subfolders or images
OUTPUT_CROPS_FOLDER = "/home/gotoxico/BD2-Trabalho/projetoBD2/CroppedPersons"           # where cropped person images will be saved
OUTPUT_RECORDS_FOLDER = "/home/gotoxico/BD2-Trabalho/projetoBD2/Records"   # <-- new folder for JSON records

CAMERA_ID = 1                              # camera identifier to store in DB
YOLO_WEIGHTS = "yolo11n.pt"                        # None => use 'yolov8n' builtin; or path to your .pt (e.g., "yolov11.pt")
YOLO_CONF = 0.35                           # detection confidence threshold
IOU_MATCH_THRESHOLD = 0.3                  # tracker IoU threshold to match detections across frames
MIN_BOX_AREA = 400                         # ignore tiny boxes
# BLIP model (Hugging Face)
BLIP_MODEL = "Salesforce/blip-image-captioning-base"   # or "Salesforce/blip-image-captioning-large"
# PostgreSQL connection string (SQLAlchemy)
# The password is taken from the BD2_DB_PASSWORD environment variable so that no
# credential is stored in the notebook; it is URL-escaped before being embedded.
password = quote_plus(os.environ.get("BD2_DB_PASSWORD", ""))
# Must be an f-string: a plain string would send the literal text "{password}" to the server.
DB_URL = f"postgresql+psycopg2://postgres:{password}@localhost:5432/BD2"
# Optional: limit number of frames processed (None for all)
FRAME_LIMIT = None

# ======================================

# Supported image extensions
IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")

In [6]:
#!/usr/bin/env python3
"""
pipeline_frames_to_db.py

- Input: folder with frame images (any common image extension).
- Detect people with a YOLO model (Ultralytics YOLO API used here; you can point to a YOLOv11 weights file).
- Track across frames using a simple IoU-based tracker (stable IDs).
- Crop each detection and caption it with BLIP (Hugging Face).
- Save crop images and metadata to PostgreSQL.

Configure paths and DB settings in the CONFIG section.
"""

# ---------- Utilities ----------
def ensure_dir(path: str):
    """Create `path` (and missing parents) unless something already exists at that location."""
    if os.path.exists(path):
        return
    os.makedirs(path, exist_ok=True)

def list_frame_files(input_folder: str, exts=None) -> List[str]:
    """
    Recursively collect image files below `input_folder` in a reproducible order.

    input_folder: root directory to walk.
    exts: optional collection of lower-case extensions (with leading dot) to accept;
          when omitted the module-level IMAGE_EXTS is used.
    Returns the full paths; sub-directories are visited in sorted order and files
    are sorted by name within each directory.
    """
    if exts is None:
        exts = IMAGE_EXTS
    files = []
    for root, dirnames, filenames in os.walk(input_folder):
        # os.walk yields sub-directories in filesystem order; sorting the list in place
        # fixes the traversal order, so frames reach the tracker in a stable sequence.
        dirnames.sort()
        for f in sorted(filenames):
            if os.path.splitext(f)[1].lower() in exts:
                files.append(os.path.join(root, f))
    return files

def parse_timestamp_from_filename(fname: str) -> str:
    """
    Try to parse timestamp from filename using common patterns:
    - 20250915_143321
    - 2025-09-15T14:33:21
    - frame_000123 (fallback to file mtime)
    Returns ISO 8601 string.

    Every candidate is validated as a real calendar date/time; a candidate that is
    not valid is skipped, so the returned string can always be parsed back with
    datetime.fromisoformat. The mtime fallback requires `fname` to exist on disk.
    """
    base = os.path.basename(fname)
    # Compact or separated date+time (separators are all optional)
    m = re.search(r"(\d{4})[-_]?(\d{2})[-_]?(\d{2})[_T\-]?(\d{2})[:_]?(\d{2})[:_]?(\d{2})", base)
    if m:
        try:
            return datetime.datetime(*(int(g) for g in m.groups())).isoformat()
        except ValueError:
            # digits matched but do not form a valid date (e.g. month 13): keep looking
            pass
    # ISO-like; can still succeed when the first search hit an earlier, invalid digit run
    m2 = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})", base)
    if m2:
        try:
            return datetime.datetime.fromisoformat(m2.group(1)).isoformat()
        except ValueError:
            pass
    # fallback: file modification time
    ts = os.path.getmtime(fname)
    return datetime.datetime.fromtimestamp(ts).isoformat()

# ---------- Simple IoU tracker (replaceable by ByteTrack) ----------
def iou(boxA, boxB):
    """Intersection-over-union of two [x1,y1,x2,y2] boxes; 0.0 when they do not overlap."""
    # extent of the overlap rectangle along each axis (clamped at zero)
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    overlap = overlap_w * overlap_h
    if overlap == 0:
        return 0.0
    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - overlap
    return overlap / float(union)

class SimpleTracker:
    """
    Minimal greedy IoU tracker that hands out stable integer IDs.
    Only the most recent box of each track is kept; a detection continues a
    track when its IoU with that box reaches `iou_threshold`.
    """
    def __init__(self, iou_threshold=0.3, max_lost=30):
        self.iou_threshold = iou_threshold
        self.max_lost = max_lost
        self.tracks = {}  # id -> {'box', 'last_seen', 'lost'}
        self.next_id = 1

    def update(self, detections: List[Tuple[float, float, float, float]], frame_idx:int):
        """
        detections: list of boxes [x1,y1,x2,y2]
        returns list of (track_id, box): continued tracks first, then newly created ones
        """
        matched = []
        free = set(range(len(detections)))

        # each existing track greedily claims its best still-unclaimed detection
        for track_id, state in list(self.tracks.items()):
            top_idx, top_score = -1, 0.0
            for idx in free:
                score = iou(state['box'], detections[idx])
                if score > top_score:
                    top_idx, top_score = idx, score
            if top_idx == -1 or not (top_score >= self.iou_threshold):
                # nothing good enough this frame
                state['lost'] += 1
                continue
            state['box'] = detections[top_idx]
            state['last_seen'] = frame_idx
            state['lost'] = 0
            matched.append((track_id, detections[top_idx]))
            free.remove(top_idx)

        # drop tracks that stayed unmatched for more than max_lost updates
        stale = [track_id for track_id, state in self.tracks.items() if state['lost'] > self.max_lost]
        for track_id in stale:
            self.tracks.pop(track_id)

        # every leftover detection starts a new track
        for idx in sorted(free):
            track_id = self.next_id
            self.next_id = track_id + 1
            self.tracks[track_id] = {'box': detections[idx], 'last_seen': frame_idx, 'lost': 0}
            matched.append((track_id, detections[idx]))

        return matched

# ---------- Detector wrapper ----------
class Detector:
    """Thin wrapper around an Ultralytics YOLO model that returns detections as plain dicts."""
    def __init__(self, weights: str = None, conf: float = 0.25):
        # If weights is None, use a small built-in YOLO model (yolov8n)
        self.model = YOLO(weights) if weights else YOLO("yolov8n")
        self.conf = conf

    def detect(self, image: np.ndarray):
        """
        image: HxWx3 RGB numpy array (e.g. np.array of a PIL image in "RGB" mode)
        returns: list of detections as dicts: {bbox: [x1,y1,x2,y2], confidence, class_id, class_name}
        """
        # Ultralytics interprets numpy inputs as BGR (OpenCV convention), so the RGB
        # array handed in by the pipeline must have its channel order reversed first.
        if image.ndim == 3 and image.shape[2] == 3:
            image = np.ascontiguousarray(image[:, :, ::-1])
        res = self.model.predict(image, imgsz=640, conf=self.conf, verbose=False)
        # res is a list (batch); get first
        results = res[0]
        detections = []
        if results.boxes is not None:
            boxes = results.boxes.xyxy.cpu().numpy()  # Nx4
            scores = results.boxes.conf.cpu().numpy()
            cls = results.boxes.cls.cpu().numpy().astype(int)
            has_names = hasattr(self.model, "names")
            for b, s, c in zip(boxes, scores, cls):
                detections.append({
                    'bbox': [float(b[0]), float(b[1]), float(b[2]), float(b[3])],
                    'confidence': float(s),
                    'class_id': int(c),
                    'class_name': self.model.names[int(c)] if has_names else str(c)
                })
        return detections

# ---------- Captioner (BLIP) ----------
class BLIPCaptioner:
    """Loads a BLIP captioning model once and produces a text caption per PIL image."""
    def __init__(self, model_name=BLIP_MODEL, device="cuda"):
        import torch
        self.device = device
        print("Loading BLIP model (this may take a while)...")
        self.processor = BlipProcessor.from_pretrained(model_name)
        blip = BlipForConditionalGeneration.from_pretrained(model_name)
        self.model = blip.to(self.device)
        self.model.eval()

    def caption(self, pil_image: Image.Image, max_length=30) -> str:
        """Return the decoded caption for `pil_image`; `max_length` is passed as max_new_tokens."""
        import torch
        batch = self.processor(images=pil_image, return_tensors="pt")
        batch = batch.to(self.device)
        # generation only: no gradients needed
        with torch.no_grad():
            generated = self.model.generate(**batch, max_new_tokens=max_length)
        return self.processor.decode(generated[0], skip_special_tokens=True)

# ---------- Database (SQLAlchemy) ----------
def create_metadata_table(engine):
    """
    Declare the "person_tracks" table, run MetaData.create_all against `engine`,
    and return the Table object used for inserts.
    """
    schema = MetaData()
    columns = [
        Column("id", Integer, primary_key=True, autoincrement=True),
        Column("track_id", Integer, nullable=False),
        Column("timestamp", TIMESTAMP, nullable=False),
        Column("camera_id", Integer, nullable=True),
        Column("description", Text, nullable=True),
        Column("bbox", JSONB, nullable=False),   # json object {x1,y1,x2,y2}
        Column("image_path", String, nullable=False),
    ]
    person_tracks = Table("person_tracks", schema, *columns)
    schema.create_all(engine)
    return person_tracks

# ---------- Main pipeline ----------
def run_pipeline():
    """
    Detect, track, crop and caption people in every frame under INPUT_FRAMES_FOLDER.

    For each tracked person in each frame a JPEG crop is written to
    OUTPUT_CROPS_FOLDER and one row is inserted into the "person_tracks" table.
    Rows are committed one frame at a time, so an error or a manual interrupt
    late in a long run keeps the rows of all frames already processed (matching
    the crops already on disk) instead of rolling the whole run back.
    """
    # Prepare folders
    ensure_dir(OUTPUT_CROPS_FOLDER)

    # Prepare DB
    engine = create_engine(DB_URL, echo=False, future=True)
    records_table = create_metadata_table(engine)

    # Load models
    detector = Detector(weights=YOLO_WEIGHTS, conf=YOLO_CONF)
    # Use the GPU only when torch reports one; fall back to CPU otherwise.
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        device = "cpu"
    captioner = BLIPCaptioner(model_name=BLIP_MODEL, device=device)

    # Tracker (simple)
    tracker = SimpleTracker(iou_threshold=IOU_MATCH_THRESHOLD, max_lost=30)

    frame_files = list_frame_files(INPUT_FRAMES_FOLDER)
    if FRAME_LIMIT:
        frame_files = frame_files[:FRAME_LIMIT]
    if not frame_files:
        print("No frames found in", INPUT_FRAMES_FOLDER)
        return

    print(f"Found {len(frame_files)} frames, processing...")

    for frame_idx, frame_path in enumerate(frame_files, start=1):
        # Load image (as RGB numpy for the detector + PIL for crop); the source
        # file handle is released as soon as the RGB copy exists.
        with Image.open(frame_path) as src:
            pil = src.convert("RGB")
        img_np = np.array(pil)  # RGB
        # Run detector
        dets = detector.detect(img_np)
        # Filter only 'person' class if model uses coco names
        person_dets = []
        for d in dets:
            name = d.get('class_name', '').lower()
            if name in ("person", "people", "human") or int(d.get('class_id', -1)) == 0:
                x1, y1, x2, y2 = [int(round(v)) for v in d['bbox']]
                area = (x2 - x1) * (y2 - y1)
                if area >= MIN_BOX_AREA:
                    person_dets.append([x1, y1, x2, y2])

        # Update tracker and get assignments
        assignments = tracker.update(person_dets, frame_idx)  # list of (track_id, box)
        timestamp = parse_timestamp_from_filename(frame_path)
        frame_base = os.path.splitext(os.path.basename(frame_path))[0]
        fname_ts = timestamp.replace(":", "").replace("-", "")

        rows = []
        for track_id, box in assignments:
            x1, y1, x2, y2 = [int(v) for v in box]
            # crop (use PIL)
            crop = pil.crop((x1, y1, x2, y2))
            # resize crop to a fixed size for BLIP
            crop_for_blip = crop.resize((384, 384), Image.LANCZOS)
            # caption (best effort: a failed caption is stored as an empty string)
            try:
                caption = captioner.caption(crop_for_blip)
            except Exception as e:
                caption = ""
                print("Caption error:", e)

            # save crop image with filename including track and timestamp
            fname = f"cam{CAMERA_ID}_trk{track_id}_{frame_base}_{fname_ts}.jpg"
            out_path = os.path.join(OUTPUT_CROPS_FOLDER, fname)
            crop.save(out_path, format="JPEG", quality=90)

            rows.append({
                "track_id": int(track_id),
                "timestamp": datetime.datetime.fromisoformat(timestamp),
                "camera_id": int(CAMERA_ID),
                "description": caption,
                "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
                "image_path": out_path
            })

        # one short transaction per frame, inserting all of the frame's rows together
        if rows:
            with engine.begin() as conn:
                conn.execute(records_table.insert(), rows)

        if frame_idx % 50 == 0:
            print(f"Processed {frame_idx}/{len(frame_files)} frames")

    print("Done. Crops saved to:", OUTPUT_CROPS_FOLDER)

if __name__ == "__main__":
    run_pipeline()


OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [17]:
#!/usr/bin/env python3
"""
pipeline_frames_to_files.py

Same pipeline as before, but instead of saving records to PostgreSQL, it writes each
record as a JSON file into OUTPUT_RECORDS_FOLDER. Cropped person images are saved to
OUTPUT_CROPS_FOLDER as before.

Configure paths and options in the CONFIG section.
"""

# ---------- Utilities ----------
def ensure_dir(path: str):
    """Make directory `path` with any missing parents; do nothing if the path already exists."""
    already_there = os.path.exists(path)
    if already_there:
        return
    os.makedirs(path, exist_ok=True)


def list_frame_files(input_folder: str, exts=None) -> List[str]:
    """
    Recursively collect image files below `input_folder` in a reproducible order.

    input_folder: root directory to walk.
    exts: optional collection of lower-case extensions (with leading dot) to accept;
          when omitted the module-level IMAGE_EXTS is used.
    Returns the full paths; sub-directories are visited in sorted order and files
    are sorted by name within each directory.
    """
    if exts is None:
        exts = IMAGE_EXTS
    files = []
    for root, dirnames, filenames in os.walk(input_folder):
        # Sorting the directory list in place pins the traversal order, which
        # os.walk otherwise takes from the filesystem; the tracker depends on
        # frames arriving in a stable sequence.
        dirnames.sort()
        for f in sorted(filenames):
            if os.path.splitext(f)[1].lower() in exts:
                files.append(os.path.join(root, f))
    return files


def parse_timestamp_from_filename(fname: str) -> str:
    """
    Try to parse timestamp from filename using common patterns:
    - 20250915_143321
    - 2025-09-15T14:33:21
    - frame_000123 (fallback to file mtime)
    Returns ISO 8601 string.

    Candidates that do not form a valid calendar date/time are skipped, so the
    result can always be parsed back with datetime.fromisoformat. The mtime
    fallback requires `fname` to exist on disk.
    """
    base = os.path.basename(fname)
    m = re.search(r"(\d{4})[-_]?(\d{2})[-_]?(\d{2})[_T\-]?(\d{2})[:_]?(\d{2})[:_]?(\d{2})", base)
    if m:
        try:
            return datetime.datetime(*(int(g) for g in m.groups())).isoformat()
        except ValueError:
            # matched digits are not a real date (e.g. month 13): try the next pattern
            pass
    m2 = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})", base)
    if m2:
        try:
            # validate instead of returning the raw text
            return datetime.datetime.fromisoformat(m2.group(1)).isoformat()
        except ValueError:
            pass
    ts = os.path.getmtime(fname)
    return datetime.datetime.fromtimestamp(ts).isoformat()


# ---------- Simple IoU tracker ----------
def iou(boxA, boxB):
    """Return the IoU of two [x1,y1,x2,y2] boxes, or 0.0 if their intersection is empty."""
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])
    shared = max(0, right - left) * max(0, bottom - top)
    if shared == 0:
        return 0.0
    size_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    size_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    return shared / float(size_a + size_b - shared)


class SimpleTracker:
    """Greedy IoU tracker: keeps the last box per track and assigns stable integer IDs."""

    def __init__(self, iou_threshold=0.3, max_lost=30):
        self.iou_threshold = iou_threshold
        self.max_lost = max_lost
        self.tracks = {}  # id -> {'box', 'last_seen', 'lost'}
        self.next_id = 1

    def update(self, detections: List[Tuple[float, float, float, float]], frame_idx: int):
        """
        Match `detections` ([x1,y1,x2,y2] boxes) against current tracks.
        Returns a list of (track_id, box): continued tracks first, then new ones.
        """
        results = []
        pending = set(range(len(detections)))

        for track_id, state in list(self.tracks.items()):
            # best still-unclaimed detection for this track
            winner, winner_iou = -1, 0.0
            for cand in pending:
                overlap = iou(state['box'], detections[cand])
                if overlap > winner_iou:
                    winner, winner_iou = cand, overlap
            if winner == -1 or not (winner_iou >= self.iou_threshold):
                state['lost'] += 1
                continue
            state['box'] = detections[winner]
            state['last_seen'] = frame_idx
            state['lost'] = 0
            results.append((track_id, detections[winner]))
            pending.remove(winner)

        # forget tracks unmatched for more than max_lost updates
        expired = [track_id for track_id, state in self.tracks.items() if state['lost'] > self.max_lost]
        for track_id in expired:
            self.tracks.pop(track_id)

        # unclaimed detections open new tracks
        for cand in sorted(pending):
            track_id = self.next_id
            self.next_id = track_id + 1
            self.tracks[track_id] = {'box': detections[cand], 'last_seen': frame_idx, 'lost': 0}
            results.append((track_id, detections[cand]))
        return results


# ---------- Detector wrapper ----------
class Detector:
    """Thin wrapper around an Ultralytics YOLO model that returns detections as plain dicts."""

    def __init__(self, weights: str = None, conf: float = 0.25, device: str = "cpu"):
        self.device = device
        # fall back to the small yolov8n checkpoint when no weights are given
        self.model = YOLO(weights) if weights else YOLO("yolov8n")
        self.conf = conf

    def detect(self, image: np.ndarray):
        """
        image: HxWx3 RGB numpy array (e.g. np.array of a PIL image in "RGB" mode)
        returns: list of dicts {bbox: [x1,y1,x2,y2], confidence, class_id, class_name}
        """
        # Ultralytics interprets numpy inputs as BGR (OpenCV convention), so the RGB
        # array handed in by the pipeline must have its channel order reversed first.
        if image.ndim == 3 and image.shape[2] == 3:
            image = np.ascontiguousarray(image[:, :, ::-1])
        res = self.model.predict(image, imgsz=640, conf=self.conf, verbose=False, device=self.device)
        results = res[0]  # single-image batch
        detections = []
        if results.boxes is not None:
            boxes = results.boxes.xyxy.cpu().numpy()
            scores = results.boxes.conf.cpu().numpy()
            cls = results.boxes.cls.cpu().numpy().astype(int)
            has_names = hasattr(self.model, "names")
            for b, s, c in zip(boxes, scores, cls):
                detections.append({
                    'bbox': [float(b[0]), float(b[1]), float(b[2]), float(b[3])],
                    'confidence': float(s),
                    'class_id': int(c),
                    'class_name': self.model.names[int(c)] if has_names else str(c)
                })
        return detections


# ---------- Captioner (BLIP) ----------
class BLIPCaptioner:
    """Holds a BLIP processor/model pair on `device` and captions PIL images."""

    def __init__(self, model_name=BLIP_MODEL, device="cpu"):
        import torch
        self.device = device
        print("Loading BLIP model (this may take a while)...")
        self.processor = BlipProcessor.from_pretrained(model_name)
        loaded = BlipForConditionalGeneration.from_pretrained(model_name)
        self.model = loaded.to(self.device)
        self.model.eval()

    def caption(self, pil_image: Image.Image, max_length=60) -> str:
        """Return the decoded caption for `pil_image`; `max_length` is passed as max_new_tokens."""
        import torch
        encoded = self.processor(images=pil_image, return_tensors="pt")
        encoded = encoded.to(self.device)
        # generation only: gradients are not needed
        with torch.no_grad():
            token_ids = self.model.generate(**encoded, max_new_tokens=max_length)
        return self.processor.decode(token_ids[0], skip_special_tokens=True)


# ---------- Main pipeline ----------
def run_pipeline():
    """
    Detect, track, crop and caption people in every frame under INPUT_FRAMES_FOLDER.

    For each tracked person in each frame a JPEG crop goes to OUTPUT_CROPS_FOLDER
    and a JSON record describing it goes to OUTPUT_RECORDS_FOLDER.
    """
    # Output folders
    for folder in (OUTPUT_CROPS_FOLDER, OUTPUT_RECORDS_FOLDER):
        ensure_dir(folder)

    # Pick the GPU when torch reports one, otherwise the CPU
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        device = "cpu"
    print(f"Using device: {device}")

    detector = Detector(weights=YOLO_WEIGHTS, conf=YOLO_CONF, device=device)
    captioner = BLIPCaptioner(model_name=BLIP_MODEL, device=device)
    tracker = SimpleTracker(iou_threshold=IOU_MATCH_THRESHOLD, max_lost=30)

    frame_files = list_frame_files(INPUT_FRAMES_FOLDER)
    if FRAME_LIMIT:
        frame_files = frame_files[:FRAME_LIMIT]
    if not frame_files:
        print("No frames found in", INPUT_FRAMES_FOLDER)
        return

    print(f"Found {len(frame_files)} frames, processing...")

    person_labels = ("person", "people", "human")
    for frame_idx, frame_path in enumerate(frame_files, start=1):
        pil = Image.open(frame_path).convert("RGB")
        dets = detector.detect(np.array(pil))  # RGB array

        # keep only sufficiently large person detections, as integer boxes
        person_dets_xyxy = []
        for d in dets:
            label = d.get('class_name', '').lower()
            if not (label in person_labels or int(d.get('class_id', -1)) == 0):
                continue
            x1, y1, x2, y2 = [int(round(v)) for v in d['bbox']]
            if (x2 - x1) * (y2 - y1) >= MIN_BOX_AREA:
                person_dets_xyxy.append([x1, y1, x2, y2])

        assignments = tracker.update(person_dets_xyxy, frame_idx)
        timestamp = parse_timestamp_from_filename(frame_path)
        frame_base = os.path.splitext(os.path.basename(frame_path))[0]
        # timestamp without ':' and '-' so it is safe inside a file name
        fname_ts = timestamp.replace(":", "").replace("-", "")

        for track_id, box in assignments:
            x1, y1, x2, y2 = [int(v) for v in box]
            crop = pil.crop((x1, y1, x2, y2))
            # a failed caption is stored as an empty string
            try:
                caption = captioner.caption(crop.resize((384, 384), Image.LANCZOS))
            except Exception as e:
                caption = ""
                print("Caption error:", e)

            # shared stem for the crop image and its record
            stem = f"cam{CAMERA_ID}_trk{track_id}_{frame_base}_{fname_ts}"
            out_img_path = os.path.join(OUTPUT_CROPS_FOLDER, f"{stem}.jpg")
            crop.save(out_img_path, format="JPEG", quality=90)

            record = dict(
                track_id=int(track_id),
                timestamp=timestamp,
                camera_id=int(CAMERA_ID),
                description=caption,
                bbox={"x1": x1, "y1": y1, "x2": x2, "y2": y2},
                image_path=out_img_path,
            )

            record_path = os.path.join(OUTPUT_RECORDS_FOLDER, f"record_{stem}.json")
            # a record that cannot be written is reported and skipped
            try:
                with open(record_path, "w", encoding="utf-8") as fh:
                    json.dump(record, fh, ensure_ascii=False, indent=2)
            except Exception as e:
                print("Failed to write record:", e)

        if frame_idx % 50 == 0:
            print(f"Processed {frame_idx}/{len(frame_files)} frames")

    print("Done. Crops saved to:", OUTPUT_CROPS_FOLDER)
    print("Records saved to:", OUTPUT_RECORDS_FOLDER)


# Run the pipeline when the cell/script is executed directly
if __name__ == "__main__":
    run_pipeline()


Using device: cuda
Loading BLIP model (this may take a while)...
Found 78347 frames, processing...


KeyboardInterrupt: 

In [1]:
#!/usr/bin/env python3
"""
pipeline_frames_to_files_with_attributes.py

Full pipeline:
- Uses Ultralytics YOLO for person detection (weights set by YOLO_WEIGHTS, 'yolo11n.pt' by default; 'yolov8n' only when no weights are given)
- Uses BLIP for short captions
- Uses CLIP zero-shot to extract non-sensitive attributes (hair color, clothing item, clothing color, accessories)
- Saves cropped person images and JSON records to disk

Important: this code intentionally does NOT infer or record sensitive attributes such as skin tone, race, religion, or sexual orientation.

Requirements:
  pip install ultralytics transformers torch pillow numpy opencv-python
"""

import os
import json
import datetime
import re
from typing import List, Tuple
from PIL import Image
import numpy as np
import cv2

# -------- CONFIG --------
# Folders: frames are read from the first, crops and JSON records are written to the others
INPUT_FRAMES_FOLDER = "/home/gotoxico/BD2-Trabalho/projetoBD2/Frames"
OUTPUT_CROPS_FOLDER = "/home/gotoxico/BD2-Trabalho/projetoBD2/CroppedPersons"
OUTPUT_RECORDS_FOLDER = "/home/gotoxico/BD2-Trabalho/projetoBD2/Records"

# Frame selection
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}   # accepted (lower-cased) file extensions
FRAME_LIMIT = 1000                               # falsy value => process every frame
CAMERA_ID = 1                                    # stored in every record

# Detection / tracking
YOLO_WEIGHTS = "yolo11n.pt"
YOLO_CONF = 0.35            # detector confidence threshold
MIN_BOX_AREA = 900          # person boxes with a smaller pixel area are dropped
IOU_MATCH_THRESHOLD = 0.3   # minimum IoU for a detection to continue a track

# Captioning (BLIP)
BLIP_MODEL = "Salesforce/blip-image-captioning-base"
CAPTION_MAX_TOKENS = 160    # passed to generate() as max_new_tokens
CAPTION_MAX_WORDS = 80      # word cap applied by shorten_text

# Attribute matching (CLIP)
ATTR_TOP_K = 2
ATTR_CONF_THRESHOLD = 0.05  # default minimum score for an attribute label to be kept

# -------- optional imports (fail gracefully) ----------
try:
    from ultralytics import YOLO
except Exception:
    YOLO = None

try:
    from transformers import BlipProcessor, BlipForConditionalGeneration
except Exception:
    BlipProcessor = None
    BlipForConditionalGeneration = None

try:
    from transformers import CLIPProcessor, CLIPModel
except Exception:
    CLIPProcessor = None
    CLIPModel = None

# -------- Utilities --------
def ensure_dir(path: str):
    """Create directory `path` together with any missing parents; an existing directory is accepted."""
    os.makedirs(name=path, exist_ok=True)

def list_frame_files(input_folder: str, exts=None) -> List[str]:
    """
    Recursively collect image files below `input_folder` in a reproducible order.

    input_folder: root directory to walk.
    exts: optional collection of lower-case extensions (with leading dot) to accept;
          when omitted the module-level IMAGE_EXTS is used.
    Returns the full paths; sub-directories are visited in sorted order and files
    are sorted by name within each directory.
    """
    if exts is None:
        exts = IMAGE_EXTS
    files = []
    for root, dirnames, filenames in os.walk(input_folder):
        # Without this in-place sort the sub-directory order comes from the
        # filesystem, and the tracker would see frames in an unstable sequence.
        dirnames.sort()
        for f in sorted(filenames):
            if os.path.splitext(f)[1].lower() in exts:
                files.append(os.path.join(root, f))
    return files

def parse_timestamp_from_filename(fname: str) -> str:
    """
    Derive an ISO 8601 timestamp for a frame file.

    The base name is searched first for a date+time with optional separators
    (e.g. 20250915_143321) and then for an ISO-like 2025-09-15T14:33:21. A
    candidate that is not a valid calendar date/time is skipped. If neither
    pattern yields a valid value, the file's modification time is used, which
    requires `fname` to exist on disk.
    """
    base = os.path.basename(fname)
    m = re.search(r"(\d{4})[-_]?(\d{2})[-_]?(\d{2})[_T\-]?(\d{2})[:_]?(\d{2})[:_]?(\d{2})", base)
    if m:
        try:
            return datetime.datetime(*(int(g) for g in m.groups())).isoformat()
        except ValueError:
            # matched digits are not a real date (e.g. month 13): try the next pattern
            pass
    m2 = re.search(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})", base)
    if m2:
        try:
            # validate instead of returning the raw text
            return datetime.datetime.fromisoformat(m2.group(1)).isoformat()
        except ValueError:
            pass
    ts = os.path.getmtime(fname)
    return datetime.datetime.fromtimestamp(ts).isoformat()

def shorten_text(text: str, max_words: int = CAPTION_MAX_WORDS) -> str:
    """
    Reduce `text` to its first sentence (up to the first '.', '?' or '!'),
    capped at `max_words` whitespace-separated words. Empty input gives "".
    """
    if not text:
        return ""
    sentence = re.split(r'[.?!]\s*', text.strip())[0].strip()
    tokens = sentence.split()
    if len(tokens) > max_words:
        return " ".join(tokens[:max_words])
    return sentence

# --------- Simple IoU tracker ----------
def iou(boxA, boxB):
    """
    Intersection-over-union of two [x1,y1,x2,y2] boxes.
    Returns 0.0 when the boxes do not overlap; each box area is floored at 1.0.
    """
    inter_left = max(boxA[0], boxB[0])
    inter_top = max(boxA[1], boxB[1])
    inter_right = min(boxA[2], boxB[2])
    inter_bottom = min(boxA[3], boxB[3])
    shared = max(0, inter_right - inter_left) * max(0, inter_bottom - inter_top)
    if shared == 0:
        return 0.0
    size_a = max(1.0, (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
    size_b = max(1.0, (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))
    return shared / float(size_a + size_b - shared)

class SimpleTracker:
    """Greedy IoU tracker: remembers the last box of each track and assigns stable integer IDs."""

    def __init__(self, iou_threshold=0.3, max_lost=30):
        self.iou_threshold = iou_threshold
        self.max_lost = max_lost
        self.tracks = {}  # id -> {'box', 'last_seen', 'lost'}
        self.next_id = 1

    def update(self, detections: List[Tuple[float, float, float, float]], frame_idx: int):
        """
        Match `detections` ([x1,y1,x2,y2] boxes) against current tracks.
        Returns a list of (track_id, box): continued tracks first, then new ones.
        """
        output = []
        open_dets = set(range(len(detections)))

        for track_id, state in list(self.tracks.items()):
            # best still-unclaimed detection for this track
            pick, pick_iou = -1, 0.0
            for cand in list(open_dets):
                overlap = iou(state['box'], detections[cand])
                if overlap > pick_iou:
                    pick, pick_iou = cand, overlap
            if pick == -1 or not (pick_iou >= self.iou_threshold):
                state['lost'] += 1
                continue
            state['box'] = detections[pick]
            state['last_seen'] = frame_idx
            state['lost'] = 0
            output.append((track_id, detections[pick]))
            open_dets.remove(pick)

        # forget tracks unmatched for more than max_lost updates
        expired = [track_id for track_id, state in self.tracks.items() if state['lost'] > self.max_lost]
        for track_id in expired:
            self.tracks.pop(track_id)

        # unclaimed detections open new tracks
        for cand in sorted(open_dets):
            track_id = self.next_id
            self.next_id = track_id + 1
            self.tracks[track_id] = {'box': detections[cand], 'last_seen': frame_idx, 'lost': 0}
            output.append((track_id, detections[cand]))
        return output

# --------- Detector wrapper ----------
class Detector:
    """Thin wrapper around an Ultralytics YOLO model that returns detections as plain dicts."""

    def __init__(self, weights: str = None, conf: float = 0.25, device: str = "cpu"):
        self.device = device
        if YOLO is None:
            raise RuntimeError("ultralytics YOLO not installed (pip install ultralytics)")
        # fall back to the small yolov8n checkpoint when no weights are given
        self.model = YOLO(weights) if weights else YOLO("yolov8n")
        self.conf = conf

    def detect(self, image: np.ndarray):
        """
        image: HxWx3 RGB numpy array (e.g. np.array of a PIL image in "RGB" mode)
        returns: list of dicts {bbox: [x1,y1,x2,y2], confidence, class_id, class_name};
                 class_name falls back to the class id as text when the model has no name for it
        """
        # Ultralytics interprets numpy inputs as BGR (OpenCV convention), so the RGB
        # array handed in by the pipeline must have its channel order reversed first.
        if image.ndim == 3 and image.shape[2] == 3:
            image = np.ascontiguousarray(image[:, :, ::-1])
        res = self.model.predict(image, imgsz=640, conf=self.conf, verbose=False, device=self.device)
        results = res[0]  # single-image batch
        detections = []
        if getattr(results, 'boxes', None) is not None:
            boxes = results.boxes.xyxy.cpu().numpy()
            scores = results.boxes.conf.cpu().numpy()
            cls = results.boxes.cls.cpu().numpy().astype(int)
            names = getattr(self.model, "names", None)
            for b, s, c in zip(boxes, scores, cls):
                class_id = int(c)
                detections.append({
                    'bbox': [float(b[0]), float(b[1]), float(b[2]), float(b[3])],
                    'confidence': float(s),
                    'class_id': class_id,
                    'class_name': names[class_id] if names is not None and class_id in names else str(class_id)
                })
        return detections

# --------- BLIP captioner ----------
class BLIPCaptioner:
    """Holds a BLIP processor/model pair on `device` and produces short captions for PIL images."""

    def __init__(self, model_name=BLIP_MODEL, device="cpu"):
        if BlipProcessor is None or BlipForConditionalGeneration is None:
            raise RuntimeError("transformers with BLIP not installed (pip install transformers)")
        import torch
        self.device = device
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def caption(self, pil_image: Image.Image, max_length=CAPTION_MAX_TOKENS) -> str:
        """
        Caption `pil_image` with beam search and return the text after shorten_text.

        max_length: upper bound on generated tokens (passed as max_new_tokens).
        """
        import torch
        inputs = self.processor(images=pil_image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            # Beam search without sampling: `temperature` only applies when
            # do_sample=True, so it was dropped; passing it merely raised a warning.
            out = self.model.generate(**inputs, max_new_tokens=max_length, num_beams=5, early_stopping=True)
        text = self.processor.decode(out[0], skip_special_tokens=True)
        return shorten_text(text, max_words=CAPTION_MAX_WORDS)

# --------- CLIP attribute matcher (zero-shot, non-sensitive attrs) ----------
class CLIPAttributeMatcher:
    """Scores an image crop against fixed text prompts with CLIP and reports the best-matching labels."""

    def __init__(self, device="cpu"):
        if CLIPProcessor is None or CLIPModel is None:
            raise RuntimeError("transformers with CLIP not installed (pip install transformers)")
        import torch
        self.device = device
        print("Loading CLIP model for attributes...")
        clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.model = clip.to(self.device)
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        # Candidate prompts, one list per attribute group
        self.hair_colors = ["blond hair", "brown hair", "black hair", "gray hair", "red hair", "dark hair", "light hair"]
        self.clothing_items = ["t-shirt", "shirt", "jacket", "hoodie", "dress", "coat", "suit", "skirt", "pants", "shorts"]
        self.clothing_colors = [
            "red shirt", "blue shirt", "black shirt", "white shirt", "green shirt",
            "red jacket", "blue jacket", "black jacket", "white jacket", "green jacket",
            "gray sweater", "striped shirt", "plaid shirt"
        ]
        self.accessories = ["wearing glasses", "wearing sunglasses", "wearing a hat", "backpack", "holding a bag"]

    def _score_texts(self, pil_image: Image.Image, texts: List[str]):
        """Return one softmax probability per entry of `texts` for `pil_image`, as a list of floats."""
        import torch
        batch = self.processor(text=texts, images=pil_image, return_tensors="pt", padding=True)
        batch = batch.to(self.device)
        with torch.no_grad():
            logits = self.model(**batch).logits_per_image
            probs = logits.softmax(dim=1).cpu().numpy().flatten()
        return probs.tolist()

    def _top_filtered(self, pil_image: Image.Image, texts: List[str], top_k=2, min_conf=ATTR_CONF_THRESHOLD):
        """Return up to `top_k` {"label", "score"} dicts, best first, keeping only scores >= `min_conf`."""
        scores = self._score_texts(pil_image, texts)
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]
        kept = []
        for i in ranked:
            if scores[i] >= min_conf:
                kept.append({"label": texts[i], "score": float(scores[i])})
        return kept

    def get_attributes(self, pil_image: Image.Image, top_k=2):
        """Return a dict mapping each attribute group name to its filtered top-`top_k` labels."""
        groups = (
            ('hair', self.hair_colors),
            ('clothing_items', self.clothing_items),
            ('clothing_colors', self.clothing_colors),
            ('accessories', self.accessories),
        )
        return {key: self._top_filtered(pil_image, prompts, top_k=top_k) for key, prompts in groups}

# --------- Main pipeline ----------
def _filter_person_boxes(dets, min_area):
    """Return integer [x1, y1, x2, y2] boxes of person detections with area >= min_area."""
    boxes = []
    for d in dets:
        name = d.get('class_name', '').lower()
        # Accept either a person-like class name or class id 0.
        if name in ("person", "people", "human") or int(d.get('class_id', -1)) == 0:
            x1, y1, x2, y2 = [int(round(v)) for v in d['bbox']]
            if (x2 - x1) * (y2 - y1) >= min_area:
                boxes.append([x1, y1, x2, y2])
    return boxes


def _clip_box_to_image(box, width, height):
    """Clamp an (x1, y1, x2, y2) box to a width x height image; None if nothing is left."""
    x1, y1, x2, y2 = [int(v) for v in box]
    x1, x2 = max(0, x1), min(width, x2)
    y1, y2 = max(0, y1), min(height, y2)
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


def _save_crop_jpeg(crop, path, label):
    """Best-effort JPEG save of a crop; a failure is printed, not raised."""
    try:
        crop.save(path, format="JPEG", quality=90)
    except Exception as e:
        print(f"Failed to save {label} crop:", e)


def run_pipeline_single_record():
    """Detect and track people across the input frames and write one JSON record per track.

    Reads the module-level settings (``INPUT_FRAMES_FOLDER``,
    ``OUTPUT_CROPS_FOLDER``, ``OUTPUT_RECORDS_FOLDER``, ``CAMERA_ID``,
    ``YOLO_WEIGHTS``, ``YOLO_CONF``, ``IOU_MATCH_THRESHOLD``, ``MIN_BOX_AREA``,
    ``BLIP_MODEL``, ``FRAME_LIMIT``, ``ATTR_TOP_K``).

    For every frame, person detections are passed to the tracker. The first
    time a track id is seen, its crop is captioned, attribute-matched and
    saved as the track's "first" crop. On later frames only the end
    timestamp, last crop path and end bounding box of the record are
    refreshed. After the last frame each record is written to
    ``record_cam{CAMERA_ID}_trk{track_id}.json`` in ``OUTPUT_RECORDS_FOLDER``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    ensure_dir(OUTPUT_CROPS_FOLDER)
    ensure_dir(OUTPUT_RECORDS_FOLDER)

    # Use the GPU when torch reports one; fall back to CPU otherwise.
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        device = "cpu"
    print(f"Using device: {device}")

    # Initialize detector, captioner, attr matcher, tracker
    detector = Detector(weights=YOLO_WEIGHTS, conf=YOLO_CONF, device=device)
    captioner = BLIPCaptioner(model_name=BLIP_MODEL, device=device)
    attr_matcher = CLIPAttributeMatcher(device=device)
    tracker = SimpleTracker(iou_threshold=IOU_MATCH_THRESHOLD, max_lost=30)

    frame_files = list_frame_files(INPUT_FRAMES_FOLDER)
    if FRAME_LIMIT:
        frame_files = frame_files[:FRAME_LIMIT]
    if not frame_files:
        print("No frames found in", INPUT_FRAMES_FOLDER)
        return

    total_frames = len(frame_files)
    print(f"Found {total_frames} frames, processing...")

    track_records = {}

    for frame_idx, frame_path in enumerate(frame_files, start=1):
        # Open inside a context manager so the file handle is released
        # deterministically; the converted copy lives fully in memory.
        # An unreadable frame is skipped instead of aborting the whole run
        # (records are only written after the loop).
        try:
            with Image.open(frame_path) as frame_file:
                pil = frame_file.convert("RGB")
        except OSError as e:
            print(f"Skipping unreadable frame {frame_path}:", e)
            continue
        img_np = np.array(pil)

        # --- Detection ---
        dets = detector.detect(img_np)
        person_dets_xyxy = _filter_person_boxes(dets, MIN_BOX_AREA)

        assignments = tracker.update(person_dets_xyxy, frame_idx)
        timestamp = parse_timestamp_from_filename(frame_path)
        frame_base = os.path.splitext(os.path.basename(frame_path))[0]

        for track_id, box in assignments:
            # Keep the crop inside the frame and drop boxes with no area, so
            # crop/resize below never operate on an empty region.
            clipped = _clip_box_to_image(box, pil.width, pil.height)
            if clipped is None:
                print(f"Skipping empty box for track {track_id} in frame {frame_base}")
                continue
            x1, y1, x2, y2 = clipped
            crop = pil.crop(clipped)
            bbox = {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
            fname_ts = timestamp.replace(":", "").replace("-", "")

            record = track_records.get(track_id)
            if record is None:
                # New track: describe it once. The resized copy is only
                # needed here, so it is not built for already-known tracks.
                crop_for_blip = crop.resize((384, 384), Image.LANCZOS)
                try:
                    caption = captioner.caption(crop_for_blip)
                except Exception as e:
                    caption = ""
                    print("Caption error:", e)
                try:
                    attributes = attr_matcher.get_attributes(crop_for_blip, top_k=ATTR_TOP_K)
                except Exception as e:
                    attributes = {}
                    print("Attribute matcher error:", e)

                first_crop_fname = f"cam{CAMERA_ID}_trk{track_id}_{frame_base}_{fname_ts}_first.jpg"
                first_crop_path = os.path.join(OUTPUT_CROPS_FOLDER, first_crop_fname)
                _save_crop_jpeg(crop, first_crop_path, "first")

                track_records[track_id] = {
                    "track_id": track_id,
                    "timestamp_start": timestamp,
                    "timestamp_end": timestamp,
                    "first_crop_path": first_crop_path,
                    "last_crop_path": first_crop_path,
                    "description": caption,
                    "attributes": attributes,
                    "bbox_start": dict(bbox),
                    "bbox_end": dict(bbox),
                }
            else:
                # Known track: every later frame writes its own "_last" crop
                # file (the name includes the frame's base name); only the
                # most recent path is kept in the record.
                last_crop_fname = f"cam{CAMERA_ID}_trk{track_id}_{frame_base}_{fname_ts}_last.jpg"
                last_crop_path = os.path.join(OUTPUT_CROPS_FOLDER, last_crop_fname)
                _save_crop_jpeg(crop, last_crop_path, "last")

                record["timestamp_end"] = timestamp
                record["last_crop_path"] = last_crop_path
                record["bbox_end"] = bbox

        if frame_idx % 50 == 0:
            print(f"Processed {frame_idx}/{total_frames} frames")

    # Save records
    for track_id, rec in track_records.items():
        record_fname = f"record_cam{CAMERA_ID}_trk{track_id}.json"
        record_path = os.path.join(OUTPUT_RECORDS_FOLDER, record_fname)
        try:
            with open(record_path, "w", encoding="utf-8") as fh:
                json.dump(rec, fh, ensure_ascii=False, indent=2)
        except Exception as e:
            print("Failed to write record:", e)

    print("Done. Crops saved to:", OUTPUT_CROPS_FOLDER)
    print("Records saved to:", OUTPUT_RECORDS_FOLDER)


# Entry point: run the full pipeline when this code executes as the main
# module (inside a notebook kernel ``__name__`` is "__main__", so running the
# cell starts the pipeline).
if __name__ == "__main__":
    run_pipeline_single_record()


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cuda
Loading CLIP model for attributes...
Found 1000 frames, processing...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processed 50/1000 frames
Processed 100/1000 frames
Processed 150/1000 frames
Processed 200/1000 frames
Processed 250/1000 frames
Processed 300/1000 frames
Processed 350/1000 frames
Processed 400/1000 frames
Processed 450/1000 frames
Processed 500/1000 frames
Processed 550/1000 frames
Processed 600/1000 frames
Processed 650/1000 frames
Processed 700/1000 frames
Processed 750/1000 frames
Processed 800/1000 frames
Processed 850/1000 frames
Processed 900/1000 frames
Processed 950/1000 frames
Processed 1000/1000 frames
Done. Crops saved to: /home/gotoxico/BD2-Trabalho/projetoBD2/CroppedPersons
Records saved to: /home/gotoxico/BD2-Trabalho/projetoBD2/Records
