# Packages

In [2]:
from ultralytics import YOLO
import cv2
import json
from pathlib import Path
import random
from datetime import datetime, timedelta
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import torch
import faiss
from sentence_transformers import SentenceTransformer
from collections import Counter




# Visual Inspector

In [30]:
#path configurations
INPUT_DIR = Path("images/")
OUTPUT_DIR = Path("outputs/")
CROPS_DIR = OUTPUT_DIR / "crops"


OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CROPS_DIR.mkdir(parents=True, exist_ok=True)

LOCATIONS = ["zoo", "street", "restaurant", "river", "beach"]

def detect_objects():
    #load model
    model = YOLO("yolov8n-seg.pt")
    results = []
    
    threshold = float(input("\nSet minimum confidence threshold: "))
    
    #allow for user input to be either a percentage or a decimal
    if threshold > 1.0:
        threshold = threshold / 100.0
    
    classes_input = input("Enter target classes separated by commas (or leave blank for all): ")
    target_classes = [c.strip() for c in classes_input.split(",")] if classes_input != "" else []
    
    
    for idx, img_path in enumerate(INPUT_DIR.glob("*.*")):
        #yolo model inference
        outputs = model.predict(
            source=str(img_path),
            conf=float(threshold),
            save=True,
            save_txt=False,
            project=str(OUTPUT_DIR),
            name="annotated",
            classes=[k for k, v in model.names.items() if v in target_classes] if target_classes else None
        )
        
        img = cv2.imread(str(img_path))
    
    
        #random timestamp within the last 7 days (for the agentic section)
        now = datetime.now()            
        random_seconds = random.randint(0, 7 * 24 * 60 * 60)
        random_timestamp = now - timedelta(seconds=random_seconds)
        
        for output in outputs:
            if output.masks is not None:
                masks = output.masks.data.cpu().numpy()
            else:
                masks = []
    
    
            for i, box in enumerate(output.boxes):
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls_id = int(box.cls[0])
                cls_name = model.names[cls_id]
                conf = float(box.conf[0])
                
                #save crop
                crop_dir = CROPS_DIR / cls_name
                crop_dir.mkdir(parents=True, exist_ok=True)
                crop_img = img[y1:y2, x1:x2]
                crop_fname = f"{img_path.stem}_{cls_name}_{x1}_{y1}_{x2}_{y2}.jpg"
                cv2.imwrite(str(crop_dir / crop_fname), crop_img)
    
    
                if len(masks) > i:
                    mask = (masks[i] * 255).astype(np.uint8)  # convert to 0-255
                    mask_path = OUTPUT_DIR / f"{img_path.stem}_{cls_name}_mask.png"
                    cv2.imwrite(str(mask_path), mask)
    
    
                
                results.append({
                    "image": img_path.name,
                    "class": cls_name,
                    "confidence": conf,
                    "bbox": [x1, y1, x2, y2],
    
                    #fake metadata for agentic section
                    "camera": f"CAM{idx}", 
                    "timestamp": str(random_timestamp),
                    "location": LOCATIONS[idx],
                })
    
    
    with open(OUTPUT_DIR / "detections.json", "w") as f:
        json.dump(results, f, indent=2)
detect_objects()
detect_objects()


Set minimum confidence threshold:  0.4
Enter target classes separated by commas (or leave blank for all):  elephant, car, person



image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356125.jpg: 480x640 2 persons, 1 elephant, 21.8ms
Speed: 2.7ms preprocess, 21.8ms inference, 4.3ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1moutputs\annotated[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356169.jpg: 480x640 3 cars, 17.8ms
Speed: 1.8ms preprocess, 17.8ms inference, 4.4ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1moutputs\annotated2[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356248.jpg: 640x480 4 persons, 22.0ms
Speed: 1.7ms preprocess, 22.0ms inference, 6.2ms postprocess per image at shape (1, 3, 640, 480)
Results saved to [1moutputs\annotated3[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356261.jpg: 480x640 (no detections), 16.5ms
Speed: 1.9ms preprocess, 16.5ms inference, 3.4ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1moutputs\annotated4[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\ima


Set minimum confidence threshold:  0.4
Enter target classes separated by commas (or leave blank for all):  



image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356125.jpg: 480x640 2 persons, 1 elephant, 26.6ms
Speed: 2.4ms preprocess, 26.6ms inference, 4.2ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1moutputs\annotated6[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356169.jpg: 480x640 3 cars, 27.3ms
Speed: 63.5ms preprocess, 27.3ms inference, 4.4ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1moutputs\annotated7[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356248.jpg: 640x480 4 persons, 1 bottle, 2 cups, 1 chair, 1 potted plant, 25.8ms
Speed: 1.3ms preprocess, 25.8ms inference, 3.7ms postprocess per image at shape (1, 3, 640, 480)
Results saved to [1moutputs\annotated8[0m

image 1/1 C:\Users\Aes\Desktop\Telit_test\images\000000356261.jpg: 480x640 1 horse, 14.4ms
Speed: 1.2ms preprocess, 14.4ms inference, 3.9ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1moutputs\annotated9[0m

image 1/

# Agentic AI

In [26]:
YOLO_JSON_PATH = Path("outputs/detections.json")
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
INTENT_LABELS = ["fetch_latest_detections", "summarize_events", "report_statistics"]


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()

CAMERAS = [f"CAM{i}" for i in range(1, 6)]
LOCATIONS = ["zoo", "street", "restaurant", "river", "beach"]

#extract parameters from user request
def extract_parameters(text):
    params = {"camera": None, "date": None, "location": None}

    for cam in CAMERAS:
        if cam.lower() in text.lower():
            params["camera"] = cam

    for loc in LOCATIONS:
        if loc.lower() in text.lower():
            params["location"] = loc

    for word in text.split():
        try:
            datetime.strptime(word, "%Y-%m-%d")
            params["date"] = word
        except ValueError:
            pass

    return params

#classify intents into "fetch_latest_detections", "summarize_events", or "report_statistics"
def classify_request(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        intent_id = torch.argmax(logits, dim=-1).item()
        intent = INTENT_LABELS[intent_id % len(INTENT_LABELS)]
    return intent

#filter documents based on user request
def filter_documents(documents, params):
    filtered = documents
    if params.get("date"):
        filtered = [d for d in filtered if d["date"].startswith(params["date"])]
    if params.get("camera"):
        filtered = [d for d in filtered if str(d["camera"]) == params["camera"]]
    if params.get("location"):
        filtered = [d for d in filtered if params["location"].lower() in d["location"].lower()]
    return filtered


def fetch_latest_detections(params):
    if not YOLO_JSON_PATH.exists():
        return "No detection data found."

    with open(YOLO_JSON_PATH) as f:
        detections = json.load(f)

    filtered = filter_documents(detections, params)
    return filtered

def summarize_events(params):
    with open(YOLO_JSON_PATH) as f:
        detections = json.load(f)

    filtered = filter_documents(detections, params)    
    return f"Summary for {params or 'all data'}: {len(filtered)} notable detections."


def report_statistics(params):
    with open(YOLO_JSON_PATH) as f:
        detections = json.load(f)
        
    filtered = filter_documents(detections, params)

    total_detections = len(filtered)
    unique_classes = len(set(d["class"] for d in filtered))
    return f"Statistics for {params or 'all data'}: {total_detections} detections, {unique_classes} unique classes."


#handle request, then run the appropriate actions
def handle_request(user_input):

    #the embedding model has trouble classifying requests as "report_statistics", so it is handled separately here. 
    if any(word in user_input.lower() for word in ["stats", "statistics", "report"]):
        intent = "report_statistics"
    else:
        intent = classify_request(user_input)
    params = extract_parameters(user_input)
    

    #multi-part request handling
    if " and " in user_input.lower():
        parts = [p.strip() for p in user_input.split(" and ")]
        responses = []
        for p in parts:
            sub_intent = classify_request(p)
            sub_params = extract_parameters(p)
            responses.append(run_action(sub_intent, sub_params))
        return responses
    else:
        return run_action(intent, params)

#run the action based on the intent, return the appropriate result
def run_action(intent, params):
    try:
        with open("requests.json", "r") as f:
            history = json.load(f)
    except FileNotFoundError:
        history = []

    

    if intent == "fetch_latest_detections":
        response =  fetch_latest_detections(params)
    elif intent == "summarize_events":
        response = summarize_events(params)
    elif intent == "report_statistics":
        response =  report_statistics(params)
    else:
        response = "Sorry, I didn't understand your request."

    entry = {
        "user": {
            "request type": intent,
            "parameters": params,
        },
        "Assistant": response
    }

    history.append(entry)
    with open("requests.json", "w") as f:
        json.dump(history, f, indent=2)
    return response

response = handle_request("Fetch latest detections from CAM1 and summarize events from the zoo.")
print(f"Assistant: {response}")

print("\n")
response = handle_request("Report statistics for CAM4.")
print(f"Assistant: {response}")

Assistant: [[{'image': '000000356169.jpg', 'class': 'car', 'confidence': 0.5915552377700806, 'bbox': [365, 323, 381, 336], 'camera': 'CAM1', 'timestamp': '2025-08-12 05:23:30.777668', 'location': 'street'}, {'image': '000000356169.jpg', 'class': 'car', 'confidence': 0.4703880548477173, 'bbox': [332, 323, 357, 337], 'camera': 'CAM1', 'timestamp': '2025-08-12 05:23:30.777668', 'location': 'street'}, {'image': '000000356169.jpg', 'class': 'car', 'confidence': 0.45832645893096924, 'bbox': [0, 295, 56, 357], 'camera': 'CAM1', 'timestamp': '2025-08-12 05:23:30.777668', 'location': 'street'}], "Summary for {'camera': None, 'date': None, 'location': 'zoo'}: 3 notable detections."]


Assistant: Statistics for {'camera': 'CAM4', 'date': None, 'location': None}: 1 detections, 1 unique classes.


# YOLO + Agent Pipeline

In [28]:
VIDEO_PATH = "20250815_131659.mp4" 
OUTPUT_JSON = "video_detections.json"
CONF_THRES = 0.3
MODEL_PATH = "yolov8n-seg.pt"
CLASSES = ["elephant", "person", "car", "horse", "bottle", "cup", "chair", "potted plant"] #these classes can be found in the dataset
LOCATIONS = ["zoo", "street", "restaurant", "river", "beach", "outside"]


#run YOLO on video and save detections to JSON
def detect_objects():
    model = YOLO(MODEL_PATH)
    cap = cv2.VideoCapture(VIDEO_PATH)
    frame_idx = 0
    results = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        output = model.predict(source=frame, conf=CONF_THRES, verbose=False)

        for o in output:
            for i, box in enumerate(o.boxes):
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls_id = int(box.cls[0])
                cls_name = model.names[cls_id]
                conf = float(box.conf[0])


                mask_points = None
                if o.masks is not None:
                    mask_points = o.masks.xy[i].tolist()  # polygon points

                #fake metadata
                start = datetime.now()
                curr_time = start + timedelta(seconds=frame_idx / 60) #assuming 60 FPS video


                results.append({
                    "frame": frame_idx,
                    "class": cls_name,
                    "confidence": conf,
                    "bbox": [x1, y1, x2, y2],
                    "timestamp": str(curr_time),
                    "camera": 5,
                    "location": "outside"
                })

        frame_idx += 1

    cap.release()

    with open(OUTPUT_JSON, "w") as f:
        json.dump(results, f, indent=2)

    print(f"[INFO] Saved detections to {OUTPUT_JSON}")


def add_embeddings(documents, embed_model):
    for doc in documents:
        doc["embedding"] = embed_model.encode(doc["text"], convert_to_numpy=True)

def build_documents(detections, embed_model):
    docs = []
    for d in detections:
        text = f"Camera {d['camera']} detected a {d['class']} with confidence {d['confidence']:.2f} at {d['timestamp']} in {d['location']}."
        docs.append({
            "text": text,
            "metadata": d
        })
    add_embeddings(docs, embed_model)
    return docs

def query_rag(question, documents, embed_model, k=5):
    #extract filters
    date, camera, location, obj_class = None, None, None, None

    #date
    for word in question.split():
        try:
            date = datetime.strptime(word, "%Y-%m-%d").date()
            break
        except ValueError:
            continue

    #camera
    cam_match = re.search(r'CAM(\d+)', question, re.IGNORECASE)
    if cam_match:
        camera = int(cam_match.group(1))

    #location
    for loc in LOCATIONS:
        if loc.lower() in question.lower():
            location = loc
            break

    #object class
    for c in CLASSES:
        if c.lower() in question.lower():
            obj_class = c
            break

    #filter documents
    filtered_docs = documents
    if date:
        filtered_docs = [d for d in filtered_docs
                         if datetime.fromisoformat(d["metadata"]["timestamp"]).date() == date]
    if camera:
        filtered_docs = [d for d in filtered_docs if d["metadata"]["camera"] == camera]
    if location:
        filtered_docs = [d for d in filtered_docs if d["metadata"]["location"].lower() == location.lower()]
    if obj_class:
        filtered_docs = [d for d in filtered_docs if d["metadata"]["class"].lower() == obj_class.lower()]

    if not filtered_docs:
        return "No detections found for your query."

    #FAISS search
    emb_matrix = np.stack([d["embedding"] for d in filtered_docs])
    q_emb = embed_model.encode(question, convert_to_numpy=True)
    index = faiss.IndexFlatL2(emb_matrix.shape[1])
    index.add(emb_matrix)
    distances, indices = index.search(np.array([q_emb]), k)

    retrieved_docs = [filtered_docs[i]["text"] for i in indices[0] if i < len(filtered_docs)]
    return retrieved_docs[0]  #return top result


detect_objects()

with open(OUTPUT_JSON) as f:
    detections = json.load(f)

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
documents = build_documents(detections, embed_model)
print(query_rag("Which camera saw the most persons yesterday?", documents, embed_model))
print(query_rag("List timestamps where a car was detected.", documents, embed_model))

[INFO] Saved detections to video_detections.json
Camera 5 detected a person with confidence 0.43 at 2025-08-15 17:28:04.003175 in outside.
Camera 5 detected a car with confidence 0.89 at 2025-08-15 17:27:58.185019 in outside.
