## Actual run that I found the best solution

Using:

- cv2 to capture the image
- Yolo to detect the person - include this in litreature review
- https://docs.ultralytics.com/models/yolov8/ 


In [None]:
# Initial things to use the drone - new features
# Tello does not work with out model 

from djitellopy import tello
from time import sleep
import cv2


me = tello.Tello()
# To connect 
me.connect()

# # To start 
me.streamon()

# # To takeoff
# me.takeoff()

# Open a window to display the video feed
while True:
    # Get the video frame
    frame = me.get_frame_read().frame

    # Resize the frame for better display (optional)
    frame = cv2.resize(frame, (640, 480))
    # frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)   
    # img = cv2.cvtColor(framw, cv2.COLOR_BGR2RGB)   


    # Show the frame
    cv2.imshow("Tello Camera", frame)

    

    cv2.waitKey(1)

    me.send_rc_control(10, 0, 0, 5)

    # me.land()

    

# # To control movement
# # me.send_rc_control(left/right, forward/backward, up/down, yaw_velocity)

# # To stop - for 5 seconds
# me.sleep(60)

# # To land
# me.land()

In [None]:
# This is more accurate - it tracks every part of a person

import cv2
from ultralytics import YOLO
import math
import logging

# These values need to be fine tuned
K_x = 0.05 # Left/Right movement scale
K_y = 0.1  # Forward/Backward movement scale
K_z = 0.02 # Up/Down movement scale

# Load YOLOv8 model
# Smallest YOLOv8 model
# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
model = YOLO("yolov8n.pt", verbose = False)  

# Read the drone's initial position
drone_x, drone_y, drone_z = 0, 0, 2 

last_position = None
last_area = None
pixel_threshold = 100

# Open webcam
cap = cv2.VideoCapture(0) 

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()
    # Define screen center
    frame_center_x = frame.shape[1] // 2  # Middle of frame
    frame_center_y = frame.shape[0] // 2 

    if not success:
        break

    # Inference on the frame 
    results = model(frame) 

    # If a person was detected
    # Lists to hold the person centre and area
    myPersonList = []
    myPersonListArea = []

    for r in results:
        for box in r.boxes:
            # Get class ID
            cls = int(box.cls[0]) 
            # Confidence score 
            conf = box.conf[0].item()  
            # Class 0 = "person", confidence > 80%
            if cls == 0 and conf > 0.8:  
                # Get bounding box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0]) 
                # Center of bounding box
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2  
                bbox_height = y2 - y1
                # Calculate area
                area = (x2 - x1) * bbox_height
                
                # Depending on the size of the bounding box the drone will move
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2) 
                # This is what the drone will follow
                # Draw center point
                cv2.circle(frame, (cx, cy), 5, (0, 255, 0), cv2.FILLED) 
                # Append the area and the center of the circle
                myPersonList.append((cx, cy))
                myPersonListArea.append(area)

    # Select the largest detected person
    if myPersonList:
        i = myPersonListArea.index(max(myPersonListArea))
        person_x, person_y = myPersonList[i]        
        current_area = myPersonListArea[i]


        # To avoid having constant outputs for example when the person moves only a bit 
        if last_position is not None and last_area is not None and current_area > 0:
            # Calculate distance moved in pixels
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            # Calculate the distance moved
            # Euclidean distance 
            distance_moved = math.sqrt(dx**2 + dy**2)

            # Calculate the change in area
            area_change = abs(current_area - last_area)

            # Check if the person moved more than the threshold to avoid having constant moves
            # This is only done to make the movement smoother - this was being calculated even if I barely move which we do not want 
            # We do not want to have a drone that would crash having constant changes in the position 
            if distance_moved > pixel_threshold:
                # Left/Right Movement (X-axis)
                drone_x += K_x * (person_x - frame_center_x)
                # Forward/Backward Movement (Y-axis)
                drone_y += K_y * ((last_area / current_area) - 1)
                # Up/Down Movement (Z-axis)
                drone_z += K_z * (frame_center_y - person_y)
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")
                print(f"New Drone Position: X={drone_x:.2f}, Y={drone_y:.2f}, Z={drone_z:.2f}")
                # Left/Right Movement
                # if person_x < frame_center_x - 50:
                #     direction = "left"
                #     print("Move Left")
                #     # Send command to drone: move left
                # elif person_x > frame_center_x + 50:
                #     direction = "right"  
                #     print("Move Right")
                #     # Send command to drone: move right

                # # Forward/Backward Movement
                # if myPersonListArea[i] < 5000:  # Adjust based on detection area
                #     direction = "forward"
                #     print("Move Forward")
                #     # Send command to drone: move forward
                # elif myPersonListArea[i] > 15000:
                #     direction = "backward"
                #     print("Move Backward")
                #     # Send command to drone: move backward


                # Update last position
                last_position = (person_x, person_y)
                last_area = current_area
        # Track the initial position of the person - assuming this is the space you want to have between the person and the drone 
        else:
            print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
            # First detection, initialize last position
            last_position = (person_x, person_y)
            last_area = current_area
                


    cv2.imshow("Person Detection (YOLOv8)", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  # Press 'q' to exit

cap.release()
cv2.destroyAllWindows()


PROBLEMS :
- If there are multiple people it might change the person it is tracking (Maybe for now I will test it with one person)
- It is difficult to find how much the drone should move and I am not sure whether I am doing it well


Considerations -- things to check 
- Latency: Ensure minimal delay between detecting the red spot and sending commands.
- Safety: Test in a controlled environment to ensure predictable movements.
- Camera Feed Access: If using the drone’s camera feed, ensure you can stream it to your processing device.


# Following a Person

# Deep Sort


In [None]:
# This is more accurate - it tracks every part of a person

import cv2
from ultralytics import YOLO
import math
import logging

# Load YOLOv8 model
# Smallest YOLOv8 model
# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
model = YOLO("yolov8n.pt", verbose = False)  

last_position = None
last_area = None
pixel_threshold = 100

# Open webcam
cap = cv2.VideoCapture(0) 

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()

    if not success:
        break

    # Inference on the frame 
    results = model(frame) 

    # If a person was detected
    # Lists to hold the person centre and area
    myPersonList = []
    myPersonListArea = []

    for r in results:
        for box in r.boxes:
            # Get class ID
            cls = int(box.cls[0]) 
            # Confidence score 
            conf = box.conf[0].item()  
            # Class 0 = "person", confidence > 80%
            if cls == 0 and conf > 0.8:  
                # Get bounding box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0]) 
                # Center of bounding box
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2  
                bbox_height = y2 - y1
                # Calculate area
                area = (x2 - x1) * bbox_height
                
                # Depending on the size of the bounding box the drone will move
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2) 
                # This is what the drone will follow
                # Draw center point
                cv2.circle(frame, (cx, cy), 5, (0, 255, 0), cv2.FILLED) 
                # Append the area and the center of the circle
                myPersonList.append((cx, cy))
                myPersonListArea.append(area)

    # Select the largest detected person
    if myPersonList:
        i = myPersonListArea.index(max(myPersonListArea))
        person_x, person_y = myPersonList[i]        
        current_area = myPersonListArea[i]


        # To avoid having constant outputs for example when the person moves only a bit 
        if last_position is not None and last_area is not None and current_area > 0:
            # Calculate distance moved in pixels
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            # Calculate the distance moved
            # Euclidean distance 
            distance_moved = math.sqrt(dx**2 + dy**2)


            # Check if the person moved more than the threshold to avoid having constant moves
            # This is only done to make the movement smoother - this was being calculated even if I barely move which we do not want 
            # We do not want to have a drone that would crash having constant changes in the position 
            if distance_moved > pixel_threshold:
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")

                # Update last position
                last_position = (person_x, person_y)
                last_area = current_area
        # Track the initial position of the person - assuming this is the space you want to have between the person and the drone 
        else:
            print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
            # First detection, initialize last position
            last_position = (person_x, person_y)
            last_area = current_area
                


    cv2.imshow("Person Detection (YOLOv8)", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  # Press 'q' to exit

cap.release()
cv2.destroyAllWindows()


In [None]:
import cv2
import math
import logging
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort


# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
# Load YOLOv8 model (Nano version)
model = YOLO("yolov8n.pt", verbose=False)

# Initialize Deep SORT Tracker 
tracker = DeepSort(max_age=10, n_init=3)

# Open webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# Tracking history
last_position = None
last_area = None
pixel_threshold = 100

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Run YOLOv8 inference
    # Inference on the frame 
    results = model(frame)

    # Prepare Deep Sort input format
    detections = []  
    # If a person was detected
    # Lists to hold the person centre and area
    myPersonList = []
    myPersonListArea = []

    for r in results:
        for box in r.boxes:
            # Get class ID and confidence score
            cls = int(box.cls[0])  
            conf = box.conf[0].item() 

            # Detect only people with confidence score of 0.8 or more
            if cls == 0 and conf > 0.8:  # Detect only people
                # Get bounding box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0]) 
                width = x2 - x1 
                height = y2 - y1
                # Calculate area
                area = width * height  

                detections.append(([x1, y1, width, height], conf, cls))

    # Update Deep SORT Tracker
    tracked_objects = tracker.update_tracks(detections, frame=frame)

    for track in tracked_objects:
        if not track.is_confirmed():
            # Ignore unconfirmed tracks
            continue  

        track_id = track.track_id  
        x1, y1, x2, y2 = map(int, track.to_ltrb())  
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2  

        # Draw bounding box and ID
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.circle(frame, (cx, cy), 5, (0, 0, 255), cv2.FILLED)
        cv2.putText(frame, f"ID: {track_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        
        # Append to lists
        # Append the area and the center of the circle
        myPersonList.append((cx, cy))
        myPersonListArea.append(area)

    # Select the largest detected person
    if myPersonList:
        i = myPersonListArea.index(max(myPersonListArea))
        person_x, person_y = myPersonList[i]        
        current_area = myPersonListArea[i]

        if last_position is not None and last_area is not None and current_area > 0:
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            distance_moved = math.sqrt(dx**2 + dy**2)

            if distance_moved > pixel_threshold:
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')

                last_position = (person_x, person_y)
                last_area = current_area
        else:
            print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
            last_position = (person_x, person_y)
            last_area = current_area

    # Show the frame
    cv2.imshow("YOLOv8 + Deep SORT Tracking", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  # Exit on 'q'

cap.release()
cv2.destroyAllWindows()


In [None]:
import cv2
import math
import logging
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np


# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
# Load YOLOv8 model (Nano version)
model = YOLO("yolov8n.pt", verbose=False)

tracker = DeepSort(
    max_age=30,  # Increased from 10 to maintain track through brief occlusions
    n_init=5,    # Need 5 consecutive detections to confirm track
    max_iou_distance=0.4,
    max_cosine_distance=0.3,  # Stricter appearance matching
    embedder_model_name="osnet_x1_0",  # Better ReID model
    embedder = "mobilenet",
    half=True  # Use FP16 for faster inference
)

# Open webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# Tracking history
last_position = None
last_area = None
pixel_threshold = 100

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break
 
    # Sharpen the image to make better performance
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(frame, -1, kernel)
    # Run YOLOv8 inference
    # Inference on the frame
    results = model(sharpened)

    # Prepare Deep Sort input format
    detections = []  
    # If a person was detected
    # Lists to hold the person centre and area
    myPersonList = []
    myPersonListArea = []

    for r in results:
        for box in r.boxes:
            # Get class ID and confidence score
            cls = int(box.cls[0])  
            conf = box.conf[0].item() 

            # Detect only people with confidence score of 0.8 or more
            if cls == 0 and conf > 0.8:  # Detect only people
                # Get bounding box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0]) 
                width = x2 - x1 
                height = y2 - y1
                # Calculate area
                area = width * height  

                detections.append(([x1, y1, width, height], conf, cls, frame[y1:y2, x1:x2]))  # Add cropped image


    # Update Deep SORT Tracker
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    for track in tracked_objects:
        if not track.is_confirmed():
            # Ignore unconfirmed tracks
            continue  

        track_id = track.track_id  
        x1, y1, x2, y2 = map(int, track.to_ltrb())  
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2  

        # Draw bounding box and ID
        cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)
        cv2.putText(sharpened, f"ID: {track_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        
        # Append to lists
        # Append the area and the center of the circle
        myPersonList.append((cx, cy))
        myPersonListArea.append(area)

    # Select the largest detected person
    if myPersonList:
        i = myPersonListArea.index(max(myPersonListArea))
        person_x, person_y = myPersonList[i]        
        current_area = myPersonListArea[i]

        if last_position is not None and last_area is not None and current_area > 0:
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            distance_moved = math.sqrt(dx**2 + dy**2)

            if distance_moved > pixel_threshold:
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')

                last_position = (person_x, person_y)
                last_area = current_area
        else:
            print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
            last_position = (person_x, person_y)
            last_area = current_area

    # Show the frame
    cv2.imshow("YOLOv8 + Deep SORT Tracking", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  # Exit on 'q'

cap.release()
cv2.destroyAllWindows()


## Errors fixed from the version I had deleted

In [1]:
import cv2
import logging
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np
import face_recognition

# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)

# Load YOLOv8 model
model = YOLO("yolov8n.pt", verbose=False)

# Initialize DeepSORT Tracker
tracker = DeepSort(
    max_age=30,  
    n_init=3,    
    max_iou_distance=0.5,
    max_cosine_distance=0.4,  
    embedder_model_name="mobilenetv2",
    half=True,
    embedder_gpu=True
)

# Persistent ID tracking
face_db = {}  # Stores face encodings {unique_id: face_encoding}
person_id_map = {}  # Maps track_id → unique_id
next_person_id = 1

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Sharpen image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(frame, -1, kernel)
    
    # Run YOLOv8 inference
    results = model(sharpened)

    detections = []

    for r in results:
        for box in r.boxes:
            cls = int(box.cls[0])
            conf = box.conf[0].item()
            if cls == 0 and conf > 0.7:  # Only detect people
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                width, height = x2 - x1, y2 - y1
                
                # Extract person crop
                person_crop = sharpened[y1:y2, x1:x2]
                if person_crop.size == 0:
                    continue
                
                detections.append(([x1, y1, width, height], conf, 0, np.zeros((128,))))  # Default 128-dim zero vector

    # Update DeepSORT tracker
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    for track in tracked_objects:
        if not track.is_confirmed():
            continue

        track_id = track.track_id
        x1, y1, x2, y2 = map(int, track.to_ltrb())
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2

        # Extract face from person crop
        person_crop = sharpened[y1:y2, x1:x2]
        face_locations = face_recognition.face_locations(person_crop, model="hog")
        
        assigned_id = None  # Will store the final ID

        if face_locations:
            # Adjust coordinates to global frame
            adjusted_faces = [(y1 + top, x1 + right, y1 + bottom, x1 + left) for (top, right, bottom, left) in face_locations]

            # Extract face encodings
            face_encodings = face_recognition.face_encodings(sharpened, known_face_locations=adjusted_faces)

            if face_encodings:
                face_encoding = face_encodings[0]  # Use the first detected face

                # Check if this face matches a known person
                for person_id, saved_encoding in face_db.items():
                    match = face_recognition.compare_faces([saved_encoding], face_encoding, tolerance=0.4)
                    if match[0]:  # Found a match
                        assigned_id = person_id
                        break

                # If no match, assign a new unique ID
                if assigned_id is None:
                    assigned_id = next_person_id
                    face_db[next_person_id] = face_encoding
                    next_person_id += 1

                # Update the mapping to the current track_id
                person_id_map[track_id] = assigned_id

        # If no face was detected, check if we already assigned an ID to this track
        if assigned_id is None:
            assigned_id = person_id_map.get(track_id, None)

        # If track_id is new and has no face, assign a temporary unique ID
        if assigned_id is None:
            assigned_id = next_person_id
            person_id_map[track_id] = assigned_id
            next_person_id += 1

        # Draw tracking box
        cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(sharpened, f"ID: {assigned_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)

    # Show frame
    cv2.imshow("YOLOv8 + DeepSORT + Face Recognition", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  

cap.release()
cv2.destroyAllWindows()


2025-02-23 14:58:59.441 python[6994:41938] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-23 14:58:59.441 python[6994:41938] +[IMKInputSession subclass]: chose IMKInputSession_Modern


KeyboardInterrupt: 

# Final code for tracking all the persons.

In [1]:
import cv2
import logging
import math
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np
import face_recognition

# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)

# Load YOLOv8 model
model = YOLO("yolov8n.pt", verbose=False)

# Initialize DeepSORT Tracker
tracker = DeepSort(
    max_age=30,  
    n_init=3,    
    max_iou_distance=0.5,
    max_cosine_distance=0.4,  
    embedder_model_name="mobilenetv2",
    half=True,
    embedder_gpu=True
)

# Persistent ID tracking
# Stores face encodings {unique_id: face_encoding}
face_db = {}  
# Maps track_id → unique_id
person_id_map = {}  
# Keeps track of assigned person IDs
used_person_ids = set()  
next_person_id = 1
last_position = None
last_area = None
pixel_threshold = 100

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Sharpen image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(frame, -1, kernel)
    
    # Run YOLOv8 inference
    # Inference on the frame 
    results = model(sharpened)
    # If a person was detected
    # Lists to hold the person centre and area
    myPersonList = []
    myPersonListArea = []
    # List to hold detections
    detections = []

    for r in results:
        for box in r.boxes:
            # Get class ID
            cls = int(box.cls[0])
            # Confidence score 
            conf = box.conf[0].item()
            # Class 0 = "person", confidence > 70%
            if cls == 0 and conf > 0.7:  
                # Get bounding box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0])

                #Calculate the width and the height of the bunding box
                width, height = x2 - x1, y2 - y1
                # Center of bounding box
                cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
                # Calculate area
                area = width * height

                # Append the area and the center of the circle
                myPersonList.append((cx, cy))
                myPersonListArea.append(area)

                # Extract person crop
                person_crop = sharpened[y1:y2, x1:x2]
                if person_crop.size == 0:
                    continue
                
                detections.append(([x1, y1, width, height], conf, 0, np.zeros((128,))))  # Default 128-dim zero vector

    # Update DeepSORT tracker
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    for track in tracked_objects:
        if not track.is_confirmed():
            continue

        track_id = track.track_id
        x1, y1, x2, y2 = map(int, track.to_ltrb())
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2

        # Extract face from person crop
        person_crop = sharpened[y1:y2, x1:x2]
        face_locations = face_recognition.face_locations(person_crop, model="hog")
        
        assigned_id = None  # Will store the final ID

        if face_locations:
            # Adjust coordinates to global frame
            adjusted_faces = [(y1 + top, x1 + right, y1 + bottom, x1 + left) for (top, right, bottom, left) in face_locations]

            # Extract face encodings
            face_encodings = face_recognition.face_encodings(sharpened, known_face_locations=adjusted_faces)

            if face_encodings:
                face_encoding = face_encodings[0]  # Use the first detected face

                # Check if this face matches a known person
                for person_id, saved_encoding in face_db.items():
                    match = face_recognition.compare_faces([saved_encoding], face_encoding, tolerance=0.55)
                    if match[0]:  # Found a match
                        assigned_id = person_id
                        break

                # If no match, assign a new unique ID
                if assigned_id is None:
                    while next_person_id in used_person_ids:  # Ensure unique ID assignment
                        next_person_id += 1

                    assigned_id = next_person_id
                    face_db[next_person_id] = face_encoding
                    used_person_ids.add(next_person_id)
                    next_person_id += 1

        # Ensure a stable mapping for DeepSORT track_id
        if assigned_id is None:
            assigned_id = person_id_map.get(track_id)

        if assigned_id is None:
            # Assign a new unique person ID only if no previous ID exists
            while next_person_id in used_person_ids:
                next_person_id += 1

            assigned_id = next_person_id
            used_person_ids.add(next_person_id)
            next_person_id += 1

        # Store the mapping between DeepSORT track_id and the stable person_id
        person_id_map[track_id] = assigned_id

        # Draw tracking box
        cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(sharpened, f"ID: {assigned_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)

    # Select the largest detected person
    if myPersonList:
        i = myPersonListArea.index(max(myPersonListArea))
        person_x, person_y = myPersonList[i]        
        current_area = myPersonListArea[i]


        # To avoid having constant outputs for example when the person moves only a bit 
        if last_position is not None and last_area is not None and current_area > 0:
            # Calculate distance moved in pixels
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            # Calculate the distance moved
            # Euclidean distance 
            distance_moved = math.sqrt(dx**2 + dy**2)


            # Check if the person moved more than the threshold to avoid having constant moves
            # This is only done to make the movement smoother - this was being calculated even if I barely move which we do not want 
            # We do not want to have a drone that would crash having constant changes in the position 
            if distance_moved > pixel_threshold:
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")

                # Update last position
                last_position = (person_x, person_y)
                last_area = current_area
        # Track the initial position of the person - assuming this is the space you want to have between the person and the drone 
        else:
            print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
            # First detection, initialize last position
            last_position = (person_x, person_y)
            last_area = current_area
                
    # Show frame
    cv2.imshow("YOLOv8 + DeepSORT + Face Recognition", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  

cap.release()
cv2.destroyAllWindows()


Area: 302848, Center: (663, 492)


2025-02-23 14:59:25.473 python[7110:42499] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-23 14:59:25.473 python[7110:42499] +[IMKInputSession subclass]: chose IMKInputSession_Modern


KeyboardInterrupt: 

# Final code for tracking only the persons with id 1 - using face recognition this is very good but when I face backwards it no longer detects me. But this still gives the best results.

In [2]:
import cv2
import logging
import math
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np
import face_recognition

# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)

# Load YOLOv8 model
model = YOLO("yolov8n.pt", verbose=False)

# Initialize DeepSORT Tracker
tracker = DeepSort(
    max_age=30,  
    n_init=3,    
    max_iou_distance=0.5,
    max_cosine_distance=0.4,  
    embedder_model_name="mobilenetv2",
    half=True,
    embedder_gpu=True
)

# Persistent ID tracking
# Stores face encodings {unique_id: face_encoding}
face_db = {} 
# Maps track_id → unique_id 
person_id_map = {}  
# Keeps track of assigned person IDs
used_person_ids = set()  
# Start with id 1
next_person_id = 1

last_position = None
last_area = None
# Movement sensitivity
pixel_threshold = 100  

# Open webcam
cap = cv2.VideoCapture(0)

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Sharpen image
    # Kernel to sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    # Use the kernel to filter the image
    sharpened = cv2.filter2D(frame, -1, kernel)
    
    # Run YOLOv8 inference with the sharpened image
    results = model(sharpened)

    # List to hold detections
    detections = []

    for r in results:
        for box in r.boxes:
            # Get class ID
            cls = int(box.cls[0])  
            # Get the confidence score
            conf = box.conf[0].item()  

            # Class 0 = "person", confidence > 80%
            if cls == 0 and conf > 0.8:  
                # Bounding box
                x1, y1, x2, y2 = map(int, box.xyxy[0])  
                # Calculate the width and the height
                width, height = x2 - x1, y2 - y1
                # Crop the person
                person_crop = sharpened[y1:y2, x1:x2]

                # If the cropped image has a size of 0 than there must have been an error
                if person_crop.size == 0:
                    continue
                
                # Default 128-dim zero vector
                detections.append(([x1, y1, width, height], conf, 0, np.zeros((128,))))  

    # Update DeepSORT tracker - so the tracker will only track the detected people
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    # Track only ID=1 -> variables to hold everything with id=1
    person_x, person_y, current_area = None, None, 0

    # For each tracked object
    for track in tracked_objects:
        # If there is no tracking continue
        if not track.is_confirmed():
            continue

        # Get track ID
        track_id = track.track_id
        # Get bounding box coordinates
        x1, y1, x2, y2 = map(int, track.to_ltrb())
        # Get the centre
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        # Get the area of the bounding box
        area = (x2 - x1) * (y2 - y1)

        # Extract face from person crop
        person_crop = sharpened[y1:y2, x1:x2]
        # Get face encodings -> recognise the face
        face_locations = face_recognition.face_locations(person_crop, model="hog")
        
        # Final ID assignment
        assigned_id = None  

        # If there was face recognitions
        if face_locations:
            # Get adjusted bounding box coordinates for face recognition
            adjusted_faces = [(y1 + top, x1 + right, y1 + bottom, x1 + left) for (top, right, bottom, left) in face_locations]
            # Get face encodings
            face_encodings = face_recognition.face_encodings(sharpened, known_face_locations=adjusted_faces)

            # If a face was detected and recognized
            if face_encodings:
                # Use first detected face
                face_encoding = face_encodings[0]  

                # Check if this face matches a known person
                for person_id, saved_encoding in face_db.items():
                    # Compare face encodings
                    match = face_recognition.compare_faces([saved_encoding], face_encoding, tolerance=0.55)
                    # If match, assign the person ID - take the first match
                    if match[0]:  
                        assigned_id = person_id
                        break

                # If no match, assign a new unique ID
                if assigned_id is None:
                    
                    while next_person_id in used_person_ids:  
                        next_person_id += 1


                    assigned_id = next_person_id
                    face_db[next_person_id] = face_encoding
                    used_person_ids.add(next_person_id)
                    next_person_id += 1

        # Ensure stable ID mapping
        if assigned_id is None:
            # Get the person ID
            assigned_id = person_id_map.get(track_id)
            # While the next ID is used
            while next_person_id in used_person_ids:
                # Increment the ID -> we are trying to find a new ID to not have multiple people with the same ID
                next_person_id += 1

            # The assigned ID becomes ecqual to the next person ID
            # Since this is the first free ID
            assigned_id = next_person_id
            # Add this to the set of used IDs
            used_person_ids.add(next_person_id)
            next_person_id += 1

        # Store mapping
        person_id_map[track_id] = assigned_id

        # Only track and draw for ID = 1
        if assigned_id == 1:
            # Get the x and y  co-ordinates, and the area of the person
            person_x, person_y, current_area = cx, cy, area

            # Draw tracking box only for ID = 1
            cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(sharpened, f"ID: {assigned_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)

    # Process movement of ID=1 - if these are not None then a person with ID=1 is found
    if person_x is not None and person_y is not None:
        # If a person with ID=1 had already been found before
        if last_position is not None and last_area is not None and current_area > 0:
            # Calculate distance moved in pixels
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            # Use the Eulidean distance
            distance_moved = math.sqrt(dx**2 + dy**2)

            # If the distance moved is greater than the pixel threshold
            if distance_moved > pixel_threshold:
                # Print the movement details
                print(f'ID={assigned_id} -> Area: {current_area}, Center: ({person_x}, {person_y})')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")

                # Update the position and the area
                last_position = (person_x, person_y)
                last_area = current_area
        # If a person had never been found before
        else:
            # Print the new area and new center
            print(f'ID={assigned_id} -> Area: {current_area}, Center: ({person_x}, {person_y})')
            # Assign the new position and the new area that has been found to be found again when the person moves
            last_position = (person_x, person_y)
            last_area = current_area
                
    # Show frame
    cv2.imshow("YOLOv8 + DeepSORT + Face Recognition", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  

cap.release()
cv2.destroyAllWindows()


ID=1 -> Area: 308493, Center: (663, 487)


KeyboardInterrupt: 

# Best model using face recognition

Note if you see more than one box on each other if the centre has moved more than the threshold the newly creared box is taken which is what we want and the other box dies after 3 seconds.  If they have the same centre nothing changes in the output (i.e. no distance has been moved)

In [1]:
# Import all the necessary libraries
import cv2
import logging
import math
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np
import face_recognition

# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)

# Load YOLOv8 model
model = YOLO("yolov8n.pt", verbose=False)

# Initialize DeepSORT Tracker
tracker = DeepSort(
    # The ammount of time an ID survives in memory without being removed
    # If this is lost for 10 consecutive frames than it is lost - this won't be a problem since we are relying on face recognition
    max_age=10,  
    # The ammount of times the processer checks if it is the same ID
    n_init=3,    
    # To associate detections - maximum Intersection over Union
    # Lower values make tracking stricter, requiring detections to be very close to previous locations.
    max_iou_distance=0.5,
    # The confidence it has to say that they are the same person
    # The maximum cosine distance between feature embeddings to consider two detections as the same object.
    max_cosine_distance=0.4,  
    # Specifies the model used for feature extraction
    embedder_model_name="mobilenetv2",
    # Enables half-precision floating point (FP16) computations.
    half=True,
    # Turn on the GPU acceleration for feature extract
    embedder_gpu=True
)

# Persistent ID tracking
# Stores face encodings {unique_id: face_encoding}
face_db = {} 
# Maps track_id → unique_id 
person_id_map = {}  
# Keeps track of assigned person IDs
used_person_ids = set()  
# Start with id 1
next_person_id = 1

last_position = None
last_area = None
# Movement sensitivity
pixel_threshold = 100  

# Open webcam
cap = cv2.VideoCapture(0)

# While the webcam is open
while cap.isOpened():
    # Read the next frame from the webcam
    success, frame = cap.read()
    if not success:
        break

    # Sharpen image
    # Kernel to sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    # Use the kernel to filter the image
    sharpened = cv2.filter2D(frame, -1, kernel)
    
    # Run YOLOv8 inference with the sharpened image
    results = model(sharpened)

    # List to hold detections
    detections = []

    # For each detection in the results
    for r in results:
        # For each bounding box in the detection
        for box in r.boxes:
            # Get class ID
            cls = int(box.cls[0])  
            # Get the confidence score
            conf = box.conf[0].item()  

            # Class 0 = "person", confidence > 80%
            if cls == 0 and conf > 0.8:  
                # Bounding box
                x1, y1, x2, y2 = map(int, box.xyxy[0])  
                # Calculate the width and the height
                width, height = x2 - x1, y2 - y1
                # Crop the person
                person_crop = sharpened[y1:y2, x1:x2]

                # If the cropped image has a size of 0 than there must have been an error
                if person_crop.size == 0:
                    continue
                
                # Default 128-dim zero vector
                detections.append(([x1, y1, width, height], conf, 0, np.zeros((128,))))  

    # Update DeepSORT tracker - so the tracker will only track the detected people
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    # Track only ID=1 -> variables to hold everything with id=1
    person_x, person_y, current_area = None, None, 0

    # For each tracked object
    for track in tracked_objects:
        # If there is no tracking continue
        if not track.is_confirmed():
            continue

        # If there was a tracking
        # Get track ID
        track_id = track.track_id
        # Get bounding box coordinates
        x1, y1, x2, y2 = map(int, track.to_ltrb())
        # Get the centre
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        # Get the area of the bounding box
        area = (x2 - x1) * (y2 - y1)

        # Extract face from person crop
        person_crop = sharpened[y1:y2, x1:x2]
        # Get face encodings -> recognise the face
        face_locations = face_recognition.face_locations(person_crop, model="hog")
        
        # Final ID assignment
        assigned_id = None  

        # If there was face recognitions
        if face_locations:
            # Get adjusted bounding box coordinates for face recognition
            adjusted_faces = [(y1 + top, x1 + right, y1 + bottom, x1 + left) for (top, right, bottom, left) in face_locations]
            # Get face encodings
            face_encodings = face_recognition.face_encodings(sharpened, known_face_locations=adjusted_faces)

            # If a face was detected and recognized
            if face_encodings:
                # Use first detected face
                face_encoding = face_encodings[0]  

                # Check if this face matches a known person
                for person_id, saved_encoding in face_db.items():
                    # Compare face encodings
                    match = face_recognition.compare_faces([saved_encoding], face_encoding, tolerance=0.55)
                    # If match, assign the person ID - take the first match
                    if match[0]:  
                        assigned_id = person_id
                        break

                # If no match, assign a new unique ID
                if assigned_id is None:
                    
                    # While the next person is in used ids
                    while next_person_id in used_person_ids:  
                        # Increment to find an unused ID
                        next_person_id += 1


                    # Store mapping - saved the ID that we will be using
                    assigned_id = next_person_id
                    # Store face encoding in a dictionary
                    face_db[next_person_id] = face_encoding
                    # Add this to the set of used IDs
                    used_person_ids.add(next_person_id)
                    # Increment the ID that should be used next
                    next_person_id += 1

        # Ensure stable ID mapping
        if assigned_id is None:
            # Get the person ID
            assigned_id = person_id_map.get(track_id)
            # While the next ID is used
            while next_person_id in used_person_ids:
                # Increment the ID -> we are trying to find a new ID to not have multiple people with the same ID
                next_person_id += 1

            # The assigned ID becomes ecqual to the person ID
            # Since this is the first free ID
            assigned_id = next_person_id
            # Add this to the set of used IDs
            used_person_ids.add(next_person_id)
            # Increment the ID that should be used next
            next_person_id += 1

        # Store mapping
        person_id_map[track_id] = assigned_id

        # Only track and draw for ID = 1
        if assigned_id == 1:
            # Get the x and y  co-ordinates, and the area of the person
            person_x, person_y, current_area = cx, cy, area

            # Draw tracking box only for ID = 1
            cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(sharpened, f"ID: {assigned_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)

            # If a person had been found before - in this case with ID = 1, the first person ever seen 
            if last_position is not None and last_area is not None and current_area > 0:
                # Calculate distance moved in pixels
                dx = person_x - last_position[0]
                dy = person_y - last_position[1]
                # Use the Eulidean distance
                distance_moved = math.sqrt(dx**2 + dy**2)

                # If the distance moved is greater than the pixel threshold
                if distance_moved > pixel_threshold:
                    # Print the movement details, the new area, the ID to ensure that it has ID=1 and the center
                    print(f'ID={assigned_id} -> Area: {current_area}, Center: ({person_x}, {person_y})')
                    print(f"Person moved {distance_moved:.2f} pixels, updating movement.")

                    # Update the position and the area
                    last_position = (person_x, person_y)
                    last_area = current_area
            # If a person had never been found before
            else:
                # Print the new area and new center
                print(f'ID={assigned_id} -> Area: {current_area}, Center: ({person_x}, {person_y})')
                # Assign the new position and the new area that has been found to be found again when the person moves
                last_position = (person_x, person_y)
                last_area = current_area
                
    # Show frame
    cv2.imshow("YOLOv8 + DeepSORT + Face Recognition", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  

cap.release()
cv2.destroyAllWindows()


ID=1 -> Area: 248092, Center: (185, 374)


2025-02-23 15:03:28.300 python[7912:45953] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-23 15:03:28.300 python[7912:45953] +[IMKInputSession subclass]: chose IMKInputSession_Modern


KeyboardInterrupt: 

## Body Recognition - this is saying that me and someone else are the same person which is not good - might be only when the people look alike - for example 2 women.

In [1]:
import cv2
import logging
import math
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np

# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)

# Load YOLOv8 model
model = YOLO("yolov8n.pt", verbose=False)

# Initialize DeepSORT Tracker with a Re-ID model
tracker = DeepSort(
    # Max-age is a hyper-parameter to say how much long an id is kept when it disappears 
    max_age=1000,  
    # How much long it will take to shift track a new detection
    n_init=5,    
    max_iou_distance=0.5,
    # To increase discrimination between people
    # Slightly relax to prevent frequent ID switches
    max_cosine_distance=0.3,  
    embedder_model_name="mobilenetv2",
    half=True,
    # Use GPU for embedding 
    embedder_gpu=True
)

# Persistent ID tracking
# Maps track_id → unique_id
person_id_map = {}  
# Keeps track of assigned person IDs
used_person_ids = set()  
# Start with ID = 1
next_person_id = 1  

last_position = None
last_area = None
# Movement sensitivity
pixel_threshold = 100  

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# While the webcam is open
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Sharpen image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(frame, -1, kernel)
    
    # Run YOLOv8 inference
    results = model(sharpened)

    # List to hold detections
    detections = []

    for r in results:
        for box in r.boxes:
            # Get class ID
            cls = int(box.cls[0])  
            # Confidence score
            conf = box.conf[0].item()  

            # Class 0 = "person", confidence > 80%
            if cls == 0 and conf > 0.8:  
                # Bounding box
                x1, y1, x2, y2 = map(int, box.xyxy[0])  
                width, height = x2 - x1, y2 - y1

                detections.append(([x1, y1, width, height], conf, 0, np.zeros((128,))))  

    # Update DeepSORT tracker
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    # Track only ID=1
    person_x, person_y, current_area = None, None, 0

    for track in tracked_objects:
        if not track.is_confirmed():
            continue

        track_id = track.track_id
        x1, y1, x2, y2 = map(int, track.to_ltrb())
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        area = (x2 - x1) * (y2 - y1)

        # Assign a unique person ID based on DeepSORT's Re-ID features
        assigned_id = person_id_map.get(track_id)

        if assigned_id is None:
            while next_person_id in used_person_ids:  
                next_person_id += 1

            assigned_id = next_person_id
            used_person_ids.add(next_person_id)
            next_person_id += 1

        person_id_map[track_id] = assigned_id

        # Only track and draw for ID = 1
        if assigned_id == 1:
            person_x, person_y, current_area = cx, cy, area

            # Draw tracking box
            cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(sharpened, f"ID: {assigned_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)

    # Process movement of ID=1
    if person_x is not None and person_y is not None:
        if last_position is not None and last_area is not None and current_area > 0:
            dx = person_x - last_position[0]
            dy = person_y - last_position[1]
            distance_moved = math.sqrt(dx**2 + dy**2)

            if distance_moved > pixel_threshold:
                print(f'ID=1 -> Area: {current_area}, Center: ({person_x}, {person_y})')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")

                last_position = (person_x, person_y)
                last_area = current_area
        else:
            print(f'ID=1 -> Area: {current_area}, Center: ({person_x}, {person_y})')
            last_position = (person_x, person_y)
            last_area = current_area
                
    # Show frame
    cv2.imshow("YOLOv8 + DeepSORT + Body Recognition", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  

cap.release()
cv2.destroyAllWindows()


2025-02-23 15:01:08.261 python[7591:44349] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-23 15:01:08.261 python[7591:44349] +[IMKInputSession subclass]: chose IMKInputSession_Modern


ID=1 -> Area: 185500, Center: (132, 360)


KeyboardInterrupt: 

#### Changing all hyper-parameters to make better detections.

This is still not working

In [1]:
import cv2
import logging
import math
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np

# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)

# Load YOLOv8 model
model = YOLO("yolov8n.pt", verbose=False)

# Initialize DeepSORT Tracker with a Re-ID model
tracker = DeepSort(
    max_age=50, 
    # Require more frames before confirming a track 
    n_init=5,  
    max_iou_distance=0.5,
    # Slightly relax to prevent frequent ID switches
    max_cosine_distance=0.3,  
    embedder_model_name="mobilenetv2",
    half=True,
    embedder_gpu=True
)

# Track the first detected person
first_person_id = None  

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# Track previous position for movement detection
last_position = None
last_area = None
# Movement sensitivity
pixel_threshold = 100  

while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Sharpen image for better detection
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(frame, -1, kernel)
    
    # Run YOLOv8 inference
    results = model(sharpened)

    # List to hold detections
    detections = []

    for r in results:
        for box in r.boxes:
            # Get class ID and confidence score
            cls = int(box.cls[0])  
            conf = box.conf[0].item()  

            # Class 0 = "person", confidence > 80%
            if cls == 0 and conf > 0.8:  
                # Bounding box
                x1, y1, x2, y2 = map(int, box.xyxy[0])  
                # Finding the width and the height 
                width, height = x2 - x1, y2 - y1

                # Append the newly detected person
                detections.append(([x1, y1, width, height], conf, 0, np.zeros((128,))))  

    # Update DeepSORT tracker
    tracked_objects = tracker.update_tracks(detections, frame=sharpened)

    # Track only ID=1
    person_x, person_y, current_area = None, None, 0

    # Iterate over tracked objects to find ID=1 person
    for track in tracked_objects:
        # If the track was not confirmed
        if not track.is_confirmed():
            # Stop the for loop
            continue

        # Get the ID of the tracked object
        track_id = track.track_id
        x1, y1, x2, y2 = map(int, track.to_ltrb())
        # Find the centre of the object to mark is as the tracking point
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        # Find the area of the object to detect movement
        area = (x2 - x1) * (y2 - y1)

        # Assign first detected person as ID=1
        if first_person_id is None:
            # Lock in the first person detected
            first_person_id = track_id  

        # Only track first detected person
        if track_id == first_person_id:
            # Save the previously found results
            person_x, person_y, current_area = cx, cy, area

            # Draw tracking box
            cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(sharpened, f"ID: 1", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)

    # Process movement of ID=1
    if person_x is not None and person_y is not None:
        # If the person has already been identified before
        if last_position is not None and last_area is not None and current_area > 0:
            # Find the change in the x-axis position
            dx = person_x - last_position[0]
            # Find the change in the y-axis position
            dy = person_y - last_position[1]
            # Calculate the distance moved in pixels
            distance_moved = math.sqrt(dx**2 + dy**2)

            # If the distance moved exceeds the threshold, print the details and update the last position and area.
            if distance_moved > pixel_threshold:
                print(f'ID=1 -> Area: {current_area}, Center: ({person_x}, {person_y})')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")

                last_position = (person_x, person_y)
                last_area = current_area
        # If this is the first time tracking the person
        else:
            print(f'ID=1 -> Area: {current_area}, Center: ({person_x}, {person_y})')
            last_position = (person_x, person_y)
            last_area = current_area
                
    # Show frame 
    cv2.imshow("YOLOv8 + DeepSORT + Body Recognition", sharpened)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break  

cap.release()
cv2.destroyAllWindows()


ID=1 -> Area: 225336, Center: (164, 368)


2025-02-23 15:01:54.610 python[7863:45301] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-23 15:01:54.610 python[7863:45301] +[IMKInputSession subclass]: chose IMKInputSession_Modern


KeyboardInterrupt: 

## PROBLEMS ENCOUNTERED

- Deep sort causing different bounding area of the same person given different ids.
- If a person goes missing and comes back you end up with a different ID.
- If I cover the camera and uncover it I get an error but this should not be this case in our assignnment as the drone should never loose track of the person.
- Tuning the tolerance of face detection was very difficult