## Actual run in which I found the best solution

Using:

- cv2 to capture the image
- YOLO to detect the person - include this in the literature review
- https://docs.ultralytics.com/models/yolov8/ 


In [9]:
# Initial things to use the drone - new features
# Tello does not work with out model 

from djitellopy import tello
from time import sleep
import cv2


# Connect to the Tello, show its camera feed in a window and keep sending a
# constant RC command until 'q' is pressed in the video window.
me = tello.Tello()
# To connect
me.connect()

# To start the video stream
me.streamon()

# # To takeoff
# me.takeoff()

# Fetch the frame reader once and reuse it instead of asking for it on every
# iteration of the display loop.
frame_read = me.get_frame_read()

try:
    # Open a window to display the video feed
    while True:
        # Get the latest video frame
        frame = frame_read.frame
        if frame is None:
            # NOTE(review): guard in case the stream has not delivered a frame
            # yet - confirm whether djitellopy can actually hand back None here.
            cv2.waitKey(1)
            continue

        # Resize the frame for better display (optional)
        frame = cv2.resize(frame, (640, 480))
        # frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Show the frame
        cv2.imshow("Tello Camera", frame)

        # waitKey also services the window; pressing 'q' leaves the loop so the
        # cell can finish without a KeyboardInterrupt.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # me.send_rc_control(left/right, forward/backward, up/down, yaw_velocity)
        me.send_rc_control(10, 0, 0, 5)

        # me.land()
finally:
    # Zero all velocities, then stop the stream and close the window, even
    # when the loop is interrupted.
    me.send_rc_control(0, 0, 0, 0)
    me.streamoff()
    cv2.destroyAllWindows()

# # To pause between commands (uses `sleep` imported from time)
# sleep(5)

# # To land
# me.land()

[INFO] tello.py - 129 - Tello instance was initialized. Host: '192.168.10.1'. Port: '8889'.


[INFO] tello.py - 438 - Send command: 'command'


KeyboardInterrupt: 

In [None]:
# This is more accurate - it tracks every part of a person

import cv2
from ultralytics import YOLO
import math
import logging

# Detect people in the webcam feed with YOLOv8, pick the largest bounding box
# and update a simulated drone position whenever that person has moved more
# than `pixel_threshold` pixels since the last accepted position.

# Proportional gains - these values need to be fine tuned
K_x = 0.05 # Left/Right movement scale
K_y = 0.1  # Forward/Backward movement scale
K_z = 0.02 # Up/Down movement scale

# Load YOLOv8 model
# Smallest YOLOv8 model
# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
model = YOLO("yolov8n.pt", verbose = False)

# Simulated drone position (x, y, z); a fixed starting value, not read from a drone
drone_x, drone_y, drone_z = 0, 0, 2

# Last accepted person position/area, used as the reference for movement
last_position = None
last_area = None
pixel_threshold = 100

# Open webcam
cap = cv2.VideoCapture(0)

try:
    # While the webcam is open
    while cap.isOpened():
        success, frame = cap.read()

        # Check the read result BEFORE touching `frame`: on a failed read the
        # frame is not usable and accessing frame.shape would raise.
        if not success:
            break

        # Define screen center
        frame_center_x = frame.shape[1] // 2  # Middle of frame
        frame_center_y = frame.shape[0] // 2

        # Inference on the frame
        results = model(frame)

        # Lists to hold the person centre and area (same index = same person)
        myPersonList = []
        myPersonListArea = []

        for r in results:
            for box in r.boxes:
                # Get class ID
                cls = int(box.cls[0])
                # Confidence score
                conf = box.conf[0].item()
                # Class 0 = "person", confidence > 80%
                if cls == 0 and conf > 0.8:
                    # Get bounding box coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    # Center of bounding box
                    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
                    # Bounding box area; used as a proxy for how close the person is
                    area = (x2 - x1) * (y2 - y1)

                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                    # Draw center point - this is what the drone will follow
                    cv2.circle(frame, (cx, cy), 5, (0, 255, 0), cv2.FILLED)
                    # Append the area and the center of the circle
                    myPersonList.append((cx, cy))
                    myPersonListArea.append(area)

        # Select the largest detected person
        if myPersonList:
            i = myPersonListArea.index(max(myPersonListArea))
            person_x, person_y = myPersonList[i]
            current_area = myPersonListArea[i]

            # To avoid having constant outputs for example when the person moves only a bit
            if last_position is not None and last_area is not None and current_area > 0:
                # Euclidean distance (in pixels) from the last accepted position
                dx = person_x - last_position[0]
                dy = person_y - last_position[1]
                distance_moved = math.sqrt(dx**2 + dy**2)

                # Only react when the person moved more than the threshold:
                # this keeps the movement smooth and avoids constant small
                # position changes that could make a drone unstable.
                if distance_moved > pixel_threshold:
                    # Left/Right Movement (X-axis)
                    drone_x += K_x * (person_x - frame_center_x)
                    # Forward/Backward Movement (Y-axis): ratio of previous to current box area
                    drone_y += K_y * ((last_area / current_area) - 1)
                    # Up/Down Movement (Z-axis)
                    drone_z += K_z * (frame_center_y - person_y)
                    print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                    print(f"Person moved {distance_moved:.2f} pixels, updating movement.")
                    print(f"New Drone Position: X={drone_x:.2f}, Y={drone_y:.2f}, Z={drone_z:.2f}")
                    # Alternative threshold-based approach kept for reference:
                    # Left/Right Movement
                    # if person_x < frame_center_x - 50:
                    #     direction = "left"
                    #     print("Move Left")
                    #     # Send command to drone: move left
                    # elif person_x > frame_center_x + 50:
                    #     direction = "right"
                    #     print("Move Right")
                    #     # Send command to drone: move right

                    # # Forward/Backward Movement
                    # if myPersonListArea[i] < 5000:  # Adjust based on detection area
                    #     direction = "forward"
                    #     print("Move Forward")
                    #     # Send command to drone: move forward
                    # elif myPersonListArea[i] > 15000:
                    #     direction = "backward"
                    #     print("Move Backward")
                    #     # Send command to drone: move backward

                    # Update last position
                    last_position = (person_x, person_y)
                    last_area = current_area
            # Track the initial position of the person - assuming this is the space you want to have between the person and the drone
            else:
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                # First detection, initialize last position
                last_position = (person_x, person_y)
                last_area = current_area

        cv2.imshow("Person Detection (YOLOv8)", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break  # Press 'q' to exit
finally:
    # Release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()


Area: 49050, Center: (530, 607)


2025-02-18 16:46:42.494 python[8743:124274] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-18 16:46:42.494 python[8743:124274] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Area: 124440, Center: (536, 456)
Person moved 151.12 pixels, updating movement.
New Drone Position: X=-5.20, Y=-0.06, Z=0.08
Area: 183910, Center: (428, 447)
Person moved 108.37 pixels, updating movement.
New Drone Position: X=-15.80, Y=-0.09, Z=-1.66
Area: 172244, Center: (529, 424)
Person moved 103.59 pixels, updating movement.
New Drone Position: X=-21.35, Y=-0.09, Z=-2.94
Area: 125280, Center: (427, 445)
Person moved 104.14 pixels, updating movement.
New Drone Position: X=-32.00, Y=-0.05, Z=-4.64
Area: 142040, Center: (326, 443)
Person moved 101.02 pixels, updating movement.
New Drone Position: X=-47.70, Y=-0.06, Z=-6.30
Area: 130806, Center: (446, 435)
Person moved 120.27 pixels, updating movement.
New Drone Position: X=-57.40, Y=-0.05, Z=-7.80
Area: 123050, Center: (586, 445)
Person moved 140.36 pixels, updating movement.
New Drone Position: X=-60.10, Y=-0.05, Z=-9.50
Area: 71760, Center: (1158, 564)
Person moved 584.25 pixels, updating movement.
New Drone Position: X=-34.20, Y=0

KeyboardInterrupt: 

: 

PROBLEMS :
- If there are multiple people it might change the person it is tracking (Maybe for now I will test it with one person)
- It is difficult to find how much the drone should move and I am not sure whether I am doing it well


Considerations -- things to check 
- Latency: Ensure minimal delay between detecting the red spot and sending commands.
- Safety: Test in a controlled environment to ensure predictable movements.
- Camera Feed Access: If using the drone’s camera feed, ensure you can stream it to your processing device.


# Following a Person

# Deep Sort


In [None]:
# This is more accurate - it tracks every part of a person

import cv2
from ultralytics import YOLO
import math
import logging

# Person tracking on the webcam feed with the smallest YOLOv8 model.
# Only prints when the largest detected person has moved further than
# `pixel_threshold` pixels from the last recorded position.

# Without this (and verbose = False) every frame logged lines such as:
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
logging.getLogger("ultralytics").setLevel(logging.WARNING)
model = YOLO("yolov8n.pt", verbose = False)

# Reference position/area of the tracked person and the movement threshold
last_position = None
last_area = None
pixel_threshold = 100

# Open webcam
cap = cv2.VideoCapture(0)

while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    # Inference on the frame
    results = model(frame)

    # Parallel lists: centre point and box area of every accepted person
    myPersonList = []
    myPersonListArea = []

    for result in results:
        for box in result.boxes:
            class_id = int(box.cls[0])
            confidence = box.conf[0].item()
            # Keep only class 0 ("person") detections above 80% confidence
            if class_id != 0 or not confidence > 0.8:
                continue

            left, top, right, bottom = map(int, box.xyxy[0])
            centre = ((left + right) // 2, (top + bottom) // 2)

            # Red box around the person, green dot on the point to follow
            cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)
            cv2.circle(frame, centre, 5, (0, 255, 0), cv2.FILLED)

            myPersonList.append(centre)
            myPersonListArea.append((right - left) * (bottom - top))

    if myPersonList:
        # Index of the largest box (first one wins on ties)
        largest = max(range(len(myPersonListArea)), key=myPersonListArea.__getitem__)
        person_centre = myPersonList[largest]
        current_area = myPersonListArea[largest]

        have_reference = last_position is not None and last_area is not None
        if not have_reference or current_area <= 0:
            # First detection: take this as the initial position, assuming it
            # is the spacing wanted between the person and the drone
            print(f'Area: {current_area}, Center: {person_centre}')
            last_position = person_centre
            last_area = current_area
        else:
            # Euclidean distance in pixels from the reference position
            dx = person_centre[0] - last_position[0]
            dy = person_centre[1] - last_position[1]
            distance_moved = math.sqrt(dx * dx + dy * dy)

            # Ignore small movements so the output (and later the drone)
            # does not react to every tiny change in position
            if distance_moved > pixel_threshold:
                print(f'Area: {current_area}, Center: {person_centre}')
                print(f"Person moved {distance_moved:.2f} pixels, updating movement.")
                last_position = person_centre
                last_area = current_area

    cv2.imshow("Person Detection (YOLOv8)", frame)

    # Press 'q' to exit
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


KeyboardInterrupt: 

In [2]:
import cv2
import math
import logging
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort


# YOLOv8 person detection + Deep SORT tracking on the webcam feed.
# Confirmed tracks are drawn with their ID; the largest tracked person is
# reported whenever they move more than `pixel_threshold` pixels.

# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
# Load YOLOv8 model (Nano version)
model = YOLO("yolov8n.pt", verbose=False)

# Initialize Deep SORT Tracker
tracker = DeepSort(max_age=10, n_init=3)

# Open webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# Tracking history
last_position = None
last_area = None
pixel_threshold = 100

try:
    # While the webcam is open
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # Run YOLOv8 inference on the frame
        results = model(frame)

        # Detections in Deep SORT input format: ([left, top, width, height], confidence, class)
        detections = []
        # Lists to hold the centre and area of every confirmed track
        myPersonList = []
        myPersonListArea = []

        for r in results:
            for box in r.boxes:
                # Get class ID and confidence score
                cls = int(box.cls[0])
                conf = box.conf[0].item()

                # Detect only people with a confidence score above 0.8
                if cls == 0 and conf > 0.8:
                    # Get bounding box coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    detections.append(([x1, y1, x2 - x1, y2 - y1], conf, cls))

        # Update Deep SORT Tracker
        tracked_objects = tracker.update_tracks(detections, frame=frame)

        for track in tracked_objects:
            if not track.is_confirmed():
                # Ignore unconfirmed tracks
                continue

            track_id = track.track_id
            x1, y1, x2, y2 = map(int, track.to_ltrb())
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
            # Area of THIS track's box. Previously the variable left over from
            # the detection loop was appended, so every track shared the area
            # of the last detection and the "largest" choice was meaningless.
            area = max(0, x2 - x1) * max(0, y2 - y1)

            # Draw bounding box and ID
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.circle(frame, (cx, cy), 5, (0, 0, 255), cv2.FILLED)
            cv2.putText(frame, f"ID: {track_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Append the area and the center of the circle
            myPersonList.append((cx, cy))
            myPersonListArea.append(area)

        # Select the largest tracked person
        if myPersonList:
            i = myPersonListArea.index(max(myPersonListArea))
            person_x, person_y = myPersonList[i]
            current_area = myPersonListArea[i]

            if last_position is not None and last_area is not None and current_area > 0:
                # Euclidean distance (in pixels) from the last accepted position
                dx = person_x - last_position[0]
                dy = person_y - last_position[1]
                distance_moved = math.sqrt(dx**2 + dy**2)

                # Only report/update when the person moved more than the threshold
                if distance_moved > pixel_threshold:
                    print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')

                    last_position = (person_x, person_y)
                    last_area = current_area
            else:
                # First detection, initialize the reference position
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                last_position = (person_x, person_y)
                last_area = current_area

        # Show the frame
        cv2.imshow("YOLOv8 + Deep SORT Tracking", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break  # Exit on 'q'
finally:
    # Release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()


Area: 797865, Center: (630, 391)


2025-02-18 22:02:24.584 python[34582:327987] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-18 22:02:24.584 python[34582:327987] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Area: 510545, Center: (505, 460)
Area: 717602, Center: (612, 403)


KeyboardInterrupt: 

In [1]:
import cv2
import math
import logging
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np


# YOLOv8 person detection + Deep SORT tracking on a sharpened webcam feed.
# Confirmed tracks are drawn with their ID; the largest tracked person is
# reported whenever they move more than `pixel_threshold` pixels.

# Added verbose = False to avoid having a lot of outputs when running the code for example this was outputting for each small detection
# 0: 384x640 1 person, 71.6ms
# Speed: 9.2ms preprocess, 71.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
# Suppress YOLOv8 logging
logging.getLogger("ultralytics").setLevel(logging.WARNING)
# Load YOLOv8 model (Nano version)
model = YOLO("yolov8n.pt", verbose=False)

tracker = DeepSort(
    max_age=30,  # Increased from 10 to maintain track through brief occlusions
    n_init=5,    # Need 5 consecutive detections to confirm track
    max_iou_distance=0.4,
    max_cosine_distance=0.3,  # Stricter appearance matching
    # NOTE(review): embedder_model_name appears to be consulted only by the
    # "torchreid" embedder; with embedder="mobilenet" it likely has no effect,
    # so osnet is probably NOT in use here - confirm against deep_sort_realtime.
    embedder_model_name="osnet_x1_0",
    embedder = "mobilenet",
    half=True  # Use FP16 for faster inference
)

# Open webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open webcam")
    exit()

# Tracking history
last_position = None
last_area = None
pixel_threshold = 100

# Sharpening kernel; constant, so it is built once instead of on every frame
kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])

try:
    # While the webcam is open
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # Sharpen the image to try to improve detection performance
        sharpened = cv2.filter2D(frame, -1, kernel)
        # Run YOLOv8 inference on the sharpened frame
        results = model(sharpened)

        # Detections in Deep SORT input format: ([left, top, width, height], confidence, class)
        detections = []
        # Lists to hold the centre and area of every confirmed track
        myPersonList = []
        myPersonListArea = []

        for r in results:
            for box in r.boxes:
                # Get class ID and confidence score
                cls = int(box.cls[0])
                conf = box.conf[0].item()

                # Detect only people with a confidence score above 0.8
                if cls == 0 and conf > 0.8:
                    # Get bounding box coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    # Documented detection format is a 3-tuple; no image crop
                    # is appended because update_tracks receives the whole
                    # image through its `frame` argument.
                    detections.append(([x1, y1, x2 - x1, y2 - y1], conf, cls))

        # Update Deep SORT Tracker
        tracked_objects = tracker.update_tracks(detections, frame=sharpened)

        for track in tracked_objects:
            if not track.is_confirmed():
                # Ignore unconfirmed tracks
                continue

            track_id = track.track_id
            x1, y1, x2, y2 = map(int, track.to_ltrb())
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
            # Area of THIS track's box. Previously the variable left over from
            # the detection loop was appended, so every track shared the area
            # of the last detection and the "largest" choice was meaningless.
            area = max(0, x2 - x1) * max(0, y2 - y1)

            # Draw bounding box and ID
            cv2.rectangle(sharpened, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.circle(sharpened, (cx, cy), 5, (0, 0, 255), cv2.FILLED)
            cv2.putText(sharpened, f"ID: {track_id}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Append the area and the center of the circle
            myPersonList.append((cx, cy))
            myPersonListArea.append(area)

        # Select the largest tracked person
        if myPersonList:
            i = myPersonListArea.index(max(myPersonListArea))
            person_x, person_y = myPersonList[i]
            current_area = myPersonListArea[i]

            if last_position is not None and last_area is not None and current_area > 0:
                # Euclidean distance (in pixels) from the last accepted position
                dx = person_x - last_position[0]
                dy = person_y - last_position[1]
                distance_moved = math.sqrt(dx**2 + dy**2)

                # Only report/update when the person moved more than the threshold
                if distance_moved > pixel_threshold:
                    print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')

                    last_position = (person_x, person_y)
                    last_area = current_area
            else:
                # First detection, initialize the reference position
                print(f'Area: {myPersonListArea[i]}, Center: {myPersonList[i]}')
                last_position = (person_x, person_y)
                last_area = current_area

        # Show the frame
        cv2.imshow("YOLOv8 + Deep SORT Tracking", sharpened)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break  # Exit on 'q'
finally:
    # Release the webcam and close the window even if the cell is interrupted
    cap.release()
    cv2.destroyAllWindows()


Area: 700128, Center: (572, 406)


2025-02-18 22:30:13.860 python[38642:367097] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-18 22:30:13.860 python[38642:367097] +[IMKInputSession subclass]: chose IMKInputSession_Modern


KeyboardInterrupt: 