In [1]:
from ugot import ugot
import cv2
import numpy as np
import time

from IPython.display import clear_output

from ultralytics import YOLO

# Create the UGOT client object and initialise it with the robot's address.
# Change the address string to match your own robot.
got = ugot.UGOT()
got.initialize("192.168.1.217")

192.168.1.217:50051


In [10]:
# !pip install ultralytics --user

^C


First, we need to download the dataset. 
1. Copy the contents of [VOC.yaml](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/VOC.yaml) into a new file in the same folder as this notebook, called `VOC_enhanced.yaml`.
2. Open `VOC_enhanced.yaml` and add a line for the new object, so the file will have (for example) `20: candle`.
3. Delete all references to VOC 2012 in the yaml, to limit download size.
4. Run the cell below. This will download the images and labels for the original 20 VOC classes (indices 0–19), but we will have to add the new object data ourselves. We will do this in the next step.

Note that in `model.train`, we just have one epoch as we will retrain the model later with the new object data included. You can stop the kernel once the data is downloaded, but this means the object detection will not work yet.

In [4]:
# Download dataset - change path as needed
# Running train() against the dataset YAML is the step that fetches the data;
# a single epoch is enough here because the model is retrained later on.
# VOC 2007 is a reasonable size, compared to COCO
train_settings = {"data": "VOC_enhanced.yaml", "epochs": 1, "imgsz": 640}

model = YOLO("yolo11n.pt")
model.train(**train_settings)

Ultralytics 8.3.231  Python-3.12.3 torch-2.7.1+cpu CPU (Intel Core i7-1065G7 1.30GHz)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=VOC_enhanced.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=1, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train15, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plots=True, p

KeyboardInterrupt: 

In [4]:
# -------------------------------------------------------
# Helper: Draw bounding boxes
# -------------------------------------------------------
def draw_detections(frame, results):
    """Draw a labelled green rectangle on ``frame`` for every detection.

    Args:
        frame: Image array to annotate (e.g. the output of ``cv2.imdecode``).
            It is drawn on in place.
        results: Iterable of YOLO result objects. Each one must expose
            ``boxes`` (items carrying ``xyxy``, ``conf`` and ``cls``) and a
            ``names`` mapping from class id to label text.

    Returns:
        The same ``frame`` object that was passed in.
    """
    colour = (0, 255, 0)

    for result in results:
        for box in result.boxes:
            # xyxy format: [x1, y1, x2, y2]; hand OpenCV plain Python ints.
            x1, y1, x2, y2 = (int(v) for v in box.xyxy[0].cpu().numpy().astype(int))

            # Confidence & label
            conf = float(box.conf[0])
            label = result.names[int(box.cls[0])]

            cv2.rectangle(frame, (x1, y1), (x2, y2), colour, 2)

            # The caption normally sits just above the box. When the box starts
            # at (or near) the top of the image that position is off-screen,
            # so draw the caption just inside the box instead.
            text_y = y1 - 5 if y1 >= 20 else y1 + 20
            cv2.putText(frame, f"{label} {conf:.2f}", (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                        colour, 2)
    return frame

In [6]:
# Visualize bounding boxes with live video feed.
# Press 'q' in the preview window to stop.

# The other cells in this notebook open the camera before reading frames;
# do the same here so this cell also works on a fresh kernel.
got.open_camera()

try:
    while True:
        frame = got.read_camera_data()
        if frame is None:
            continue

        img = cv2.imdecode(np.frombuffer(frame, np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            # The buffer could not be decoded into an image; skip it rather
            # than passing None on to the model and to imshow.
            continue

        # Run YOLO detection
        results = model(img, verbose=False)

        # Draw output
        output = draw_detections(img, results)

        # Show
        cv2.imshow("YOLO Detection", output)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Close the preview window even if the loop is interrupted or fails.
    cv2.destroyAllWindows()


To fine-tune the YOLO model on our own object, we first need to capture some images of the object we want to detect.
The following cell will allow the UGOT to auto capture an image every half second.
Here are some tips for good training data:
- Capture at least 50 images. The more, the better, but remember you will also need to annotate them.
- Move the UGOT / object around so that you get images when the object is near/far, under different lighting conditions, partially obscured, etc.

In [None]:
# Save images from UGOT video feed at regular time intervals
from ugot import ugot
import cv2
import numpy as np
import time
import os

SAVE_DIR = "captured_demo" # change folder for demo purposes
os.makedirs(SAVE_DIR, exist_ok=True)

got = ugot.UGOT()
got.initialize("192.168.1.217")
got.open_camera()

counter = 0     # change this to one after the last captured image name to avoid overwriting images
interval = 0.5   # seconds between captures

print("Auto-capturing images. Press 'q' to stop.")

# A monotonic clock keeps the capture interval steady even if the system
# clock is adjusted while the loop is running.
last_time = time.monotonic()

try:
    while True:
        frame = got.read_camera_data()
        if frame is None:
            continue

        img = cv2.imdecode(np.frombuffer(frame, np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            # Undecodable buffer: nothing to show or save for this frame.
            continue

        cv2.imshow("UGOT Camera", img)

        # Auto-save
        if time.monotonic() - last_time >= interval:
            filename = f"{SAVE_DIR}/coffee_img_{counter:04d}.jpg"
            # imwrite reports failure through its return value instead of
            # raising, so only count and announce files that were written.
            if cv2.imwrite(filename, img):
                print(f"Saved: {filename}")
                counter += 1
            else:
                print(f"Could not save: {filename}")
            last_time = time.monotonic()

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    # Stopping the kernel is a normal way to end the capture.
    pass
finally:
    # Close the preview window however the loop ended.
    cv2.destroyAllWindows()

print("Done.")


192.168.1.217:50051
Auto-capturing images. Press 'q' to stop.
Saved: captured_coffee/coffee_img_0000.jpg
Saved: captured_coffee/coffee_img_0001.jpg
Saved: captured_coffee/coffee_img_0002.jpg
Saved: captured_coffee/coffee_img_0003.jpg
Saved: captured_coffee/coffee_img_0004.jpg
Saved: captured_coffee/coffee_img_0005.jpg
Saved: captured_coffee/coffee_img_0006.jpg
Saved: captured_coffee/coffee_img_0007.jpg
Saved: captured_coffee/coffee_img_0008.jpg
Saved: captured_coffee/coffee_img_0009.jpg
Saved: captured_coffee/coffee_img_0010.jpg
Saved: captured_coffee/coffee_img_0011.jpg
Saved: captured_coffee/coffee_img_0012.jpg
Saved: captured_coffee/coffee_img_0013.jpg
Saved: captured_coffee/coffee_img_0014.jpg
Saved: captured_coffee/coffee_img_0015.jpg
Saved: captured_coffee/coffee_img_0016.jpg
Saved: captured_coffee/coffee_img_0017.jpg
Saved: captured_coffee/coffee_img_0018.jpg
Saved: captured_coffee/coffee_img_0019.jpg
Saved: captured_coffee/coffee_img_0020.jpg
Saved: captured_coffee/coffee_img_0

After capturing the images, create an account at roboflow.com. 
1. Create a new project, select `Traditional` (not Rapid) tool and `Object Detection` project type, and upload your images.
2. Use the tools to annotate your images. Ensure that the bounding boxes are tight.
3. Apply any augmentations in the Dataset tab. Alternatively, skip augmentation here and instead adjust the augmentation arguments of `model.train` below (for example `fliplr`, `hsv_h`, `degrees` or `mosaic`).
4. Download the dataset in the YOLOv11 format (select "Download zip to computer", download, and unzip).

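After downloading, we still need to change the class index in the label text files, because the downloaded labels number the new object as class 0, while in `VOC_enhanced.yaml` it is class 20. Set `label_folder` below to a `labels` folder of your new object data and run the cell; do this once for each split (e.g. `roboflow/train/labels` and `roboflow/valid/labels`).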

In [32]:
# Helper script - change new object .txt files index number
import os


def remap_class_index(label_folder, old_class, new_class):
    """Replace one class index with another in every YOLO label file of a folder.

    Each ``.txt`` file in ``label_folder`` is read line by line; a line whose
    first whitespace-separated field equals ``old_class`` gets that field
    replaced by ``new_class``. Files that contain no such line are left
    untouched, so running the function twice is harmless.

    Args:
        label_folder: Folder containing the ``.txt`` label files.
        old_class: Class index to look for, as a string (e.g. ``"0"``).
        new_class: Class index to write instead, as a string (e.g. ``"20"``).

    Returns:
        The number of files that were rewritten.
    """
    updated_files = 0

    for filename in sorted(os.listdir(label_folder)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(label_folder, filename)

        with open(file_path, "r") as f:
            lines = f.readlines()

        new_lines = []
        replaced = False
        for line in lines:
            parts = line.split()
            # Compare the whole first field, so "0" never matches "10" or "20".
            if parts and parts[0] == old_class:
                parts[0] = new_class
                replaced = True
            new_lines.append(" ".join(parts) + "\n")

        if not replaced:
            continue

        with open(file_path, "w") as f:
            f.writelines(new_lines)

        print(f"Updated: {filename}")
        updated_files += 1

    return updated_files


# folder containing your YOLO .txt label files
# label_folder = "roboflow/train/labels"
label_folder = "roboflow/valid/labels"

old_class = "0"
new_class = "20" # original VOC has up to index 19

if os.path.isdir(label_folder):
    remap_class_index(label_folder, old_class, new_class)
    print("Done!")
else:
    print(f"Label folder not found: {label_folder}")


Updated: img_0092_jpg.rf.26c68416011a321e276ccc3f27156583.txt
Updated: img_0092_jpg.rf.99c93d9bbf12fad2d39ffafdbfa7329e.txt
Updated: img_0092_jpg.rf.db2785e482cce70eb6f38808026dbd31.txt
Updated: img_0093_jpg.rf.10cc0e2f5e51337059cfca80fc0a8478.txt
Updated: img_0093_jpg.rf.336d7dbc816aaa1548ef815bbd4db500.txt
Updated: img_0093_jpg.rf.3419739be2a800845c81e31a778239a4.txt
Updated: img_0094_jpg.rf.4e9a402276c5b1db54eab97d802acbc1.txt
Updated: img_0094_jpg.rf.849cb0a655b7292a84ab782087acc06b.txt
Updated: img_0094_jpg.rf.8f01ea32f69fb76993e65a4657411088.txt
Done!


Now, merge the datasets: copy the train images, train labels, valid images, and valid labels from your new object folder to their respective VOC folders.

Once you are done with that, retrain the model. This time we have several epochs for better performance. 

Warning: this will likely take a few hours to train without a GPU.

In [7]:
# model.train(data="VOC_enhanced.yaml", epochs=3, imgsz=640, save=True, resume=True)

Ultralytics 8.3.231  Python-3.12.3 torch-2.7.1+cpu CPU (Intel Core i7-1065G7 1.30GHz)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=VOC_enhanced.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=3, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train16, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plots=True, p

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x00000214071109B0>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0

Now test it with the new object. (In our experience the video feed only works well when the robot is connected over 5 GHz Wi-Fi.)

In [None]:
import numpy as np
import cv2
from ugot import ugot
from ultralytics import YOLO

got = ugot.UGOT()
got.initialize("192.168.1.217")
got.open_camera()

# model = YOLO("best.pt")
model = YOLO("best_VOC_candle.pt")

# Live preview with the retrained model. Press 'q' in the window to stop.
try:
    while True:
        frame = got.read_camera_data()
        if frame is None:
            continue

        img = cv2.imdecode(np.frombuffer(frame, np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            # The buffer could not be decoded into an image; skip it rather
            # than passing None on to the model and to imshow.
            continue

        # Run YOLO detection
        results = model(img, verbose=False)

        # Draw output
        output = draw_detections(img, results)

        # Show
        cv2.imshow("YOLO Detection - With Custom Object", output)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Close the preview window even if the loop is interrupted or fails.
    cv2.destroyAllWindows()

192.168.1.217:50051


In [None]:
def line_follow(speed=20):
    """Follow the line in front of the robot until no line is reported.

    Every cycle reads the current line information, steers in proportion to
    the reported offset and drives forward for 0.1 s. The loop ends as soon
    as the reported line type is no longer positive, *before* another move is
    issued, so the robot never steers on the reading of a lost line.

    The robot is always told to stop on exit, including when the function is
    interrupted (for example by stopping the kernel).

    Args:
        speed: Forward speed passed to ``got.mecanum_move_xyz``. Defaults to
            20, the value this function originally hard-coded.
    """
    got.load_models(["line_recognition"])
    got.set_track_recognition_line(line_type = 0)

    try:
        while True:
            line_info = got.get_single_track_total_info()  # list: [offset, type, x, y]
            offset = line_info[0]
            line_type = line_info[1]

            if line_type <= 0:  # no line in front of the robot any more
                break

            degrees = int(offset / 4)
            got.mecanum_move_xyz(0, speed, degrees)
            time.sleep(0.1)
    finally:
        got.mecanum_stop()

The UGOT will follow a line continuously. 
At an intersection, the UGOT will stop and check for an object.
If the UGOT sees a certain object, it will follow the directions, e.g. choose the left path if it sees a candle.

In [5]:
def line_follow_camera():
    """Follow a single line while running YOLO detection on the camera feed.

    Each cycle grabs a camera frame (when one is available), runs the global
    ``model`` on it and shows the annotated frame, then reads the line
    information and steers along the line.

    Returns:
        ``(line_type, results)`` as soon as the reported line type is not 1
        (i.e. not a single line). ``results`` is the detection output for the
        most recent frame that was processed, or an empty list if no frame had
        been processed yet.
        ``None`` if 'q' was pressed in the preview window.

    The robot is stopped and the preview window closed on every exit path.
    """
    got.open_camera()

    got.load_models(["line_recognition"])
    got.set_track_recognition_line(line_type = 0)

    # Start with an empty list so there is always something to return, even
    # when the line type changes before the first frame has been processed.
    results = []

    try:
        while True:
            frame = got.read_camera_data()
            if frame is not None:
                nparr = np.frombuffer(frame, np.uint8)
                img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

                # Skip buffers that could not be decoded into an image.
                if img is not None:
                    # Run YOLO detection
                    results = model(img, verbose=False)

                    # Draw output
                    output = draw_detections(img, results)

                    # Show
                    cv2.imshow("YOLO Detection - With Custom Object", output)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        return None

            line_info = got.get_single_track_total_info()  # list: [offset, type, x, y]
            offset = line_info[0]
            line_type = line_info[1]

            if line_type != 1: # not single line
                return line_type, results

            degrees = int(offset / 4)
            got.mecanum_move_xyz(0, 10, degrees)
            time.sleep(0.1)

    finally:
        got.mecanum_stop()
        cv2.destroyAllWindows()

In [None]:
# Helper functions: pick up token
def ap_recog():
    """Locate an AprilTag, centre on it and approach it.

    Three phases, each polling ``got.get_apriltag_total_info()``:

    1. Move right until a tag is reported.
    2. Keep moving right until the x-coordinate of the tag centre reaches 330.
    3. Approach the tag, correcting sideways while x is outside 290..350,
       until the reported distance is no greater than 0.12.

    The robot is always told to stop when the function exits, including when
    it is interrupted part-way (for example by stopping the kernel).
    """
    # Positions inside the first tag record that this routine reads.
    x_index = 1      # x-coordinate of the centre of the apriltag
    dist_index = 6   # distance to the tag

    try:
        # step 1: move right until apriltag recognised
        while not got.get_apriltag_total_info():
            got.mecanum_move_xyz(10, 0, 0)
        got.mecanum_stop()
        print("Apriltag recognized.")

        # step 2: centralize - keep moving right until tag is in the centre
        while True:
            AP_info = got.get_apriltag_total_info()
            if not AP_info:
                # Tag not reported this cycle: send no new command, poll again.
                continue
            if AP_info[0][x_index] >= 330:
                got.mecanum_stop()
                break
            got.mecanum_move_xyz(10, 0, 0)
        print("Apriltag centralised.")

        # step 3: approach apriltag to pick it up
        while True:
            AP_info = got.get_apriltag_total_info()
            if not AP_info:
                # Tag lost: wait in place until it is reported again.
                got.mecanum_stop()
                continue

            x_coord = AP_info[0][x_index]
            dist = AP_info[0][dist_index]
            if x_coord < 290:
                got.mecanum_move_xyz(-3, 3, 0)
            elif x_coord > 350:
                got.mecanum_move_xyz(3, 3, 0)
            elif dist > 0.12:
                got.mecanum_move_xyz(0, 3, 0)
            else:
                got.mecanum_stop()
                break
        print("Stopped.")
    finally:
        # Never leave the robot driving if a phase is interrupted or fails.
        got.mecanum_stop()

def pickup_ap():
    """Run the pick-up sequence: release clamp, arm down, close clamp, arm up.

    Each step is an arm/clamp command followed by a pause in seconds; the
    final step has no pause after it.
    """
    sequence = (
        (got.mechanical_clamp_release, (), 1),
        (got.mechanical_joint_control, (0, 0, -70, 800), 1),      # down - for apriltag
        (got.mechanical_clamp_close, (), 2),
        (got.mechanical_joint_control, (0, 30, -50, 800), None),  # up
    )
    for command, args, pause in sequence:
        command(*args)
        if pause is not None:
            time.sleep(pause)

def put_ap():
    """Run the put-down sequence: two arm moves, then release the clamp.

    Each step is an arm/clamp command followed by a pause in seconds.
    """
    sequence = (
        (got.mechanical_joint_control, (-90, 30, -50, 800), 1),
        (got.mechanical_joint_control, (-90, -20, -30, 800), 1),
        (got.mechanical_clamp_release, (), 2),
    )
    for command, args, pause in sequence:
        command(*args)
        time.sleep(pause)

In [17]:
# MAIN CODE
model = YOLO("best.pt")

# Class ids the decisions below are based on.
# NOTE(review): these must match the class order of best.pt - confirm with model.names.
CANDLE_CLASS = 0
TOKEN_CLASS = 1

num_intersections = 0

try:
    while num_intersections < 2:
        outcome = line_follow_camera()
        if outcome is None:
            # line_follow_camera returns nothing when 'q' is pressed in the
            # preview window; treat that as a request to end the run.
            print("Stopped by user.")
            break

        line_type, results = outcome
        if line_type == 2:
            num_intersections += 1
            for r in results:
                # print(r.boxes)
                detected = r.boxes.cls.tolist()
                if CANDLE_CLASS in detected: # candle
                    got.mecanum_turn_speed_times(2, 40, 20, 2)
                elif TOKEN_CLASS in detected: # token
                    ap_recog()
                    pickup_ap()
                    got.mecanum_turn_speed_times(2, 40, 180, 2)
                else:
                    got.mecanum_turn_speed_times(3, 40, 20, 2)

            #TODO: add logic for turning or whatever
finally:
    # Make sure the robot is stopped even if the run is interrupted.
    got.mecanum_stop()

KeyboardInterrupt: 