# 1. Import Dependencies

In [None]:
# Import opencv
import cv2 

# Import uuid
import uuid

# Import Operating System
import os

# Import time
import time

# 2. Define Images to Collect

In [None]:
# Class names to collect images for; one sub-folder per entry is created under the collection folder.
labels = ['small_yellow_cube','medium_yellow_cube','big_yellow_cube','small_red_cube','medium_red_cube','big_red_cube']
# Number of frames captured for each label by the capture loop.
number_imgs = 70

# 3. Setup Folders 

In [None]:
# Root folder that receives the captured images (one sub-folder per label).
IMAGES_PATH = os.path.join('Tensorflow', 'workspace', 'images', 'collectedimages')

In [None]:
# Create the collection root and one sub-folder per class label.
# os.makedirs(..., exist_ok=True) replaces the OS-specific `!mkdir` shell
# magics: it creates missing parent folders on every platform, does nothing
# when the folder already exists, and does not go through a shell at all.
os.makedirs(IMAGES_PATH, exist_ok=True)
for label in labels:
    path = os.path.join(IMAGES_PATH, label)
    os.makedirs(path, exist_ok=True)

# 4. Capture Images

In [None]:
# Capture `number_imgs` frames per label from camera index 1 and save each
# one as <label>.<uuid>.jpg inside the label's folder.
#
# The camera is opened once for the whole session instead of once per label:
# the original re-opened the device on every label and only released the last
# handle. try/finally guarantees the device and the preview window are freed
# even when the cell is interrupted or an OpenCV call raises.
cap = cv2.VideoCapture(1)
if not cap.isOpened():
    raise RuntimeError('Cannot open camera 1')

try:
    for label in labels:
        print('Collecting images for {}'.format(label))
        # Pause so the next object can be placed in front of the camera.
        time.sleep(5)
        for imgnum in range(number_imgs):
            print('Collecting image {}'.format(imgnum))
            ret, frame = cap.read()
            if not ret:
                # A failed grab returns no image; passing it on to
                # imwrite/imshow would raise, so skip this shot instead.
                print('Failed to read frame for image {}, skipping'.format(imgnum))
                continue
            imgname = os.path.join(IMAGES_PATH, label, label + '.' + '{}.jpg'.format(str(uuid.uuid1())))
            cv2.imwrite(imgname, frame)
            cv2.imshow('frame', frame)
            time.sleep(2)
            # 'q' stops the current label only; collection continues with the next one.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    cap.release()
    cv2.destroyAllWindows()


# 5. Image Labelling

In [None]:
# Folder the labelImg annotation tool is cloned into.
LABELIMG_PATH = os.path.join('Tensorflow', 'labelimg')

In [None]:
# Clone labelImg into LABELIMG_PATH, but only when that folder does not exist yet.
if not os.path.exists(LABELIMG_PATH):
    !mkdir {LABELIMG_PATH}
    !git clone https://github.com/tzutalin/labelImg {LABELIMG_PATH}

In [None]:
if os.name == 'posix':
    !make qt5py3
if os.name =='nt':
    !cd {LABELIMG_PATH} && pyrcc5 -o libs/resources.py resources.qrc

In [None]:
# Launch the labelImg GUI from inside its checkout.
!cd {LABELIMG_PATH} && python labelImg.py

In [None]:
import os,shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Convert the Pascal VOC XML annotations stored next to the collected images
# into YOLO text labels, copying every annotated image into dataset/images and
# writing its label file into dataset/labels.

# The position of a name in this list is the class id written to the label files.
classes = ['small_yellow_cube','medium_yellow_cube','big_yellow_cube','small_red_cube','medium_red_cube','big_red_cube']  

input_dir = "Tensorflow/workspace/images/collectedimages"
output_dir = "dataset"

# Create both sub-folders unconditionally. The original only created them
# when `dataset` itself was missing, so an existing but incomplete `dataset`
# folder made the copy/write steps below fail.
os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)
os.makedirs(os.path.join(output_dir, "labels"), exist_ok=True)

for label in classes:
    img_dir = os.path.join(input_dir, label)
    for file in tqdm(os.listdir(img_dir)):
        if not file.endswith(".xml"):
            continue

        xml_path = os.path.join(img_dir, file)
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # The image is located through the <filename> element of the
        # annotation, not through the XML file's own name.
        img_name = root.find("filename").text
        img_path = os.path.join(img_dir, img_name)

        shutil.copy(img_path, os.path.join(output_dir, "images", img_name))

        w = int(root.find("size/width").text)
        h = int(root.find("size/height").text)

        # Swap only the extension; str.replace(".jpg", ".txt") would also
        # rewrite a ".jpg" occurring earlier in the file name.
        txt_name = os.path.splitext(img_name)[0] + ".txt"
        txt_path = os.path.join(output_dir, "labels", txt_name)
        with open(txt_path, "w") as f:
            for obj in root.findall("object"):
                cls = obj.find("name").text
                # Raises ValueError for a class name that is not in `classes`.
                cls_id = classes.index(cls)

                xmlbox = obj.find("bndbox")
                # int(float(...)) also accepts coordinates written as "12.0".
                xmin = int(float(xmlbox.find("xmin").text))
                ymin = int(float(xmlbox.find("ymin").text))
                xmax = int(float(xmlbox.find("xmax").text))
                ymax = int(float(xmlbox.find("ymax").text))

                # YOLO format: x_center y_center width height (normalized)
                x_center = (xmin + xmax) / 2.0 / w
                y_center = (ymin + ymax) / 2.0 / h
                bw = (xmax - xmin) / w
                bh = (ymax - ymin) / h

                f.write(f"{cls_id} {x_center} {y_center} {bw} {bh}\n")


In [None]:
from sklearn.model_selection import train_test_split
import glob, shutil

# glob returns files in arbitrary, filesystem-dependent order; sorting makes
# the seeded split below reproducible across runs and machines.
images = sorted(glob.glob("dataset/images/*.jpg"))

# No separate list of label files is built here: move_files() derives each
# label path from its image path. The original assigned that unused list to
# `labels`, overwriting the class-name list the capture cells iterate over.

# 80/20 train/validation split with a fixed seed.
train_imgs, val_imgs = train_test_split(images, test_size=0.2, random_state=42)

def move_files(file_list, target_img_dir, target_lbl_dir):
    """Copy each image and its YOLO label file into the target folders.

    Despite the name, files are copied (shutil.copy), not moved; the sources
    stay in place.

    Args:
        file_list: Image paths laid out as ``<root>/images/<name>.<ext>``.
            The label for each image is read from ``<root>/labels/<name>.txt``.
        target_img_dir: Folder receiving the images; created if missing.
        target_lbl_dir: Folder receiving the label files; created if missing.

    Raises:
        FileNotFoundError: If an image or its label file does not exist.
    """
    os.makedirs(target_img_dir, exist_ok=True)
    os.makedirs(target_lbl_dir, exist_ok=True)
    for img in file_list:
        # Build the label path from path components. The previous
        # str.replace("images", "labels") rewrote every "images" substring in
        # the path, including ones in parent folders or in the file name.
        src_img_dir, img_file = os.path.split(img)
        stem = os.path.splitext(img_file)[0]
        lbl = os.path.join(os.path.dirname(src_img_dir), "labels", stem + ".txt")
        shutil.copy(img, target_img_dir)
        shutil.copy(lbl, target_lbl_dir)

# Populate the train/ and val/ sub-folders of dataset/images and dataset/labels
# from the two image lists (files are copied, so the originals stay in place).
move_files(train_imgs, "dataset/images/train", "dataset/labels/train")
move_files(val_imgs, "dataset/images/val", "dataset/labels/val")


In [None]:
# Resolve the dataset root when the cell runs instead of hard-coding one
# machine's absolute path (previously "D:/YOLO Object Detection/dataset").
# Forward slashes keep the value readable in YAML on Windows as well.
dataset_root = os.path.abspath("dataset").replace(os.sep, "/")

# nc is the number of classes; the ids under `names` follow the order of the
# class list used when the label files were written.
yaml_content = f"""nc: 6
path: {dataset_root}
train: images/train
val: images/val

names:
  0: small_yellow_cube
  1: medium_yellow_cube
  2: big_yellow_cube
  3: small_red_cube
  4: medium_red_cube
  5: big_red_cube
"""

with open("dataset/dataset.yaml", "w") as f:
    f.write(yaml_content)

print("dataset.yaml created successfully!")

# 6. Training the model

In [None]:
# Train starting from the yolov8n.pt weights on dataset/dataset.yaml: 100 epochs, image size 320.
!yolo train data=dataset/dataset.yaml model=yolov8n.pt epochs=100 imgsz=320

# 7. Extract real coordinates of objects

Run camera_calibration.ipynb first to obtain the calibration parameters used below: camera_matrix, dist_coeffs, R_cam and t_cam.

In [None]:
from ultralytics import YOLO
import cv2
import numpy as np

# Trained detection weights loaded for inference.
model = YOLO("runs/detect/train2/weights/best.pt")
# Index of the camera opened below.
CAP_INDEX = 1

# Calibration parameters (see camera_calibration.ipynb).
# 3x3 camera matrix, passed as K to pixel_to_world.
camera_matrix = np.array([[546.25061067,   0.,         317.92410558],
                          [  0.,         548.85987516, 239.71830120],
                          [  0.,           0.,           1.        ]])

# Distortion coefficients passed to cv2.undistortPoints.
dist_coeffs = np.array([[ 0.23208356, -1.03167154, -0.00369298, -0.00400159, 2.99040251]])

# Rotation (3x3) and translation (3x1) applied as R @ P_world + t to get camera coordinates.
R_cam = np.array([[ 0.99418228,  0.03701466, -0.10115093],
                  [-0.03531900,  0.99920477,  0.01850404],
                  [ 0.10175541, -0.01482384,  0.99469899]])

t_cam = np.array([[-1.98635758],
                  [-1.26039358],
                  [ 6.79782214]])

# A detection is matched to an existing object ID when its centre lies within
# this many pixels of the stored centre on both the x and the y axis.
DIST_THRESH = 50  

# Multiplier applied to the world X and Y values computed by pixel_to_world.
SCALE_FACTOR = 2.4   # adjust based on real-world measurements

# To verify the coordinates, measure the real-world distance between two points
# and compare it with the distance between the world coordinates computed for
# the same two points. If they differ, adjust SCALE_FACTOR.

cap = cv2.VideoCapture(CAP_INDEX)
if not cap.isOpened():
    raise RuntimeError(f"Cannot open camera {CAP_INDEX}")

# Last object ID handed out, and the most recent centre pixel stored per ID.
object_id = 0
centers = {}

def pixel_to_world(u_px, v_px, K, dist, R, t):
    """Back-project an image pixel onto the world plane Z = 0.

    The pixel is first undistorted, then the system
    ``s * [u, v, 1]^T = K @ (R[:, :2] @ [X, Y]^T + t)`` is solved for the
    unknowns X, Y and s.

    Args:
        u_px, v_px: Pixel coordinates in the (distorted) image.
        K: 3x3 camera matrix.
        dist: Distortion coefficients as accepted by cv2.undistortPoints.
        R: 3x3 rotation matrix.
        t: Translation vector with 3 elements (3x1 or flat).

    Returns:
        ``(X, Y, s, (u_corr, v_corr))``: X and Y are the plane coordinates
        multiplied by the module-level SCALE_FACTOR, s is the solved scale
        term, and ``(u_corr, v_corr)`` is the undistorted pixel.
    """
    pts = np.array([[[u_px, v_px]]], dtype=np.float32)
    # P=K re-projects the undistorted point with the same camera matrix.
    und = cv2.undistortPoints(pts, K, dist, P=K)
    u_corr, v_corr = und[0, 0]

    p = np.array([u_corr, v_corr, 1.0]).reshape(3, 1)
    M = K @ R[:, :2]
    # Force a column vector so a flat t gives the same system as a 3x1 t.
    b = K @ np.asarray(t, dtype=float).reshape(3, 1)

    # Rearranged system: [M | -p] @ [X, Y, s]^T = -b
    A = np.hstack((M, -p))
    rhs = -b

    try:
        sol = np.linalg.solve(A, rhs)
    except np.linalg.LinAlgError:
        # Singular system: fall back to the least-squares solution.
        sol, *_ = np.linalg.lstsq(A, rhs, rcond=None)

    # Flatten before converting: float() on a shape-(1,) array is deprecated
    # in recent NumPy releases.
    x_plane, y_plane, s = (float(v) for v in np.ravel(sol))
    X = x_plane * SCALE_FACTOR
    Y = y_plane * SCALE_FACTOR

    return X, Y, s, (u_corr, v_corr)

def worldXY_to_cameraXYZ(X_world, Y_world, R, t):
    """Transform a point on the world plane Z = 0 into camera coordinates.

    Computes ``R @ [X_world, Y_world, 0]^T + t``.

    Args:
        X_world, Y_world: Coordinates of the point on the world plane.
        R: 3x3 rotation matrix.
        t: Translation vector with 3 elements (3x1 or flat).

    Returns:
        Tuple ``(Xc, Yc, Zc)`` of Python floats.
    """
    Pw = np.array([[X_world], [Y_world], [0.0]])
    # Force a column vector: a flat t would broadcast the sum to a 3x3 matrix.
    t_col = np.asarray(t, dtype=float).reshape(3, 1)
    Pc = R @ Pw + t_col
    # Flatten before converting: float() on a shape-(1,) array is deprecated
    # in recent NumPy releases.
    xc, yc, zc = (float(v) for v in np.ravel(Pc))
    return xc, yc, zc

# Live loop: detect objects in each camera frame, give each detection a simple
# proximity-based ID, convert its centre pixel to world and camera coordinates,
# and draw/print the result. Runs until 'q' is pressed or a frame read fails.
print("Starting capture. Press 'q' to quit.")
# try/finally releases the camera and closes the window even when inference
# raises or the cell is interrupted; otherwise the device stays locked.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("No frame, exiting")
            break

        im_h, im_w = frame.shape[:2]
        results = model(frame)

        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                cls = int(box.cls[0])
                conf = float(box.conf[0])

                # Centre of the bounding box in pixels.
                cx = int((x1 + x2) / 2)
                cy = int((y1 + y2) / 2)

                # Reuse the first stored ID whose centre is within DIST_THRESH
                # pixels on both axes; otherwise hand out a new ID.
                found = None
                for oid, (px, py) in centers.items():
                    if abs(cx - px) < DIST_THRESH and abs(cy - py) < DIST_THRESH:
                        found = oid
                        break
                if found is None:
                    object_id += 1
                    found = object_id
                centers[found] = (cx, cy)

                Xw, Yw, scale_s, (u_corr, v_corr) = pixel_to_world(cx, cy, camera_matrix, dist_coeffs, R_cam, t_cam)
                Xc, Yc, Zc = worldXY_to_cameraXYZ(Xw, Yw, R_cam, t_cam)

                # Bounding box, centre dot and full-width/height cross-hair.
                cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)
                cv2.circle(frame, (cx, cy), 5, (0,0,255), -1)
                cv2.line(frame, (0, cy), (im_w, cy), (0,0,255), 1)
                cv2.line(frame, (cx, 0), (cx, im_h), (0,0,255), 1)

                txt = f"ID:{found} ({cx},{cy}) class:{model.names[cls]} conf:{conf:.2f}"
                world_txt = f"World(X,Y): {Xw:.2f}, {Yw:.2f}"
                cam_txt = f"CamXYZ: {Xc:.2f}, {Yc:.2f}, {Zc:.2f}"

                # Three text rows stacked above the top-left corner of the box.
                cv2.putText(frame, txt, (x1, y1-30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
                cv2.putText(frame, world_txt, (x1, y1-14), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,255), 1)
                cv2.putText(frame, cam_txt, (x1, y1-2), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (200,200,200), 1)

                print(f"ID {found} | pixel=({cx},{cy}) undist=({u_corr:.1f},{v_corr:.1f}) | "
                      f"World(X,Y)=({Xw:.3f},{Yw:.3f}) | CamXYZ=({Xc:.3f},{Yc:.3f},{Zc:.3f})")

        cv2.imshow("YOLO + Calibration -> World coords", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
