Step 1: Install YOLOv5 and necessary libraries

In [2]:
# Step 1: Clone YOLOv5 and install requirements (only run once)
!git clone https://github.com/ultralytics/yolov5
%cd yolov5
!pip install -r requirements.txt


c:\Users\admin\Downloads\Code\ObjectDetection\yolov5


fatal: destination path 'yolov5' already exists and is not an empty directory.
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


Step 2: Generate YOLO-Formatted Subset for Training


In [3]:
import os
import shutil
import json
from tqdm import tqdm
from collections import defaultdict

# Convert the COCO-format subset under `subset_root` into the directory layout
# YOLOv5 expects under `output_root`:
#   images/<split>/<image file>
#   labels/<split>/<image stem>.txt   (one "class x_center y_center width height" line per box,
#                                      all coordinates normalised by the image size)
# The names `output_root`, `category_ids` and `category_id_to_name` are reused by the
# YAML-generation cell, so they are kept as notebook-level variables.

# Set paths
subset_root = '../coco_subset'
output_root = '../coco_yolo_dataset'
splits = ['train', 'val']

# Start from a clean output tree so files from a previous run cannot linger.
# The message is only printed when something was actually removed.
if os.path.exists(output_root):
    shutil.rmtree(output_root)
    print("Old output directory deleted.")

# Create new directory structure
for split in splits:
    os.makedirs(os.path.join(output_root, 'images', split), exist_ok=True)
    os.makedirs(os.path.join(output_root, 'labels', split), exist_ok=True)

# Load annotations
annotation_files = {
    'train': os.path.join(subset_root, 'annotations', 'instances_train2017.json'),
    'val': os.path.join(subset_root, 'annotations', 'instances_val2017.json')
}

coco_by_split = {}
for split in splits:
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(annotation_files[split], 'r', encoding='utf-8') as f:
        coco_by_split[split] = json.load(f)

# Build ONE category -> class-index mapping shared by every split (and by the YAML cell).
# Building it per split would give train and val different class indices whenever their
# category lists differ, silently corrupting the labels.
category_id_to_name = {}
for split in splits:
    for cat in coco_by_split[split]['categories']:
        category_id_to_name.setdefault(cat['id'], cat['name'])
category_ids = sorted(category_id_to_name)
category_id_to_index = {cat_id: idx for idx, cat_id in enumerate(category_ids)}

for split in splits:
    coco_data = coco_by_split[split]
    images = coco_data['images']

    # Group annotations by the image they belong to for O(1) lookup below.
    annotations_by_image = defaultdict(list)
    for ann in coco_data['annotations']:
        annotations_by_image[ann['image_id']].append(ann)

    for img in tqdm(images, desc=f'Processing {split}'):
        file_name = img['file_name']
        img_w, img_h = img['width'], img['height']

        # Copy image
        src_img_path = os.path.join(subset_root, f'{split}2017', file_name)
        dst_img_path = os.path.join(output_root, 'images', split, file_name)
        shutil.copyfile(src_img_path, dst_img_path)

        # Generate label
        label_lines = []
        for ann in annotations_by_image.get(img['id'], []):
            class_index = category_id_to_index.get(ann['category_id'])
            if class_index is None:
                # Annotation refers to a category that is not declared in `categories`.
                continue
            # COCO bbox is [x_min, y_min, width, height] in pixels.
            box_x, box_y, box_w, box_h = ann['bbox']
            if box_w <= 0 or box_h <= 0:
                # Degenerate box: carries no usable training signal.
                continue
            x_center = (box_x + box_w / 2) / img_w
            y_center = (box_y + box_h / 2) / img_h
            norm_w = box_w / img_w
            norm_h = box_h / img_h
            label_lines.append(
                f"{class_index} {x_center:.6f} {y_center:.6f} {norm_w:.6f} {norm_h:.6f}"
            )

        # Images without any box get no label file (only the image is copied).
        if label_lines:
            # splitext handles any image extension; str.replace('.jpg', ...) only handled
            # lower-case '.jpg' and would also rewrite a '.jpg' occurring mid-name.
            label_file = os.path.splitext(file_name)[0] + '.txt'
            label_path = os.path.join(output_root, 'labels', split, label_file)
            with open(label_path, 'w') as f:
                f.write("\n".join(label_lines))

print("YOLO dataset generation complete.")


Old output directory deleted.


Processing train: 100%|██████████| 5000/5000 [00:33<00:00, 150.18it/s]
Processing val: 100%|██████████| 1000/1000 [00:06<00:00, 160.02it/s]

YOLO dataset generation complete.





Step 3: Create data configuration file smartcity.yaml

In [4]:
# Generate YAML config for YOLOv5 training.
# Depends on `output_root`, `category_ids` and `category_id_to_name` produced by the
# dataset-generation cell (Step 2), and on the `os`/`json` imports made there.
yaml_path = 'data/smartcity.yaml'

# json.dumps emits a double-quoted, fully escaped string, which is also a valid YAML
# scalar; hand-written single quotes break as soon as a class name contains an apostrophe.
quoted_names = ", ".join(json.dumps(category_id_to_name[cid]) for cid in category_ids)

yaml_lines = [
    f"train: {output_root}/images/train",
    f"val: {output_root}/images/val",
    f"nc: {len(category_ids)}",
    f"names: [{quoted_names}]",
]

os.makedirs('data', exist_ok=True)
with open(yaml_path, 'w') as f:
    # End the file with a newline so it is a well-formed text file.
    f.write("\n".join(yaml_lines) + "\n")

print(f"smartcity.yaml created with {len(category_ids)} classes.")


smartcity.yaml created with 8 classes.


Step 4: YOLOv5 model training and testing

Step 4.1: Start model training

In [9]:

# Start training
!python train.py \
  --img 640 \
  --batch 2 \
  --epochs 5 \
  --data data/smartcity.yaml \
  --weights yolov5s.pt \
  --project runs/train \
  --name smartcity_yolo5_test \
  --exist-ok


[34m[1mtrain: [0mweights=yolov5s.pt, cfg=, data=data/smartcity.yaml, hyp=data\hyps\hyp.scratch-low.yaml, epochs=5, batch_size=2, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, evolve_population=data\hyps, resume_evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=smartcity_yolo5_test, exist_ok=True, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest, ndjson_console=False, ndjson_file=False
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 
YOLOv5  v7.0-419-g50f68be4 Python-3.10.4 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4070 SUPER, 12282MiB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.

Step 4.2: Inference Detection and Visualization of Prediction Results

In [12]:
# must be in the yolov5 folder
!python detect.py \
  --weights runs/train/smartcity_yolo5_test/weights/best.pt \
  --img 640 \
  --conf 0.25 \
  --source ../coco_yolo_dataset/images/val \
  --project runs/detect \
  --name smartcity_yolo5_test \
  --exist-ok


[34m[1mdetect: [0mweights=['runs/train/smartcity_yolo5_test/weights/best.pt'], source=../coco_yolo_dataset/images/val, data=data\coco128.yaml, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_format=0, save_csv=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=smartcity_yolo5_test, exist_ok=True, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1
YOLOv5  v7.0-419-g50f68be4 Python-3.10.4 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4070 SUPER, 12282MiB)

Fusing layers... 
Model summary: 157 layers, 7031701 parameters, 0 gradients, 15.8 GFLOPs
image 1/1000 C:\Users\admin\Downloads\Code\ObjectDetection\coco_yolo_dataset\images\val\000000000139.jpg: 448x640 1 person, 21.7ms
image 2/1000 C:\Users\admin\Downloads\Code\ObjectDetection\coco_yolo_dataset\images\val\000000000724.jpg: 

Step 4.3: Run inference on videos and save annotated output

In [13]:
import os
import sys
import cv2
import torch
from pathlib import Path
from collections import Counter

# === Setting the path ===
# Location of the YOLOv5 checkout. Defaults to the original hard-coded location; set the
# YOLOV5_PATH environment variable to run this cell on a different machine.
yolov5_path = os.environ.get(
    "YOLOV5_PATH", r"C:\Users\admin\Downloads\Code\ObjectDetection\yolov5"
)

# The checkout must be FIRST on sys.path so that its `models` and `utils` packages win
# over any same-named package. Remove earlier copies of the entry first so re-running
# the cell does not stack duplicates.
while yolov5_path in sys.path:
    sys.path.remove(yolov5_path)
sys.path.insert(0, yolov5_path)

from models.common import DetectMultiBackend
from utils.general import non_max_suppression
from utils.plots import Annotator, colors

# === Loading models ===
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint path: <yolov5>/runs/train/smartcity_yolo5_test/weights/best.pt
train_run_dir = os.path.join(yolov5_path, "runs", "train", "smartcity_yolo5_test")
weights_path = os.path.join(train_run_dir, "weights", "best.pt")
model = DetectMultiBackend(weights_path, device=device)
print(f"Model loaded on {device}")

# === Video Path ===
videos_dir = r"C:\Users\admin\Downloads\Code\ObjectDetection\videos"
# Keep only entries whose name ends in ".mp4" (compared case-insensitively).
video_files = []
for entry_name in os.listdir(videos_dir):
    if entry_name.lower().endswith(".mp4"):
        video_files.append(entry_name)

# === Output directory ===
# Annotated videos are written to <yolov5>/runs/inference (created if missing).
runs_dir = os.path.join(yolov5_path, "runs")
output_dir = os.path.join(runs_dir, "inference")
os.makedirs(output_dir, exist_ok=True)

# === pad images to multiples of 32 ===
def pad_to_multiple_of_32(img, multiple=32):
    """Pad an image on the bottom and right so both sides are multiples of `multiple`.

    Padding is added only below and to the right, so the top-left corner stays in
    place and pixel coordinates in the padded image equal those in the input.

    Args:
        img: Image array whose first two dimensions are (height, width). A trailing
            channel dimension is optional, so 2-D (single-channel) images work too.
        multiple: Positive integer that the padded height and width must be divisible
            by. Defaults to 32.

    Returns:
        The value returned by ``cv2.copyMakeBorder``: the image extended with a
        constant border of value (114, 114, 114).

    Raises:
        ValueError: If ``multiple`` is not a positive integer.
    """
    if multiple <= 0:
        raise ValueError(f"multiple must be a positive integer, got {multiple}")

    # Only the first two dimensions matter; slicing avoids failing on 2-D input.
    h, w = img.shape[:2]
    # -x % m is the distance from x up to the next multiple of m (0 if already aligned).
    pad_bottom = -h % multiple
    pad_right = -w % multiple
    return cv2.copyMakeBorder(
        img, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=[114, 114, 114]
    )

# === Video processing ===
# For every file in `video_files`: read it frame by frame, run the detector on each
# frame, draw the detections on a copy of the original BGR frame, write that frame to
# <output_dir>/annotated_<name>, and finally print how often each class was detected.
FALLBACK_FPS = 30.0  # used only when the container reports no usable frame rate

for video_file in video_files:
    video_path = os.path.join(videos_dir, video_file)
    print(f"\n Processing: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Cannot open video: {video_file}")
        cap.release()
        continue

    out = None
    counts = Counter()  # class id -> number of detections over the whole video
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps != fps:
            # 0.0 or NaN: a writer created with such a rate yields an unusable file.
            fps = FALLBACK_FPS

        out_path = os.path.join(output_dir, f"annotated_{video_file}")
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # Without this check every write would be dropped silently.
            print(f"Cannot create output video: {out_path}")
            continue

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # The model is fed RGB; `frame` itself stays BGR for drawing and writing.
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            padded_img = pad_to_multiple_of_32(img_rgb)

            # HWC -> CHW, scale by 1/255, add the batch dimension, move to the device.
            tensor_img = torch.from_numpy(padded_img).permute(2, 0, 1).float() / 255.0
            tensor_img = tensor_img.unsqueeze(0).to(device)

            with torch.no_grad():
                pred_raw = model(tensor_img, augment=False, visualize=False)
                pred = non_max_suppression(pred_raw, conf_thres=0.25, iou_thres=0.45)[0]

            # Visualization
            # Padding was added only at the bottom/right, so box coordinates from the
            # padded image can be drawn on the unpadded frame unchanged.
            annotator = Annotator(frame.copy(), line_width=2, example=str(model.names))

            if pred is not None and len(pred):
                # Each row: x1, y1, x2, y2, confidence, class id.
                for x1, y1, x2, y2, conf_val, cls in pred[:, :6].cpu().tolist():
                    cls_id = int(cls)
                    label = f"{model.names[cls_id]} {conf_val:.2f}"
                    annotator.box_label([x1, y1, x2, y2], label, color=colors(cls_id, True))
                    counts[cls_id] += 1

            out.write(annotator.result())
    finally:
        # Always release the native handles, even if inference raised mid-video.
        cap.release()
        if out is not None:
            out.release()

    print(f"Output saved to: {out_path}")

    # Category statistics
    print("Detection Summary:")
    for cls_id, count in counts.items():
        print(f"  - {model.names[cls_id]}: {count} times")


Fusing layers... 
Model summary: 157 layers, 7031701 parameters, 0 gradients, 15.8 GFLOPs


Model loaded on cuda

 Processing: C:\Users\admin\Downloads\Code\ObjectDetection\videos\videoplayback.mp4
Output saved to: C:\Users\admin\Downloads\Code\ObjectDetection\yolov5\runs\inference\annotated_videoplayback.mp4
Detection Summary:
  - traffic light: 3755 times
  - car: 4059 times
  - person: 500 times
  - truck: 5 times
