In [1]:
import cv2
import numpy as np
import time
from ultralytics import YOLO
import tkinter as tk
from tkinter import filedialog
import os

In [2]:
# YOLOv8 checkpoint file names to benchmark, one per variant suffix
# (n, s, m, l, x), in that order.
models = [f'yolov8{variant}.pt' for variant in 'nsmlx']

In [3]:
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Each box is indexed as ``[x1, y1, x2, y2]``; only the first four
    elements are read.

    Returns:
        ``intersection / union``, or ``0`` when the union area is not
        positive.
    """
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])

    # Clamp each side length at zero so disjoint boxes contribute no overlap.
    overlap = max(0, right - left) * max(0, bottom - top)

    area_a = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area_b = (box2[2] - box2[0]) * (box2[3] - box2[1])
    combined = area_a + area_b - overlap

    if combined > 0:
        return overlap / combined
    return 0

In [4]:
def calculate_map(detections, num_classes=1, iou_threshold=0.5, confidence_threshold=0.5,
                  ground_truths=None):
    """Compute the 11-point interpolated mean average precision over classes.

    Args:
        detections: List of dicts with keys ``'class'``, ``'confidence'`` and
            ``'bbox'`` (``[x1, y1, x2, y2]``). An optional ``'image_id'`` key
            restricts matching to reference boxes with the same id.
        num_classes: Class ids ``0 .. num_classes - 1`` are evaluated.
        iou_threshold: Minimum IoU for a detection to match a reference box.
        confidence_threshold: Detections scoring below this are ignored.
        ground_truths: Optional list of dicts with keys ``'class'`` and
            ``'bbox'`` (plus optional ``'image_id'``) holding the annotated
            reference boxes.

    Returns:
        The mean of the per-class average precisions as a float, or ``0.0``
        when nothing could be evaluated.

    Note:
        When ``ground_truths`` is omitted the filtered detections are used as
        their own reference boxes (the original behaviour). Every box then
        overlaps itself perfectly, so the result is almost always 1.0 and is
        NOT a measure of detection accuracy. Pass real annotations to obtain
        a meaningful mAP.
    """
    if not detections:
        return 0.0  # Nothing was detected

    # Build the recall levels as k / 10: multiples of the float 0.1 (as
    # produced by np.arange) drift to values such as 0.30000000000000004,
    # which makes a recall of exactly 0.3 fail the ">=" comparison.
    recall_levels = [k / 10 for k in range(11)]
    average_precisions = []

    for class_id in range(num_classes):
        class_detections = sorted(
            (d for d in detections
             if d['class'] == class_id and d['confidence'] >= confidence_threshold),
            key=lambda d: d['confidence'],
            reverse=True,
        )

        if ground_truths is None:
            # Legacy mode: score the detections against themselves.
            if not class_detections:
                continue
            references = class_detections
        else:
            references = [g for g in ground_truths if g['class'] == class_id]
            if not references:
                continue  # AP is undefined for a class without reference boxes
            if not class_detections:
                average_precisions.append(0.0)  # Every reference box was missed
                continue

        num_refs = len(references)
        true_positives = np.zeros(len(class_detections))
        false_positives = np.zeros(len(class_detections))
        matched = set()

        # Greedy matching in descending confidence order; each reference box
        # can be claimed by at most one detection.
        for i, detection in enumerate(class_detections):
            best_iou = 0
            best_idx = -1

            for j, ref in enumerate(references):
                if j in matched:
                    continue
                if detection.get('image_id') != ref.get('image_id'):
                    continue

                iou = calculate_iou(detection['bbox'], ref['bbox'])
                if iou > best_iou:
                    best_iou = iou
                    best_idx = j

            # best_idx stays -1 when nothing overlaps; never count that as a
            # match, even if iou_threshold is 0.
            if best_idx >= 0 and best_iou >= iou_threshold:
                true_positives[i] = 1
                matched.add(best_idx)
            else:
                false_positives[i] = 1

        cumulative_tp = np.cumsum(true_positives)
        cumulative_fp = np.cumsum(false_positives)

        recalls = cumulative_tp / num_refs
        # The denominator is the 1-based rank of each detection, never zero.
        precisions = cumulative_tp / (cumulative_tp + cumulative_fp)

        # 11-point interpolation: average the best precision reachable at or
        # beyond each recall level.
        ap = 0.0
        for level in recall_levels:
            reached = recalls >= level
            ap += (np.max(precisions[reached]) if reached.any() else 0.0) / 11
        average_precisions.append(ap)

    if not average_precisions:
        return 0.0  # No class could be evaluated

    return float(np.mean(average_precisions))


In [5]:
def detect_people(frame, model, conf_threshold=0.5, iou_threshold=0.5):
    """Run ``model`` on ``frame`` and annotate every class-0 (person) box.

    The frame is drawn on in place: one green rectangle and confidence label
    per person, plus a red "People: N" counter in the top-left corner.

    Args:
        frame: Image passed to the model and to the OpenCV drawing calls.
        model: Callable invoked as ``model(frame, conf=..., iou=...)``; each
            result must expose ``.boxes`` whose items provide ``cls``,
            ``xyxy`` and ``conf``.
        conf_threshold: Forwarded to the model as ``conf``.
        iou_threshold: Forwarded to the model as ``iou``.

    Returns:
        Tuple ``(frame, person_count, inference_time, detections)`` where
        ``inference_time`` is the wall-clock duration of the model call in
        seconds and ``detections`` is a list of dicts with keys ``'class'``,
        ``'confidence'`` and ``'bbox'`` (``[x1, y1, x2, y2]`` as ints).
    """
    t0 = time.time()
    results = model(frame, conf=conf_threshold, iou=iou_threshold)
    inference_time = time.time() - t0

    green = (0, 255, 0)
    detections = []
    for result in results:
        for box in result.boxes:
            # Class 0 is "person" in the COCO dataset; skip everything else.
            if not (box.cls == 0):
                continue

            x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
            conf = float(box.conf)
            cv2.rectangle(frame, (x1, y1), (x2, y2), green, 2)
            cv2.putText(frame, f"Person: {conf:.2f}", (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, green, 2)
            detections.append({'class': 0, 'confidence': conf, 'bbox': [x1, y1, x2, y2]})

    # Exactly one detection is recorded per person box.
    person_count = len(detections)
    cv2.putText(frame, f"People: {person_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
    return frame, person_count, inference_time, detections

In [6]:
def process_webcam():
    """Benchmark every checkpoint in ``models`` on live frames from webcam 0.

    For each model, up to 100 frames are read, annotated by ``detect_people``
    and shown in the 'Webcam Detection' window. The average person count,
    average inference time and ``calculate_map`` score are then printed.
    Pressing ``q`` stops the current model and skips the remaining ones.

    The capture device and all OpenCV windows are released on exit, including
    when an exception is raised.
    """
    max_frames = 100  # Frames processed per model

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Tidak dapat membuka webcam.")
        cap.release()
        return

    try:
        for model_name in models:
            print(f"\nMenggunakan model: {model_name}")
            model = YOLO(model_name)

            frame_count = 0
            total_inference_time = 0
            total_person_count = 0
            all_detections = []
            quit_requested = False

            while frame_count < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_count += 1
                frame_result, person_count, inference_time, detections = detect_people(frame, model)

                total_inference_time += inference_time
                total_person_count += person_count
                all_detections.extend(detections)

                cv2.putText(frame_result, f"Model: {model_name}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
                cv2.putText(frame_result, f"Inference Time: {inference_time:.4f}s", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)

                cv2.imshow('Webcam Detection', frame_result)

                # Remember the key press: a later waitKey() call would not
                # see the same 'q' again.
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    quit_requested = True
                    break

            if frame_count == 0:
                # Avoid dividing by zero; the camera delivers nothing, so the
                # remaining models cannot be evaluated either.
                print("Tidak ada frame yang dapat dibaca dari webcam.")
                break

            avg_inference_time = total_inference_time / frame_count
            avg_person_count = total_person_count / frame_count
            estimated_map = calculate_map(all_detections)

            print(f"Rata-rata jumlah orang terdeteksi per frame: {avg_person_count:.2f}")
            print(f"Rata-rata inference time: {avg_inference_time:.4f} detik")
            print(f"Estimated mAP: {estimated_map:.4f}")

            if quit_requested or cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

In [7]:
def process_image():
    """Let the user pick an image and run every checkpoint in ``models`` on it.

    For each model the person count, inference time, end-to-end latency and
    ``calculate_map`` score are printed, and the annotated copy is shown in
    its own window until a key is pressed.

    Returns early with a message when no file is chosen or the chosen file
    cannot be decoded as an image. All OpenCV windows are closed on exit.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename()
    finally:
        # Dispose of the hidden root window so repeated calls do not
        # accumulate Tk instances.
        root.destroy()

    if not file_path:
        print("Tidak ada file yang dipilih.")
        return

    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        print(f"Gagal membaca gambar: {file_path}")
        return

    try:
        for model_name in models:
            print(f"\nMenggunakan model: {model_name}")
            model = YOLO(model_name)

            # Each model gets a fresh copy because detect_people draws on
            # the frame it is given.
            start_time = time.time()
            img_result, person_count, inference_time, detections = detect_people(img.copy(), model)
            latency_time = time.time() - start_time

            estimated_map = calculate_map(detections)

            print(f"Jumlah orang terdeteksi: {person_count}")
            print(f"Inference time: {inference_time:.4f} detik")
            print(f"Latency time: {latency_time:.4f} detik")
            print(f"Estimated mAP: {estimated_map:.4f}")

            cv2.imshow(f'Detection Result - {model_name}', img_result)
            cv2.waitKey(0)
    finally:
        cv2.destroyAllWindows()

In [8]:
def process_video():
    """Let the user pick a video and run every checkpoint in ``models`` on it.

    The video is reopened for each model so that every model processes the
    file from its first frame. Frames are annotated by ``detect_people`` and
    shown in the 'Video Detection' window. After each model the average
    person count, average inference time and ``calculate_map`` score are
    printed. Pressing ``q`` stops the current model and skips the rest.

    Returns early with a message when no file is chosen. The capture and all
    OpenCV windows are released on exit, including on exceptions.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename()
    finally:
        # Dispose of the hidden root window so repeated calls do not
        # accumulate Tk instances.
        root.destroy()

    if not file_path:
        print("Tidak ada file yang dipilih.")
        return

    try:
        for model_name in models:
            print(f"\nMenggunakan model: {model_name}")

            # A capture shared across models would already be at end-of-file
            # after the first model, leaving the others with zero frames.
            cap = cv2.VideoCapture(file_path)
            if not cap.isOpened():
                print(f"Gagal membuka video: {file_path}")
                cap.release()
                break

            frame_count = 0
            total_inference_time = 0
            total_person_count = 0
            all_detections = []
            quit_requested = False

            try:
                model = YOLO(model_name)

                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break

                    frame_count += 1

                    frame_result, person_count, inference_time, detections = detect_people(frame, model)

                    total_inference_time += inference_time
                    total_person_count += person_count
                    all_detections.extend(detections)

                    cv2.putText(frame_result, f"Model: {model_name}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
                    cv2.putText(frame_result, f"Frame: {frame_count}", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)

                    cv2.imshow('Video Detection', frame_result)

                    # Remember the key press: a later waitKey() call would
                    # not see the same 'q' again.
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        quit_requested = True
                        break
            finally:
                cap.release()

            if frame_count == 0:
                # Avoid dividing by zero when the file yields no frames.
                print("Tidak ada frame yang dapat dibaca dari video.")
                break

            avg_inference_time = total_inference_time / frame_count
            avg_person_count = total_person_count / frame_count
            estimated_map = calculate_map(all_detections)

            print(f"Rata-rata jumlah orang terdeteksi per frame: {avg_person_count:.2f}")
            print(f"Rata-rata inference time: {avg_inference_time:.4f} detik")
            print(f"Estimated mAP: {estimated_map:.4f}")

            if quit_requested or cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cv2.destroyAllWindows()

In [None]:
def main():
    """Show the mode menu in a loop and dispatch on the user's choice.

    Choices '1', '2' and '3' run the webcam, image and video pipelines
    respectively; '4' leaves the loop. Any other input prints an error
    message and shows the menu again.
    """
    menu_lines = (
        "\nPilih mode:",
        "1. Deteksi menggunakan webcam",
        "2. Deteksi pada gambar",
        "3. Deteksi pada video",
        "4. Keluar",
    )

    while True:
        for line in menu_lines:
            print(line)

        choice = input("Masukkan pilihan (1/2/3/4): ")

        if choice == '4':
            return

        if choice == '1':
            process_webcam()
        elif choice == '2':
            process_image()
        elif choice == '3':
            process_video()
        else:
            print("Pilihan tidak valid. Silakan coba lagi.")

# Start the interactive menu only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Pilih mode:
1. Deteksi menggunakan webcam
2. Deteksi pada gambar
3. Deteksi pada video
4. Keluar


Masukkan pilihan (1/2/3/4):  2



Menggunakan model: yolov8n.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:03<00:00, 1.71MB/s]



0: 480x640 2 persons, 61.4ms
Speed: 4.0ms preprocess, 61.4ms inference, 142.2ms postprocess per image at shape (1, 3, 480, 640)
Jumlah orang terdeteksi: 2
Inference time: 2.5646 detik
Latency time: 2.6032 detik
Estimated mAP: 1.0000

Menggunakan model: yolov8s.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:12<00:00, 1.85MB/s]



0: 480x640 2 persons, 1 clock, 56.1ms
Speed: 3.9ms preprocess, 56.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Jumlah orang terdeteksi: 2
Inference time: 0.2845 detik
Latency time: 0.2870 detik
Estimated mAP: 1.0000

Menggunakan model: yolov8m.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8m.pt to 'yolov8m.pt'...


100%|██████████| 49.7M/49.7M [00:09<00:00, 5.46MB/s]



0: 480x640 2 persons, 63.6ms
Speed: 4.0ms preprocess, 63.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
Jumlah orang terdeteksi: 2
Inference time: 0.4362 detik
Latency time: 0.4377 detik
Estimated mAP: 1.0000

Menggunakan model: yolov8l.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8l.pt to 'yolov8l.pt'...


100%|██████████| 83.7M/83.7M [00:26<00:00, 3.28MB/s]



0: 480x640 2 persons, 1 clock, 44.6ms
Speed: 6.5ms preprocess, 44.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Jumlah orang terdeteksi: 2
Inference time: 0.5957 detik
Latency time: 0.5967 detik
Estimated mAP: 1.0000

Menggunakan model: yolov8x.pt
Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8x.pt to 'yolov8x.pt'...


100%|██████████| 131M/131M [00:11<00:00, 11.5MB/s] 



0: 480x640 2 persons, 1 clock, 50.7ms
Speed: 2.0ms preprocess, 50.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)
Jumlah orang terdeteksi: 2
Inference time: 0.8188 detik
Latency time: 0.8198 detik
Estimated mAP: 1.0000

Pilih mode:
1. Deteksi menggunakan webcam
2. Deteksi pada gambar
3. Deteksi pada video
4. Keluar
