In [1]:
# Setup: install necessary libraries (run in a Python environment)
!pip install ultralytics timm torchvision scikit-learn tqdm




In [2]:
import os
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from ultralytics import YOLO  # YOLOv8 API
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt


In [3]:
class VisDroneDataset(Dataset):
    def __init__(self, images_dir, labels_dir, transform=None):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.transform = transform
        # List all image files
        self.image_files = [f for f in os.listdir(images_dir) if f.endswith('.jpg')]
    def __len__(self):
        return len(self.image_files)
    def __getitem__(self, idx):
        img_path = os.path.join(self.images_dir, self.image_files[idx])
        label_path = os.path.join(self.labels_dir, self.image_files[idx].replace('.jpg','.txt'))
        image = Image.open(img_path).convert('RGB')
        # Read YOLO labels
        boxes, labels = [], []
        if os.path.exists(label_path):
            with open(label_path) as f:
                for line in f:
                    cls, x, y, w, h = map(float, line.split())
                    labels.append(int(cls))
                    # Convert normalized centers to pixel coords (optional)
                    boxes.append([x, y, w, h])
        target = {"boxes": torch.tensor(boxes), "labels": torch.tensor(labels)}
        if self.transform:
            image = self.transform(image)
        return image, target

# Example usage (paths to be set appropriately)
# train_ds = VisDroneDataset('VisDrone/images/train', 'VisDrone/labels/train', transform=transforms.ToTensor())
# train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, pin_memory=True)


In [4]:
# Load pretrained YOLOv8n model
model = YOLO("yolov8n.pt")  # COCO-pretrained


In [5]:
#Dummy example: run on one image
image = r"C:\Users\koust\OneDrive\Pictures\maa.jpg"
results = model(image)  # returns detection results



image 1/1 C:\Users\koust\OneDrive\Pictures\maa.jpg: 640x512 1 person, 27.7ms
Speed: 1.9ms preprocess, 27.7ms inference, 75.1ms postprocess per image at shape (1, 3, 640, 512)


In [6]:
# Validate YOLOv8n on VisDrone (requires dataset YAML and GPU)
metrics = model.val(data="VisDrone.yaml", imgsz=640)
print("mAP50-95:", metrics.box.map, "mAP50:", metrics.box.map50)


Ultralytics 8.3.205  Python-3.13.5 torch-2.8.0+cu129 CUDA:0 (NVIDIA GeForce RTX 5070, 12226MiB)
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 1419.5357.0 MB/s, size: 120.0 KB)
[K[34m[1mval: [0mScanning C:\Users\koust\datasets\VisDrone\labels\val.cache... 548 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 548/548 1.3Mit/s 0.0s0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 35/35 11.7it/s 3.0s0.1s
                   all        548      38759     0.0639     0.0486     0.0306      0.014
                person        520       8844      0.274      0.179      0.143      0.061
               bicycle        482       5125     0.0722   0.000976     0.0471     0.0152
                   car        364       1287    0.00221      0.028    0.00146   0.000472
            motorcycle        515      14064    0.00762   0.000356     0.0305     0.0196
              airplane        421       1975     0.0666   0.000506

In [7]:
import timm

# Load pretrained classifiers
mobilenet = timm.create_model('mobilenetv3_small_100', pretrained=True)
vit = timm.create_model('vit_tiny_patch16_224', pretrained=True)
mobilenet.eval()
vit.eval()

# Example transform for classifier input (ImageNet preprocess)
clf_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])


In [8]:
from sklearn.metrics import accuracy_score, f1_score

y_true = [0, 1, 2, 1, 0]   # example ground truth classes
y_pred = [0, 2, 2, 1, 0]   # example model predictions
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
print(f"Accuracy: {acc:.2f}, F1-score: {f1:.2f}")


Accuracy: 0.80, F1-score: 0.78


In [9]:
def show_detections(image, boxes, labels):
    plt.figure(figsize=(8,6))
    plt.imshow(image)
    ax = plt.gca()
    for (x, y, w, h), cls in zip(boxes, labels):
        # Convert normalized xywh to pixel rectangle
        imgW, imgH = image.size
        x1 = (x - w/2) * imgW; y1 = (y - h/2) * imgH
        x2 = (x + w/2) * imgW; y2 = (y + h/2) * imgH
        rect = plt.Rectangle((x1,y1), x2-x1, y2-y1,
                             edgecolor='red', facecolor='none', linewidth=2)
        ax.add_patch(rect)
        ax.text(x1, y1-5, str(cls), color='red', fontsize=12)
    plt.axis('off')
    plt.show()


In [10]:
import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mobilenet.to(device)

# Simulate timing on a dummy batch of images
dummy = torch.randn(4, 3, 224, 224).to(device)  # batch of 4
start = time.time()
_ = mobilenet(dummy)
torch.cuda.synchronize() if device.type=='cuda' else None
print(f"Inference time for batch of 4: {time.time()-start:.3f} sec")


Inference time for batch of 4: 0.037 sec


In [17]:
# Real-time YOLOv8 detection from webcam or video (single Jupyter cell)
# Paste and run in a notebook cell.

import time
import cv2
import torch
from ultralytics import YOLO
import numpy as np

# -------- User config --------
VIDEO_SOURCE = r"C:\Users\koust\Downloads\dhaka_traffic.mp4\dhaka_traffic.mp4"                 # 0 = default webcam, or 'path/to/video.mp4'
MODEL_WEIGHTS = "yolov8n.pt"     # pretrained COCO weights (ultralytics)
IMG_SZ = 640                     # inference size (reasonable tradeoff)
CONF_THRESH = 0.35               # detection confidence threshold
SHOW_LABELS = True               # draw class name + conf
write_output = False             # set True to save annotated output to file
output_path = "out_detected.mp4" # only used if write_output=True
# ------------------------------

# Device check
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load YOLOv8n model (pretrained). This will download weights if not present.
model = YOLO(MODEL_WEIGHTS)
# Make sure model runs on chosen device
model.to(device)

# Open video capture
cap = cv2.VideoCapture(VIDEO_SOURCE)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video source: {VIDEO_SOURCE}")

# Prepare writer if saving output
writer = None
if write_output:
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = cap.get(cv2.CAP_PROP_FPS) if cap.get(cv2.CAP_PROP_FPS) > 0 else 25.0
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    print("Saving annotated output to:", output_path)

# Colors for drawing
np.random.seed(42)
palette = (np.random.randint(0,255, size=(80,3))).tolist()  # up to 80 COCO classes

# Helper to draw boxes on frame
def draw_boxes(frame, boxes, scores, classes, class_names):
    for (x1,y1,x2,y2), s, c in zip(boxes, scores, classes):
        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        color = tuple(int(x) for x in palette[int(c) % len(palette)])
        cv2.rectangle(frame, (x1,y1), (x2,y2), color, 2)
        if SHOW_LABELS:
            label = f"{class_names[int(c)]} {s:.2f}"
            (w,h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv2.rectangle(frame, (x1, y1 - 18), (x1 + w, y1), color, -1)
            cv2.putText(frame, label, (x1, y1 - 3), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1, cv2.LINE_AA)

# Get class names from model (if available)
try:
    class_names = model.model.names
except Exception:
    # fallback COCO names
    class_names = {i: str(i) for i in range(80)}

# Main loop: read frames, run inference, draw, display
frame_count = 0
start_time = time.time()
avg_fps = 0.0
# use warm-up frame(s) to prepare model, reduce variance
warmup_frames = 2

print("Starting inference. Press 'q' in the window to quit.")
while True:
    ret, frame = cap.read()
    if not ret:
        print("End of stream or cannot fetch frame.")
        break

    frame_count += 1
    # Convert BGR->RGB for model (ultralytics expects np.ndarray RGB or path)
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Inference: pass frame directly; set device via model
    t0 = time.time()
    # note: model() returns a Results object list; we use first element
    results = model.predict(source=[img_rgb], imgsz=IMG_SZ, device=device, conf=CONF_THRESH, verbose=False)
    # results is a list corresponding to the batch (here single image)
    res = results[0]
    # extract bounding boxes, confidences, classes
    # res.boxes.xyxy is tensor (N,4), res.boxes.conf (N,), res.boxes.cls (N,)
    if hasattr(res, "boxes") and len(res.boxes) > 0:
        boxes = res.boxes.xyxy.cpu().numpy()
        scores = res.boxes.conf.cpu().numpy()
        classes = res.boxes.cls.cpu().numpy()
    else:
        boxes = np.array([])
        scores = np.array([])
        classes = np.array([])

    # Draw detections on the original BGR frame
    if boxes.size:
        draw_boxes(frame, boxes, scores, classes, class_names)

    # Calculate FPS (use torch.cuda.synchronize for accurate GPU timing)
    if device == "cuda":
        torch.cuda.synchronize()
    t1 = time.time()
    fps = 1.0 / (t1 - t0) if (t1 - t0) > 0 else 0.0
    # running avg fps
    avg_fps = (avg_fps * (frame_count - 1) + fps) / frame_count

    # Overlay FPS and stats on frame
    info_text = f"FPS: {fps:.1f}  AvgFPS: {avg_fps:.1f}  Detections: {len(boxes)}"
    cv2.putText(frame, info_text, (10,30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2, cv2.LINE_AA)

    # Show frame
    cv2.imshow("YOLOv8 Real-time", frame)

    # Write out if requested
    if write_output and writer is not None:
        writer.write(frame)

    # Exit on 'q'
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Cleanup
cap.release()
if writer is not None:
    writer.release()
cv2.destroyAllWindows()
total_time = time.time() - start_time
print(f"Processed {frame_count} frames in {total_time:.2f}s  Avg FPS: {frame_count/total_time:.2f}")


Device: cuda
Starting inference. Press 'q' in the window to quit.
End of stream or cannot fetch frame.
Processed 3794 frames in 60.38s  Avg FPS: 62.83
