## YOLO -v5

In [1]:
import cv2
import torch
import time

class YoloDetector():
    def __init__(self):
        # Using yolov5s for our purposes of object detection, you may use a larger model
        self.model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
        self.classes = self.model.names
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print('Using Device:', self.device)
    
    def detect_cars(self, frame):
        self.model.to(self.device)
        results = self.model(frame)
        predictions = results.xyxy[0]

        car_boxes = []
        for pred in predictions:
            if pred[4] > 0.6 and pred[5] == 2:  # Car class index
                car_boxes.append(pred[:4].cpu().numpy().astype(int))

        return car_boxes

# Initialize YOLO detector
yolo_detector = YoloDetector()

# Open video capture
cap = cv2.VideoCapture('highway.mp4')
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

# Define codec and create VideoWriter object
out = cv2.VideoWriter("yolo_v5_small_fps.avi",
                       cv2.VideoWriter_fourcc(*'mp4v'),
                       fps,
                       (w, h))

start_time = time.time()
frame_count = 0
fps_text = ""
while cap.isOpened():
    start_frame_time = time.time()
    
    ret, frame = cap.read()

    if not ret:
        break

    # Detect cars in the frame
    car_boxes = yolo_detector.detect_cars(frame)
    
    # Draw bounding boxes around cars
    for box in car_boxes:
        x1, y1, x2, y2 = box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Calculate FPS
    frame_count += 1
    if frame_count % 10 == 0:  
        end_time = time.time()
        fps = frame_count / (end_time - start_time)
        fps_text = f"FPS: {round(fps, 2)}"
        frame_count = 0
        start_time = time.time()

    # Display FPS on frame
    cv2.putText(frame, fps_text, (30, 300), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    
    cv2.imshow('Car Detection', frame)
    out.write(frame)
    
    end_frame_time = time.time()
    frame_processing_time = end_frame_time - start_frame_time
    wait_time_ms = max(int((1 / fps * 1000) - frame_processing_time * 1000), 1)
    
    if cv2.waitKey(wait_time_ms) & 0xFF == ord('q'):
        break

cap.release()
out.release()
cv2.destroyAllWindows()


Using cache found in C:\Users\opdar/.cache\torch\hub\ultralytics_yolov5_master
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
YOLOv5  2024-3-16 Python-3.11.3 torch-2.2.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Using Device: cpu


## YOLOv2-voc,YOLOv2,YOLOv2-tiny

In [21]:
import cv2
import numpy as np

# Load YOLOv2 model
net = cv2.dnn.readNet("yolov2-voc.weights", "yolov2-voc.cfg")
# net = cv2.dnn.readNet("yolov2.weights","yolov2.cfg")
# net = cv2.dnn.readNet("yolov2-tiny.weights","yolov2-tiny.cfg")


layer_names = net.getLayerNames()
output_layers = [layer_names[i] for i in range(len(layer_names))]
print("Output Layers:", output_layers)
def detect_objects(frame):
    height, width, channels = frame.shape

    # Preprocess the image
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    # Process detection outputs
    for out in outs:
        for detection in out:
            scores = detection[5:]
            if len(scores) > 0:
                class_id = np.argmax(scores)
                if class_id<20:
                    confidence = scores[class_id]
                    if confidence > 0.2:  # Class ID for cars in VOC dataset
                        # Object detected
                        # print(confidence,class_id)
                        center_x = int(detection[0] * width)
                        center_y = int(detection[1] * height)
                        w = int(detection[2] * width)
                        h = int(detection[3] * height)
                        x = int(center_x - w / 2)
                        y = int(center_y - h / 2)
                        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

    return frame




# Real-time car tracking
cap = cv2.VideoCapture("highway.mp4")  # Replace "video.mp4" with your video file or camera index

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame = detect_objects(frame)
    cv2.imshow("Car Tracking", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Output Layers: ['conv_0', 'bn_0', 'leaky_1', 'pool_1', 'conv_2', 'bn_2', 'leaky_3', 'pool_3', 'conv_4', 'bn_4', 'leaky_5', 'conv_5', 'bn_5', 'leaky_6', 'conv_6', 'bn_6', 'leaky_7', 'pool_7', 'conv_8', 'bn_8', 'leaky_9', 'conv_9', 'bn_9', 'leaky_10', 'conv_10', 'bn_10', 'leaky_11', 'pool_11', 'conv_12', 'bn_12', 'leaky_13', 'conv_13', 'bn_13', 'leaky_14', 'conv_14', 'bn_14', 'leaky_15', 'conv_15', 'bn_15', 'leaky_16', 'conv_16', 'bn_16', 'leaky_17', 'pool_17', 'conv_18', 'bn_18', 'leaky_19', 'conv_19', 'bn_19', 'leaky_20', 'conv_20', 'bn_20', 'leaky_21', 'conv_21', 'bn_21', 'leaky_22', 'conv_22', 'bn_22', 'leaky_23', 'conv_23', 'bn_23', 'leaky_24', 'conv_24', 'bn_24', 'leaky_25', 'identity_25', 'conv_26', 'bn_26', 'leaky_27', 'reorg_27', 'concat_28', 'conv_29', 'bn_29', 'leaky_30', 'conv_30', 'permute_31', 'detection_out']


## Faster RCNN

In [22]:
import torch
import torchvision
import cv2
import numpy as np
import time

# Load pre-trained Faster R-CNN model from torchvision
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Function to perform object detection using the pre-trained model
def detect_objects(frame):
    
    frame_tensor = torch.from_numpy(frame / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Perform inference
    with torch.no_grad():
        predictions = model(frame_tensor)

    # Process predictions
    boxes = predictions[0]['boxes'].cpu().numpy()
    scores = predictions[0]['scores'].cpu().numpy()
    labels = predictions[0]['labels'].cpu().numpy()

    # Draw bounding boxes on the frame
    for box, score, label in zip(boxes, scores, labels):
        # print(label)
        if score > 0.8 and label==3:  # Adjust confidence threshold as needed
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # cv2.putText(frame, f'Class: {label}, Score: {score}', (x1, y1 - 10),
            #             cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return frame

# Open video capture
cap = cv2.VideoCapture('highway.mp4')

# Get frame rate of the video
fps = cap.get(cv2.CAP_PROP_FPS)

# frame Skipping
frame_skip = int(fps / 10)  
frame_count = 0
start_time = time.time()

while cap.isOpened():
    ret, frame = cap.read()

    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    # Perform object detection on the frame
    result_frame = detect_objects(frame)
    cv2.imshow('Object Detection', result_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Release video capture and close all windows
cap.release()
cv2.destroyAllWindows()


Total processing time: 21.93 seconds


In [15]:
import torch
import torchvision
import cv2
import numpy as np
import time
from deep_sort_realtime.deepsort_tracker import DeepSort

#Initialise the object tracker class
object_tracker = DeepSort()
# Load pre-trained Faster R-CNN model from torchvision
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Function to perform object detection using the pre-trained model
def detect_objects(frame):
    frame_tensor = torch.from_numpy(frame / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Perform inference
    with torch.no_grad():
        predictions = model(frame_tensor)

    # Process predictions
    boxes = predictions[0]['boxes'].numpy()
    scores = predictions[0]['scores'].numpy()
    labels = predictions[0]['labels'].numpy()

    return boxes, scores, labels

# Open video capture
cap = cv2.VideoCapture('highway.mp4')

# Get frame rate of the video
fps = cap.get(cv2.CAP_PROP_FPS)

# Process every nth frame to speed up processing
frame_skip = int(fps / 10)  # Adjust as needed

frame_count = 0
start_time = time.time()

while cap.isOpened():
    ret, img = cap.read()

    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    start = time.perf_counter()

    # Perform object detection on the frame
    boxes, scores, labels = detect_objects(img)

    # Overlay detected bounding boxes on the frame
    # for box, score, label in zip(boxes, scores, labels):
    #     if score > 0.5 and label == 3:  # Adjust confidence threshold and class as needed
    #         x1, y1, x2, y2 = map(int, box)
    #         cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)

    # Object tracking part
    if img.any() is not None:
        detections = [(box, score, label) for box, score, label in zip(boxes, scores, labels) if score > 0.9 and label == 3]

        # Convert bounding box data to the format expected by the object tracker
        bounding_boxes = [([box[0], box[1], box[2] - box[0], box[3] - box[1]], score, label) for box, score, label in detections]
        
        # Update tracks using the formatted bounding box data
        tracks = object_tracker.update_tracks(bounding_boxes, frame=img)
        d = set()
        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            ltrb = track.to_ltrb()

            bbox = ltrb

            cv2.rectangle(img, (int(bbox[0]),int(bbox[1])),(int(bbox[2]),int(bbox[3])),(0,255,0),2)
            if track not in d:
                d.add(track_id)
            cv2.putText(img, str(len(d)), (int(bbox[0]),int(bbox[1]-10)), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,255), 2)

    end = time.perf_counter()
    totalTime = end - start
    fps = 1 / totalTime

    cv2.putText(img, f'FPS: {int(fps)}', (20,70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
    img = cv2.resize(img,(1800,900))
    cv2.imshow('img', img)

    k = cv2.waitKey(1)
    if k & 0xFF == ord('q'):
        break

end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Release video capture and close all windows
cap.release()
cv2.destroyAllWindows()


Total processing time: 19.77 seconds


## Faster RCCN  and DeepSORT

In [2]:
import torch
import torchvision
import cv2
import numpy as np
import time
from deep_sort_realtime.deepsort_tracker import DeepSort

# Initialise the object tracker class
object_tracker = DeepSort()
# Load pre-trained Faster R-CNN model from torchvision
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Define the line near the bottom of the frame for counting cars
line_y = 800  

# Function to check if a point is above a line
def above_line(point, line_y):
    return point[1] < line_y
    
# Function to perform object detection using the pre-trained model
def detect_objects(frame):
    # Convert frame to torch tensor
    frame_tensor = torch.from_numpy(frame / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Perform inference
    with torch.no_grad():
        predictions = model(frame_tensor)

    # Process predictions
    boxes = predictions[0]['boxes'].numpy()
    scores = predictions[0]['scores'].numpy()
    labels = predictions[0]['labels'].numpy()

    return boxes, scores, labels

# Open video capture
cap = cv2.VideoCapture('highway.mp4')

# Get frame rate of the video
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

# Process every nth frame to speed up processing
frame_skip = int(fps / 20)  # Adjust as needed

# Define codec and create VideoWriter object
out = cv2.VideoWriter("faster_rcnn_deep_sort.avi",
                       cv2.VideoWriter_fourcc(*'mp4v'),
                       fps,
                       (w, h))
frame_count = 0
start_time = time.time()
previous_count = 0
current_count = 0
d = set()
d1 = {}
sort_paths = {}
j = 1
while cap.isOpened():
    ret, img = cap.read()

    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    start = time.perf_counter()
    cv2.line(img,(70,750),(1100,750),(255,255,255),2)

    # Perform object detection on the frame
    boxes, scores, labels = detect_objects(img)

    # Overlay detected bounding boxes on the frame
    if img.any() is not None:
        detections = [(box, score, label) for box, score, label in zip(boxes, scores, labels) if score > 0.85 and label == 3]

        # Convert bounding box data to the format expected by the object tracker
        bounding_boxes = [([box[0], box[1], box[2] - box[0], box[3] - box[1]], score, label) for box, score, label in detections]
        
        # Update tracks using the formatted bounding box data
        tracks = object_tracker.update_tracks(bounding_boxes, frame=img)
        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            ltrb = track.to_ltrb()
            bbox = ltrb
            bottom_center = ((bbox[0] + bbox[2]) // 2, bbox[3])
            if not above_line(bottom_center, line_y) and track_id not in d:
                d.add(track_id)
                current_count += 1
            if track_id not in d1:
                d1[track_id] = str(j)
                j+=1
            cv2.rectangle(img, (int(bbox[0]),int(bbox[1])),(int(bbox[2]),int(bbox[3])),(0,255,0),2)
            cv2.putText(img, str(track_id), (int(bbox[0]),int(bbox[1]-10)), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
    cv2.putText(img, f'Count: {current_count}', (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 255), 2)

    end = time.perf_counter()
    totalTime = end - start
    fps = 1 / totalTime

    cv2.putText(img, f'FPS: {int(fps)}', (20,70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
    # img = cv2.resize(img,(1800,900))
    cv2.imshow('img', img)

    # Write the frame to the output video
    out.write(img)

    k = cv2.waitKey(1)
    if k & 0xFF == ord('q'):
        break

end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Release video capture and close all windows
cap.release()
out.release()
cv2.destroyAllWindows()


Total processing time: 67.92 seconds


In [13]:
print(d)

{'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '9': '6', '10': '7', '11': '8', '14': '9', '15': '10', '18': '11', '20': '12', '22': '13'}


## Faster RCNN and SORT

In [4]:
import torch
import torchvision
import cv2
import numpy as np
import time
from sort import *

# Initialize the SORT tracker
object_tracker = Sort()

# Load pre-trained Faster R-CNN model from torchvision
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Define the line near the bottom of the frame for counting cars
line_y = 800  

# Function to check if a point is above a line
def above_line(point, line_y):
    return point[1] < line_y

# Function to perform object detection using the pre-trained model
def detect_objects(frame):
    # Convert frame to torch tensor
    frame_tensor = torch.from_numpy(frame / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Perform inference
    with torch.no_grad():
        predictions = model(frame_tensor)

    # Process predictions
    boxes = predictions[0]['boxes'].cpu().numpy()
    scores = predictions[0]['scores'].cpu().numpy()
    labels = predictions[0]['labels'].cpu().numpy()

    return boxes, scores, labels

# Open video capture
cap = cv2.VideoCapture('highway.mp4')

# Get frame rate of the video
fps = cap.get(cv2.CAP_PROP_FPS)

# Process every nth frame to speed up processing
frame_skip = int(fps / 20)  # Adjust as needed

frame_count = 0
start_time = time.time()

# Initialize variables for counting cars
previous_count = 0
current_count = 0
j = 1
d = set()
d1 = {}
sort_paths = {}

while cap.isOpened():
    ret, img = cap.read()

    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    start = time.perf_counter()
    cv2.line(img,(80,800),(1100,800),(255,255,255),2)
    # Perform object detection on the frame
    boxes, scores, labels = detect_objects(img)

    # Object tracking part
    if img.any() is not None:
        detections = [(box, score, label) for box, score, label in zip(boxes, scores, labels) if score > 0.75 and label == 3]

        # Convert bounding box data to the format expected by the object tracker
        bounding_boxes = np.array([[box[0], box[1], box[2], box[3], score] for box, score, label in detections])
        
        # Update tracks using the formatted bounding box data
        tracks = object_tracker.update(bounding_boxes)

        # Count cars passing through the line
        for track in tracks:
            bbox = track.astype(int)
            
            # Check if the bottom center of the bounding box crosses the line
            bottom_center = ((bbox[0] + bbox[2]) // 2, bbox[3])
            if not above_line(bottom_center, line_y) and track[4] not in d:
                d.add(track[4])
                current_count += 1
            if track[4] not in d1:
                d1[track[4]] = str(j)
                j+=1
            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
            cv2.putText(img, d1[track[4]], (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX,  0.7,(0, 0, 255), 2)

        # Update count
        # if current_count > previous_count:
        #     cars_passed = current_count - previous_count
        #     print(f"{cars_passed} car(s) passed through the line.")
        #     previous_count = current_count

        # Display count on the frame
        cv2.putText(img, f'Count: {current_count}', (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 0), 2)

    end = time.perf_counter()
    totalTime = end - start
    fps = 1 / totalTime

    cv2.putText(img, f'FPS: {int(fps)}', (20,70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,255), 2)
    img = cv2.resize(img,(1800,900))
    cv2.imshow('img', img)

    k = cv2.waitKey(1)
    if k & 0xFF == ord('q'):
        break

end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Release video capture and close all windows
cap.release()
cv2.destroyAllWindows()


Total processing time: 843.53 seconds


In [2]:
print(current_count)

0


In [2]:
import torch
import torchvision
import cv2
import numpy as np
import time
from sort import *

# Initialize the SORT tracker
object_tracker = Sort()

# Load pre-trained Faster R-CNN model from torchvision
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Function to perform object detection using the pre-trained model
def detect_objects(frame):
    # Resize frame to reduce computation
    # frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)

    # Convert frame to torch tensor
    frame_tensor = torch.from_numpy(frame / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Perform inference
    with torch.no_grad():
        predictions = model(frame_tensor)

    # Process predictions
    boxes = predictions[0]['boxes'].cpu().numpy()
    scores = predictions[0]['scores'].cpu().numpy()
    labels = predictions[0]['labels'].cpu().numpy()

    return boxes, scores, labels

# Open video capture
cap = cv2.VideoCapture('highway.mp4')

# Get frame rate of the video
fps = cap.get(cv2.CAP_PROP_FPS)

# Process every nth frame to speed up processing
frame_skip = int(fps / 10)  # Adjust as needed

frame_count = 0
start_time = time.time()

while cap.isOpened():
    ret, img = cap.read()

    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    start = time.perf_counter()

    # Perform object detection on the frame
    boxes, scores, labels = detect_objects(img)

    # Overlay detected bounding boxes on the frame
    # for box, score, label in zip(boxes, scores, labels):
    #     if score > 0.5 and label == 3:  # Adjust confidence threshold and class as needed
    #         x1, y1, x2, y2 = map(int, box)
    #         cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)

    # Object tracking part
    if img.any() is not None:
        detections = [(box, score, label) for box, score, label in zip(boxes, scores, labels) if score > 0.7 and label == 3]

        # Convert bounding box data to the format expected by the object tracker
        bounding_boxes = np.array([[box[0], box[1], box[2], box[3], score] for box, score, label in detections])
        
        # Update tracks using the formatted bounding box data
        tracks = object_tracker.update(bounding_boxes)
        for track in tracks:
            # print(track)
            bbox = track.astype(int)

            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
            cv2.putText(img, str(int(track[4])), (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX,  0.7,(0, 0, 255), 2)

    end = time.perf_counter()
    totalTime = end - start
    fps = 1 / totalTime

    cv2.putText(img, f'FPS: {int(fps)}', (20,70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
    img = cv2.resize(img,(1800,900))
    cv2.imshow('img', img)

    k = cv2.waitKey(1)
    if k & 0xFF == ord('q'):
        break

end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Release video capture and close all windows
cap.release()
cv2.destroyAllWindows()


Total processing time: 14.26 seconds


## SORT vs DeepSORT

In [4]:
import torch
import torchvision
import cv2
import numpy as np
import time
from deep_sort_realtime.deepsort_tracker import DeepSort
from sort import Sort

# Initialize the SORT tracker
sort_tracker = Sort()

# Initialize the Deep SORT tracker
deepsort_tracker = DeepSort()

# Load pre-trained Faster R-CNN model from torchvision
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Function to perform object detection using the pre-trained model
def detect_objects(frame):
    # Resize frame to reduce computation
    # frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5)

    # Convert frame to torch tensor
    frame_tensor = torch.from_numpy(frame / 255.0).permute(2, 0, 1).float().unsqueeze(0)

    # Perform inference
    with torch.no_grad():
        predictions = model(frame_tensor)

    # Process predictions
    boxes = predictions[0]['boxes'].cpu().numpy()
    scores = predictions[0]['scores'].cpu().numpy()
    labels = predictions[0]['labels'].cpu().numpy()

    return boxes, scores, labels

# Open video capture
cap = cv2.VideoCapture('highway.mp4')

# Get frame rate of the video
fps = cap.get(cv2.CAP_PROP_FPS)

# Process every nth frame to speed up processing
frame_skip = int(fps / 10)  # Adjust as needed

frame_count = 0
start_time = time.time()

# Dictionary to store object paths
sort_paths = {}
deepsort_paths = {}

while cap.isOpened():
    ret, img = cap.read()

    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue

    start = time.perf_counter()

    # Perform object detection on the frame
    boxes, scores, labels = detect_objects(img)
    if img.any() is not None:
        deepsort_detections = [(box, score, label) for box, score, label in zip(boxes, scores, labels) if score > 0.8 and label == 3]

        # Convert bounding box data to the format expected by the Deep SORT tracker
        deepsort_bounding_boxes = [([box[0], box[1], box[2] - box[0], box[3] - box[1]], score, label) for box, score, label in deepsort_detections]
        
        # Update tracks using the formatted bounding box data
        deepsort_tracks = deepsort_tracker.update_tracks(deepsort_bounding_boxes, frame=img)
        
        # Update object paths
        for track in deepsort_tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            if track_id not in deepsort_paths:
                deepsort_paths[track_id] = []
            deepsort_paths[track_id].append((int((track.to_tlbr()[0] + track.to_tlbr()[2]) / 2), int((track.to_tlbr()[1] + track.to_tlbr()[3]) / 2)))

            bbox = track.to_tlbr().astype(int)
            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 0, 0), 2)
            cv2.circle(img, center=(int(bbox[0]+bbox[2])//2, int(bbox[1]+bbox[3])//2), radius=3, color=(0, 0, 255), thickness=3)

            cv2.putText(img, str(track_id), (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX,  0.7,(255, 0, 255), 2)

        # Draw object paths
        for track_id, path in deepsort_paths.items():
            for i in range(1, len(path)):
                cv2.line(img, path[i-1], path[i], (255, 0, 0), 2)

    # Object tracking using SORT tracker
    if img.any() is not None:
        sort_detections = [(box, score, label) for box, score, label in zip(boxes, scores, labels) if score > 0.8 and label == 3]

        # Convert bounding box data to the format expected by the SORT tracker
        sort_bounding_boxes = np.array([[box[0], box[1], box[2], box[3], score] for box, score, label in sort_detections])
        
        # Update tracks using the formatted bounding box data
        sort_tracks = sort_tracker.update(sort_bounding_boxes)
        
        # Update object paths
        for track in sort_tracks:
            track_id = int(track[4])
            if track_id not in sort_paths:
                sort_paths[track_id] = []
            sort_paths[track_id].append((int((track[0] + track[2]) / 2), int((track[1] + track[3]) / 2)))

            bbox = track.astype(int)
            cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
            cv2.putText(img, str(track_id), (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX,  0.7,(0, 0, 255), 2)

        # Draw object paths
        for track_id, path in sort_paths.items():
            for i in range(1, len(path)):
                cv2.line(img, path[i-1], path[i], (0, 255, 0), 2)

    # Object tracking using Deep SORT tracker
    
    end = time.perf_counter()
    totalTime = end - start
    fps = 1 / totalTime

    cv2.putText(img, f'FPS: {int(fps)}', (20,70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
    img = cv2.resize(img,(1800,900))
    cv2.imshow('img', img)

    k = cv2.waitKey(1)
    if k & 0xFF == ord('q'):
        break

end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

# Release video capture and close all windows
cap.release()
cv2.destroyAllWindows()


Total processing time: 148.86 seconds


## YOLOV5 + DeepSORT

In [13]:
# !pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

In [2]:
#Necessary imports1
import cv2
import numpy as np
import sys
import glob

import time
import torch

In [3]:
# Creating a class for object detection which plots boxes and scores frames in addition to detecting an 
# object

class YoloDetector():

    def __init__(self):
        #Using yolov5s for our purposes of object detection, you may use a larger model
        self.model = torch.hub.load('ultralytics/yolov5', 'yolov5m', pretrained = True)
        self.classes = self.model.names
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print('Using Device: ', self.device)
    
    def score_frame(self, frame):
        self.model.to(self.device)
        downscale_factor = 2
        width = int(frame.shape[1] / downscale_factor)
        height = int(frame.shape[0] / downscale_factor)
        frame = cv2.resize(frame, (width, height))

        results = self.model(frame)

        labels, cord = results.xyxyn[0][:, -1], results.xyxyn[0][:, :-1]

        return labels, cord
    
    def class_to_label(self, x):
        return self.classes[int(x)]
    
    def plot_boxes(self, results, frame, height, width, confidence=0.7):
        self.model.to(self.device)
        results = self.model(frame)
        predictions = results.xyxy[0]

        detections = []
        for pred in predictions:
            if pred[4] >= confidence and pred[5] == 2:  # Car class index
                x1, y1, x2, y2 = pred[:4].cpu().numpy().astype(int)
                x_center = x1 + (x2 - x1) // 2
                y_center = y1 + (y2 - y1) // 2
                tlwh = np.asarray([x1, y1, x2 - x1, y2 - y1], dtype=np.float32)
                detections.append((tlwh, float(pred[4].item()), 'car'))
        
        return frame, detections
        

In [4]:
# import cv2
# import numpy as np

# class YoloDetector():

#     # def __init__(self, model_path='yolov2-voc.weights', config_path='yolov2-voc.cfg', classes_path='voc.names'):
#     def __init__(self, model_path='yolov2.weights', config_path='yolov2.cfg', classes_path='coco.names'):

#         # Load YOLOv2 model
#         self.net = cv2.dnn.readNet(model_path, config_path)
#         # Load class names
#         with open(classes_path, 'r') as f:
#             self.classes = f.read().strip().split('\n')
#         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
#         print('Using Device:', self.device)
#         self.layer_names = self.net.getLayerNames()
#         self.output_layers = [self.layer_names[i] for i in range(len(self.layer_names))]

#     def score_frame(self, frame):
#         self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
#         self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

#         blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
#         self.net.setInput(blob)
#         outs = self.net.forward(self.output_layers)

#         h, w = frame.shape[:2]
#         detections = []
#         detected_objects = []
#         for out in outs:
#             for detection in out:
#                 scores = detection[5:]
#                 if len(scores) > 0:
#                     class_id = np.argmax(scores)
#                     if class_id < len(self.classes):
#                         confidence = scores[class_id]
#                         if confidence.any() > 0.2:
#                             center_x = int(detection[0] * w)
#                             center_y = int(detection[1] * h)
#                             width = int(detection[2] * w)
#                             height = int(detection[3] * h)
#                             x1 = int(center_x - width / 2)
#                             y1 = int(center_y - height / 2)
#                             x2 = int(x1 + width)
#                             y2 = int(y1 + height)
#                             detections.append(([x1, y1, x2, y2], confidence, class_id))
#                             detected_objects.append(class_id)

#         return detected_objects, detections

#     def class_to_label(self, x):
#         return self.classes[int(x)]

#     def plot_boxes(self, results, frame, height, width, confidence=0.3):

#         labels, cord = results
#         dt1 = [([10,10,1,1], 0.1, 'car')]

#         n = len(labels)
#         x_shape, y_shape = width, height

#         for i in range(n):
#             row = cord[i]
#             print(row)
#             if row[1]>=confidence:
#                 x1, y1, x2, y2 = row[0][0], row[0][1], row[0][2], row[0][3]
#                 # print(x1,y1,x2,y2)
#                 #In this demonstration, we will only be detecting persons. You can add classes of your choice
#                 if self.class_to_label(labels[i]) == 'car':

#                     x_center = x1 + (x2-x1)
#                     y_center = y1 + ((y2-y1) / 2)

#                     tlwh = np.asarray([x1, y1, int(x2-x1), int(y2-y1)], dtype = np.float32)
#                     confidence = float(row[1].item())
#                     feature = 'car'

#                     dt1.append(([x1, y1, int(x2-x1), int(y2-y1)], row[1].item(), 'car'))
#         return frame,dt1
# # Example usage:
# detector = YoloDetector()
# img = cv2.imread('traffic.jpg')
# detections = detector.score_frame(img)
# detected_frame,dt = detector.plot_boxes(detections,img,height=img.shape[0], width=img.shape[1], confidence=0.5)
# cv2.imshow('Detected Frame', detected_frame)
# cv2.waitKey(0)
# cv2.destroyAllWindows()


In [5]:
# Setting input video to webcam
# To use your own pre-downloaded videos, write the file path instead of 0
cap = cv2.VideoCapture("highway.mp4")

# Setting resolution for webcam
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

#Initializing the detection class
detector =  YoloDetector()

Using cache found in C:\Users\opdar/.cache\torch\hub\ultralytics_yolov5_master
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
YOLOv5  2024-3-16 Python-3.11.3 torch-2.2.1+cpu CPU

Fusing layers... 
YOLOv5m summary: 290 layers, 21172173 parameters, 0 gradients, 48.9 GFLOPs
Adding AutoShape... 


Using Device:  cpu


In [6]:
# !pip install deep-sort-realtime

In [7]:
from deep_sort_realtime.deepsort_tracker import DeepSort

#Initialise the object tracker class
object_tracker = DeepSort()

In [8]:
import time

line_y = 800  

# Function to check if a point is above a line
def above_line(point, line_y):
    return point[1] < line_y
    
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

# Process every nth frame to speed up processing
frame_skip = int(fps / 20)  # Adjust as needed

# Define codec and create VideoWriter object
out = cv2.VideoWriter("faster_rcnn_deep_sort.avi",
                       cv2.VideoWriter_fourcc(*'mp4v'),
                       fps,
                       (w, h))
frame_count = 0
d = set()
d1 = {}
current_count = 0
j = 1
while cap.isOpened():
    success, img = cap.read()
    if success and img.any()!=None:
        start = time.perf_counter()
    
        results = detector.score_frame(img)
        img,detections = detector.plot_boxes(results, img, height=img.shape[0], width=img.shape[1], confidence=0.85)
        # for box,score,label in detections:
        #     # if score > 0.3:  # Adjust confidence threshold and class as needed
        #     cv2.rectangle(img, (int(box[0]),int(box[1])),(int(box[2]+box[0]),int(box[3]+box[1])),(0,255,255),2)
                # print("here",box)
        cv2.line(img,(70,750),(1100,750),(255,255,255),2)

        if img.any() is not None:
            tracks = object_tracker.update_tracks(detections, frame=img) 
            # NOTE: Bounding box expects to be a list of detections, each in tuples of ([left, top, w, h], confidence, detection class)
            
            for track in tracks:
                if not track.is_confirmed():
                    continue
                track_id = track.track_id
                ltrb = track.to_ltrb()
        
                bbox = ltrb
                # print("bbbox",bbox)
                bottom_center = ((bbox[0] + bbox[2]) // 2, bbox[3])

                if not above_line(bottom_center, line_y) and track_id not in d:
                    d.add(track_id)
                    current_count += 1
                if track_id not in d1:
                    d1[track_id]=str(j)
                    j+=1
                cv2.rectangle(img, (int(bbox[0]),int(bbox[1])),(int(bbox[2]),int(bbox[3])),(0,0,255),2)
                cv2.putText(img, "ID: " + d1[track_id], (int(bbox[0]),int(bbox[1]-10)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 2)
            cv2.putText(img, f'Count: {current_count}', (20, 120), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 255), 2)

        end = time.perf_counter()
        totalTime = end-start
        fps = 1/totalTime
    
        cv2.putText(img, f'FPS: {int(fps)}', (20,70), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
        img = cv2.resize(img,(1800,900))
        cv2.imshow('img', img)
        k = cv2.waitKey(1)
        if k & 0xFF == ord('q'):
            break

cap.release()

cv2.destroyAllWindows()

## YOLOv8

In [3]:
import cv2
import numpy as np
from ultralytics import YOLO

from ultralytics.utils.checks import check_imshow
from ultralytics.utils.plotting import Annotator, colors

from collections import defaultdict

track_history = defaultdict(lambda: [])
model = YOLO("yolov8n.pt")
names = model.model.names

video_path = "highway.mp4"
cap = cv2.VideoCapture(video_path)
assert cap.isOpened(), "Error reading video file"

w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

while cap.isOpened():
    success, frame = cap.read()
    if success:
        results = model.track(frame, persist=True, verbose=False)
        boxes = results[0].boxes.xyxy.cpu()

        if results[0].boxes.id is not None:
            # print(results)
            # Extract prediction results
            clss = results[0].boxes.cls.cpu().tolist()
            track_ids = results[0].boxes.id.int().cpu().tolist()
            confs = results[0].boxes.conf.float().cpu().tolist()

            # Annotator Init
            annotator = Annotator(frame, line_width=2)

            for box, cls, track_id in zip(boxes, clss, track_ids):
                print(cls)
                if cls==2.0:
                    annotator.box_label(box, color=colors(int(cls), True), label=names[int(cls)])
                    # Store tracking history
                    track = track_history[track_id]
                    track.append((int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2)))
                    if len(track) > 30:
                        track.pop(0)
    
                    # Plot tracks
                    points = np.array(track, dtype=np.int32).reshape((-1, 1, 2))
                    cv2.circle(frame, (track[-1]), 7, colors(int(cls), True), -1)
                    cv2.polylines(frame, [points], isClosed=False, color=colors(int(cls), True), thickness=2)
        frame = cv2.resize(frame,(1800,900))

        cv2.imshow("Object Tracking", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        break

cap.release()
cv2.destroyAllWindows()


5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
5.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0


In [12]:
import cv2
import numpy as np
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator, colors
from collections import defaultdict
import time

track_history = defaultdict(lambda: [])
model = YOLO("yolov8n.pt")
names = model.model.names

video_path = "highway.mp4"
cap = cv2.VideoCapture(video_path)
assert cap.isOpened(), "Error reading video file"

w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

frame_count = 0
start_time = time.time()
fps_text = ""

while cap.isOpened():
    success, frame = cap.read()
    if success:
        frame_count += 1
        results = model.track(frame, persist=True, verbose=False)
        boxes = results[0].boxes.xyxy.cpu()

        if results[0].boxes.id is not None:
            clss = results[0].boxes.cls.cpu().tolist()
            track_ids = results[0].boxes.id.int().cpu().tolist()
            confs = results[0].boxes.conf.float().cpu().tolist()

            annotator = Annotator(frame, line_width=2)

            for box, cls, track_id, conf in zip(boxes, clss, track_ids, confs):
                if cls == 2.0 and conf>0.5:
                    annotator.box_label(box, color=colors(int(cls), True), label=f"{names[int(cls)]} ({conf:.2f})")
                    track = track_history[track_id]
                    track.append((int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2)))
                    if len(track) > 30:
                        track.pop(0)

                    points = np.array(track, dtype=np.int32).reshape((-1, 1, 2))
                    cv2.circle(frame, (track[-1]), 7, colors(int(cls), True), -1)
                    cv2.polylines(frame, [points], isClosed=False, color=colors(int(cls), True), thickness=2)
        
        # Calculate FPS
        if frame_count % 10 == 0:  # Update FPS every 10 frames
            end_time = time.time()
            fps = frame_count / (end_time - start_time)
            fps_text = f"FPS: {round(fps, 2)}"
            frame_count = 0
            start_time = time.time()

        # Display FPS on frame
        cv2.putText(frame, fps_text, (30, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        frame = cv2.resize(frame, (1800, 900))
        cv2.imshow("Object Tracking", frame)
        
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        break

cap.release()
cv2.destroyAllWindows()
