<div align="center">
  <a href="http://www.sharif.edu/">
    <img src="https://cdn.freebiesupply.com/logos/large/2x/sharif-logo-png-transparent.png" alt="SUT Logo" width="140">
  </a>
  
  # Sharif University of Technology
  ### Electrical Engineering Department

  ## Signals and Systems
  #### *Final Project - Spring 2025*
</div>

---

<div align="center">
  <h1>
    <b>Object Tracker</b>
  </h1>
  <p>
    An object tracking system using YOLO for detection and various algorithms (KCF, CSRT, MOSSE) for tracking.
  </p>
</div>

<br>

| Professor                  |
| :-------------------------: |
| Dr. Mohammad Mehdi Mojahedian |

<br>

| Contributors              |
| :-----------------------: |
| **Amirreza Mousavi** |
| **Mahdi Falahi** |
| **Zahra Miladipour** |


## 1: Preparing The Materials


In [221]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from ultralytics import YOLO
import time

1.1 : Calculating HOG (returns the HOG descriptor of the image)
 

In [222]:
def hog_scaling(image):
    """Return the HOG descriptor of ``image`` computed on a 64x64 grayscale copy.

    The image is converted from BGR to grayscale, resized to 64x64 and
    described with a HOG configured as: 64x64 window, 16x16 blocks, 8x8 block
    stride, 8x8 cells and 9 orientation bins.

    Args:
        image: BGR image accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Returns:
        The array returned by ``HOGDescriptor.compute``. If ``compute``
        returns ``None``, a zero vector of the descriptor's length is
        returned instead.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    hog = cv2.HOGDescriptor((64, 64), (16, 16), (8, 8), (8, 8), 9)
    descriptor = hog.compute(cv2.resize(gray, (64, 64)))
    if descriptor is None:
        # Ask the descriptor object itself for its length. The previous
        # fallback referenced an undefined name and would have raised
        # NameError instead of returning zeros.
        return np.zeros((hog.getDescriptorSize(),), dtype=np.float32)
    return descriptor


In [223]:
def hog_channel(image):
    """Return a dense HOG feature map of ``image``, one 36-vector per 8x8 cell.

    A HOG descriptor whose window covers the whole image is computed
    (16x16 blocks, 8x8 stride, 8x8 cells, 9 bins), arranged as a
    (block_rows, block_cols, 36) array and resized to
    (height // 8, width // 8, 36).

    Args:
        image: BGR image. OpenCV requires ``(size - 16)`` to be a multiple of
            the 8-pixel block stride in both dimensions.

    Returns:
        Array of shape (height // 8, width // 8, 36).
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    win_size = (image.shape[1], image.shape[0])  # (width, height), as OpenCV expects
    hog = cv2.HOGDescriptor(win_size, (16, 16), (8, 8), (8, 8), 9)
    descriptor = hog.compute(gray)

    # Number of block positions along each axis. 8 is the block stride; a
    # larger stride is faster but gives a coarser (less accurate) map.
    blocks_y = (win_size[1] - 16) // 8 + 1
    blocks_x = (win_size[0] - 16) // 8 + 1
    features_per_block = 2 * 2 * 9  # 2x2 cells per block, 9 bins per cell

    # OpenCV stores the blocks of a window column by column (x is the outer
    # index), so the flat descriptor must be unpacked as (cols, rows, feat)
    # and then transposed to the usual (rows, cols, feat) image layout.
    hog_features = descriptor.reshape((blocks_x, blocks_y, features_per_block))
    hog_features = np.ascontiguousarray(hog_features.transpose(1, 0, 2))
    return cv2.resize(hog_features, (win_size[0] // 8, win_size[1] // 8))


1.2 : Checking The Scale (returns the best-matching scale factor for the target in the current frame)

In [224]:
def scaled_check(frame, pos, base_size, scale_factors, scale_model_A, scale_model_B, lambda_trust=0.01):
    """Pick the scale factor whose sample responds most strongly to the scale filter.

    For every factor a window of ``base_size * factor`` centred on ``pos`` is
    cropped from ``frame``, resized to 64x64 and described with
    ``hog_scaling``. The stacked descriptors are Fourier-transformed along the
    scale axis and scored as
    ``ifft(sum(conj(A / (B + lambda)) * SF, axis=1))``.

    Args:
        frame: Image to sample from, indexed as [row, col, ...].
        pos: (x, y) centre of the target.
        base_size: (width, height) of the target at scale 1.
        scale_factors: Indexable sequence of candidate scale factors.
        scale_model_A: Filter numerator, one row per scale factor.
        scale_model_B: Filter denominator, one entry per scale factor.
        lambda_trust: Regularisation added to the denominator.

    Returns:
        The element of ``scale_factors`` with the highest response. If no
        window yields a usable patch, the factor closest to 1 is returned so
        the size is left (almost) unchanged instead of shrinking.

    NOTE(review): the conjugate on the filter matches the convention
    ``A = conj(G) * F`` for the numerator; confirm the caller builds
    ``scale_model_A`` that way.
    """
    frame_h, frame_w = frame.shape[:2]
    scale_features = []
    for scale in scale_factors:
        w_s, h_s = int(base_size[0] * scale), int(base_size[1] * scale)
        x_s, y_s = int(pos[0] - w_s / 2), int(pos[1] - h_s / 2)
        # Clamp the window to the frame: a negative start index would be
        # interpreted by NumPy as "count from the end" and crop the wrong
        # region (or nothing at all).
        x0, y0 = max(x_s, 0), max(y_s, 0)
        x1, y1 = min(x_s + w_s, frame_w), min(y_s + h_s, frame_h)
        patch_s = frame[y0:max(y1, y0), x0:max(x1, x0)]
        if patch_s.shape[0] < 16 or patch_s.shape[1] < 16:
            scale_features.append(None)  # filled with zeros below
            continue
        resized = cv2.resize(patch_s, (64, 64))
        scale_features.append(hog_scaling(resized))

    valid = [feat for feat in scale_features if feat is not None]
    if not valid:
        # Nothing to score: an all-zero response would always select index 0.
        neutral_idx = int(np.argmin(np.abs(np.asarray(scale_factors, dtype=float) - 1.0)))
        return scale_factors[neutral_idx]

    # Unusable windows contribute a zero descriptor with the same shape as the
    # real ones, so no descriptor length has to be hard-coded.
    blank = np.zeros_like(valid[0])
    scale_features = [blank if feat is None else feat for feat in scale_features]

    SF = np.fft.fft(np.array(scale_features), axis=0)
    scale_H = scale_model_A / (scale_model_B[:, np.newaxis] + lambda_trust)
    response_f = np.sum(np.conj(scale_H) * SF, axis=1)
    response = np.real(np.fft.ifft(response_f))

    best_scale_idx = np.argmax(response)
    return scale_factors[best_scale_idx]

1.3 : Predicting The Next Frame's Center (Kalman Filter)

In [225]:
def kalman_prediction(F, X_k_1, P_k_1, Q_k):
    """Run the Kalman time update.

    Args:
        F: State-transition matrix.
        X_k_1: Previous state estimate.
        P_k_1: Previous estimate covariance.
        Q_k: Process-noise covariance.

    Returns:
        Tuple ``(x_k, p_k)``: the predicted state ``F x`` and the predicted
        covariance ``F P F^T + Q``.
    """
    predicted_state = F @ X_k_1
    predicted_cov = F @ (P_k_1 @ F.T) + Q_k
    return predicted_state, predicted_cov

In [226]:
def kalman_updating(x_k, p_k, H_k, z_k, R_k):
    """Run the Kalman measurement update.

    Args:
        x_k: Predicted state.
        p_k: Predicted covariance.
        H_k: Observation matrix.
        z_k: Measurement.
        R_k: Measurement-noise covariance.

    Returns:
        Tuple ``(x_k_new, P_k_new)``: the corrected state and covariance.
    """
    innovation_cov = (H_k @ p_k) @ H_k.T + R_k
    cross_cov = p_k @ H_k.T
    gain = cross_cov @ np.linalg.inv(innovation_cov)

    corrected_cov = p_k - (gain @ H_k) @ p_k
    residual = z_k - H_k @ x_k
    corrected_state = x_k + gain @ residual
    return corrected_state, corrected_cov

1.4 : Updating Method

In [227]:
def filter_updating(H_new, H_old, alpha):
    """Blend a new filter estimate into the running one.

    Returns ``alpha * H_new + (1 - alpha) * H_old``: ``alpha`` is the weight
    of the new estimate and the remainder is kept from the old one.
    """
    keep = 1 - alpha
    return alpha * H_new + keep * H_old

1.5 : Finding The Channels

In [228]:
def extract_channels(image):
    """Build the per-pixel feature stack: HOG channels plus one-hot colour names.

    The HOG map from ``hog_channel`` is resized to the image size and stacked
    with an 11-channel one-hot map that marks, for every pixel, the nearest of
    11 reference colours in CIELAB space.

    Args:
        image: BGR image (uint8 expected).

    Returns:
        Array of shape (height, width, hog_channels + 11).
    """
    hog_features = hog_channel(image)

    # Reference colours in true CIELAB units (L in 0..100, a/b signed).
    colors = np.array([
        [0.00, 0.00, 0.00], [45.37, -4.33, -33.43], [43.08, 17.51, 37.53],
        [53.59, 0.00, 0.00], [47.31, -45.33, 41.35], [65.75, 71.45, 63.32],
        [76.08, 22.25, -21.46], [32.30, 79.19, -107.86], [52.23, 75.43, 37.36],
        [100.00, 0.00, 0.00], [92.13, -16.53, 93.35]
    ], dtype=np.float32)

    # For 8-bit input OpenCV rescales Lab to 0..255 (L*255/100, a+128, b+128),
    # which is not comparable with the table above. Converting a float32 image
    # in [0, 1] yields Lab in its natural range instead.
    if image.dtype == np.uint8:
        lab_source = image.astype(np.float32) / 255.0
    else:
        lab_source = image
    image_lab = cv2.cvtColor(lab_source, cv2.COLOR_BGR2Lab)

    pixels = image_lab.reshape(-1, 3).astype(np.float32)
    # Squared Euclidean distance from every pixel to every reference colour.
    distances = np.sum((pixels[:, np.newaxis, :] - colors[np.newaxis, :, :]) ** 2, axis=2)
    closest_color_indices = np.argmin(distances, axis=1)

    # One-hot encode the nearest colour of each pixel.
    cn_features_flat = np.zeros((pixels.shape[0], colors.shape[0]), dtype=np.float32)
    cn_features_flat[np.arange(pixels.shape[0]), closest_color_indices] = 1
    cn_features = cn_features_flat.reshape(image.shape[0], image.shape[1], -1)

    hog_resized = cv2.resize(hog_features, (image.shape[1], image.shape[0]))
    return np.dstack((hog_resized, cn_features))

1.6 : Training The Filters

In [229]:
def teaching(f, g, lambda_trust):
    """Compute the numerator and denominator of a multi-channel correlation filter.

    Args:
        f: Feature stack; transformed over axes 0 and 1, with channels on
            axis 2.
        g: Desired 2-D response for the patch.
        lambda_trust: Regularisation constant added to the denominator.

    Returns:
        Tuple ``(num, denom)``: ``num = conj(F) * G`` per channel and
        ``denom = sum_over_channels(conj(F) * F) + lambda_trust``.
    """
    spectrum = np.fft.fft2(f, axes=(0, 1))
    spectrum_conj = np.conj(spectrum)
    # Give the target spectrum a trailing channel axis so it broadcasts.
    target_spectrum = np.fft.fft2(g)[:, :, np.newaxis]

    numerator = spectrum_conj * target_spectrum
    denominator = (spectrum_conj * spectrum).sum(axis=2) + lambda_trust
    return numerator, denominator

## 2 : Main Detector

2.1 : Uploading Video

In [230]:
# Open the input clip and load the pretrained YOLO weights used for detection.
cap = cv2.VideoCapture("person1.mp4")
model = YOLO("yolo11n.pt")

# Report (but do not abort on) a clip that could not be opened.
if not cap.isOpened():
    print("wrong video")

2.2 : The Main Part

In [231]:
# Main loop: detect a person once with YOLO, then follow it with a
# correlation filter (translation), a 1-D scale filter and two Kalman filters.
tracking = False

model_A, model_B = None, None
scale_model_A, scale_model_B = None, None
current_pos, current_size = (0, 0), (0, 0)
fixed_roi_size = (64, 128)   # (width, height) every filter patch is resized to
search_area_scale = 1.5      # filter window = target box * this factor
lambda_trust = 0.01          # regularisation of both filters
learning_rate = 0.02         # weight of the newest sample in the running model
display_delay_ms = 1         # waitKey delay; a large value throttles the loop and the FPS read-out

# Constant-velocity Kalman model, one independent filter per axis.
dt = 1/30.0
F = np.array([[1, dt], [0, 1]])
H_kalman = np.array([[1, 0]])
Q = np.array([[(dt**4)/4, (dt**3)/2], [(dt**3)/2, dt**2]]) * 1.0
R = np.array([[25.0]])
kf_x_state, kf_y_state = np.zeros((2, 1)), np.zeros((2, 1))
kf_x_p, kf_y_p = np.eye(2) * 500, np.eye(2) * 500

scale_factors = np.array([0.95, 0.98, 1.0, 1.02, 1.05])

# ---- loop-invariant data (previously rebuilt on every frame) ----------------
# Desired translation response: separable 2-D Gaussian, shape (height, width).
gauss_rows = cv2.getGaussianKernel(fixed_roi_size[1], 18, cv2.CV_32F)
gauss_cols = cv2.getGaussianKernel(fixed_roi_size[0], 18, cv2.CV_32F)
target_y_2d = gauss_rows @ gauss_cols.T
# The label peaks near the patch centre; displacements are measured from here.
target_peak_y, target_peak_x = np.unravel_index(np.argmax(target_y_2d), target_y_2d.shape)

# Desired scale response in the Fourier domain (Gaussian over the scale index).
target_scale_f = np.fft.fft(cv2.getGaussianKernel(len(scale_factors), 1, cv2.CV_32F).flatten())
descriptor_size_scale = cv2.HOGDescriptor((64, 64), (16, 16), (8, 8), (8, 8), 9).getDescriptorSize()


def _crop_window(image, center, size):
    """Crop a (w, h) window centred on ``center``; replicate edge pixels outside the frame.

    Returns None when the window does not overlap the image at all.
    """
    w, h = max(int(size[0]), 1), max(int(size[1]), 1)
    x0, y0 = int(round(center[0] - w / 2)), int(round(center[1] - h / 2))
    x1, y1 = x0 + w, y0 + h
    img_h, img_w = image.shape[:2]
    cx0, cx1 = min(max(x0, 0), img_w), min(max(x1, 0), img_w)
    cy0, cy1 = min(max(y0, 0), img_h), min(max(y1, 0), img_h)
    if cx1 <= cx0 or cy1 <= cy0:
        return None
    crop = image[cy0:cy1, cx0:cx1]
    top, bottom, left, right = cy0 - y0, y1 - cy1, cx0 - x0, x1 - cx1
    if top or bottom or left or right:
        # Padding keeps the window size (and thus the pixel-to-response
        # mapping) intact near the frame border.
        crop = cv2.copyMakeBorder(crop, top, bottom, left, right, cv2.BORDER_REPLICATE)
    return crop


def _scale_spectrum(image, center, size):
    """Return the FFT along the scale axis of the HOG descriptors sampled at every scale factor."""
    img_h, img_w = image.shape[:2]
    samples = []
    for scale in scale_factors:
        w_s, h_s = int(size[0] * scale), int(size[1] * scale)
        x_s, y_s = int(center[0] - w_s / 2), int(center[1] - h_s / 2)
        # Clamp instead of letting negative indices wrap around.
        x0, y0 = max(x_s, 0), max(y_s, 0)
        x1, y1 = min(x_s + w_s, img_w), min(y_s + h_s, img_h)
        patch_s = image[y0:max(y1, y0), x0:max(x1, x0)]
        if patch_s.shape[0] < 16 or patch_s.shape[1] < 16:
            samples.append(np.zeros((descriptor_size_scale,)))
            continue
        samples.append(hog_scaling(cv2.resize(patch_s, (64, 64))))
    return np.fft.fft(np.array(samples), axis=0)


last_time = 0
fps_start_time = 0
fps_frame_count = 0
fps = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if fps_start_time == 0:
            fps_start_time = time.time()
        current_time = time.time()
        dt = current_time - last_time if last_time > 0 else 1/30.0
        last_time = current_time
        F[0, 1] = dt

        if not tracking:
            # ---- detection: initialise the models from the first confident person
            results = model(frame)[0]
            for box in results.boxes:
                class_name = model.names[int(box.cls[0].item())]
                conf = box.conf[0].item()
                if class_name != "person" or conf <= 0.7:
                    continue
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                w, h = x2 - x1, y2 - y1
                if w <= 0 or h <= 0:
                    continue
                center = (x1 + w/2, y1 + h/2)
                # Train on the same padded window that is searched later, so
                # training and detection see the target at the same scale.
                patch = _crop_window(frame, center, (w * search_area_scale, h * search_area_scale))
                if patch is None:
                    continue

                current_pos = center
                current_size = (w, h)

                # Fresh Kalman state: known position, zero velocity, high uncertainty.
                kf_x_state = np.array([[current_pos[0]], [0.0]])
                kf_y_state = np.array([[current_pos[1]], [0.0]])
                kf_x_p, kf_y_p = np.eye(2) * 500, np.eye(2) * 500

                features = extract_channels(cv2.resize(patch, fixed_roi_size))
                model_A, model_B = teaching(features, target_y_2d, lambda_trust)

                SF = _scale_spectrum(frame, current_pos, current_size)
                # Numerator stored as conj(G) * F, the form scaled_check
                # conjugates when it evaluates the response.
                scale_model_A = np.conj(target_scale_f)[:, np.newaxis] * SF
                scale_model_B = np.sum(np.conj(SF) * SF, axis=1).real

                tracking = True
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                # One target only: without this, every further person in the
                # frame would overwrite the model that was just trained.
                break
        else:
            # ---- prediction
            kf_x_state, kf_x_p = kalman_prediction(F, kf_x_state, kf_x_p, Q)
            kf_y_state, kf_y_p = kalman_prediction(F, kf_y_state, kf_y_p, Q)
            pred_pos = (kf_x_state[0, 0], kf_y_state[0, 0])
            current_scale = scaled_check(frame, pred_pos, current_size, scale_factors, scale_model_A, scale_model_B, lambda_trust)
            current_size = (current_size[0] * current_scale, current_size[1] * current_scale)

            search_size = (current_size[0] * search_area_scale, current_size[1] * search_area_scale)
            w_search, h_search = max(int(search_size[0]), 1), max(int(search_size[1]), 1)
            search_patch = _crop_window(frame, pred_pos, search_size)

            if search_patch is None:
                # The predicted window left the frame entirely: fall back to
                # the detector on the next frame instead of tracking nothing.
                tracking = False
            else:
                # ---- translation estimate
                features = extract_channels(cv2.resize(search_patch, fixed_roi_size))
                Z = np.fft.fft2(features, axes=(0, 1))

                # teaching() returns A = conj(X) * G and B = sum|X|^2 + lambda,
                # so the response is A * Z / B: no extra conjugate and no
                # second lambda.
                H_filter = model_A / np.expand_dims(model_B, axis=2)
                response_f = np.sum(H_filter * Z, axis=2)
                response = np.real(np.fft.ifft2(response_f))

                peak_y, peak_x = np.unravel_index(np.argmax(response), response.shape)

                # An unmoved target reproduces the training label, which peaks
                # at (target_peak_y, target_peak_x); the shift is the offset
                # from that point, rescaled from patch to frame pixels.
                dx = ((peak_x - target_peak_x) / fixed_roi_size[0]) * w_search
                dy = ((peak_y - target_peak_y) / fixed_roi_size[1]) * h_search

                measured_pos = (pred_pos[0] + dx, pred_pos[1] + dy)
                kf_x_state, kf_x_p = kalman_updating(kf_x_state, kf_x_p, H_kalman, measured_pos[0], R)
                kf_y_state, kf_y_p = kalman_updating(kf_y_state, kf_y_p, H_kalman, measured_pos[1], R)

                current_pos = (kf_x_state[0, 0], kf_y_state[0, 0])

                # ---- model update at the corrected position
                update_patch = _crop_window(frame, current_pos, search_size)
                if update_patch is not None:
                    features_new = extract_channels(cv2.resize(update_patch, fixed_roi_size))
                    new_A, new_B = teaching(features_new, target_y_2d, lambda_trust)
                    model_A = filter_updating(new_A, model_A, learning_rate)
                    model_B = filter_updating(new_B, model_B, learning_rate)

                    SF_new = _scale_spectrum(frame, current_pos, current_size)
                    new_scale_A = np.conj(target_scale_f)[:, np.newaxis] * SF_new
                    new_scale_B = np.sum(np.conj(SF_new) * SF_new, axis=1).real
                    scale_model_A = filter_updating(new_scale_A, scale_model_A, learning_rate)
                    scale_model_B = filter_updating(new_scale_B, scale_model_B, learning_rate)

            if tracking:
                x_draw, y_draw = int(current_pos[0] - current_size[0]/2), int(current_pos[1] - current_size[1]/2)
                w_draw, h_draw = int(current_size[0]), int(current_size[1])
                cv2.rectangle(frame, (x_draw, y_draw), (x_draw + w_draw, y_draw + h_draw), (0, 255, 0), 2)
                cv2.putText(frame, "person", (x_draw, y_draw - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # ---- FPS overlay, refreshed about once per second
        fps_frame_count += 1
        if (time.time() - fps_start_time) > 1:
            fps = fps_frame_count / (time.time() - fps_start_time)
            fps_frame_count = 0
            fps_start_time = time.time()
        fps_text = f"FPS: {fps:.2f}"
        cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.imshow("Advanced Tracker", frame)
        if cv2.waitKey(display_delay_ms) & 0xFF == 27:  # ESC quits
            break
finally:
    # Release the capture and close the window even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()



0: 384x640 1 person, 1 motorcycle, 148.6ms
Speed: 2.8ms preprocess, 148.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)
