ShanghaiTechTest
    part_A
        test_data (count 5)
            ground_truth
                GT_IMG_{1}.mat
            images
                IMG_{1}.jpg
        train_data (count 20)
            ground_truth
                GT_IMG_{1}.mat
            images
                IMG_{1}.jpg
.mat 
    an N x 2 array holding the (x, y) head coordinates of each person
    N - the number of people in the image

## Dependencies

In [None]:
import os
import numpy as np
from scipy.io import loadmat
import cv2
from sklearn.model_selection import train_test_split
from skimage.feature import hog, local_binary_pattern
import random
import time

## Load data

The train/test sets are already provided; we only need to pair each image with its ground-truth file and extract the head annotations (and hence the crowd count) from the MATLAB .mat file.

In [2]:
def load_gt_mat(mat_path):
    """Load a ground-truth .mat file and return its head annotations.

    The file is expected to contain an ``image_info`` entry; the value
    returned is the first field of the record nested at
    ``image_info[0][0][0][0]``.  The notebook describes this as the Nx2
    array of (x, y) head coordinates -- confirm against the dataset files.
    """
    info_cell = loadmat(mat_path)['image_info']
    # Peel off the nested 1x1 wrappers around the annotation record.
    record = info_cell[0][0][0][0]
    return record[0]

def load_sample(img_path, gt_path):
    """Read one image together with its ground-truth head annotations.

    Returns a ``(image, gt_coords)`` pair: the result of ``cv2.imread`` on
    ``img_path`` and the result of ``load_gt_mat`` on ``gt_path``.
    """
    return cv2.imread(img_path), load_gt_mat(gt_path)

def load_dataset(base_path, dataset_type):
    """Load every image/ground-truth pair of one dataset split.

    Args:
        base_path: Directory that contains the ``<dataset_type>_data`` folder.
        dataset_type: Split name used to build the folder name
            (the notebook calls this with ``'train'`` and ``'test'``).

    Returns:
        A list of ``(image, gt_coords)`` tuples, ordered by image file name.
        Images without a matching ``GT_<name>.mat`` file are skipped.
    """
    split_dir = os.path.join(base_path, f'{dataset_type}_data')
    img_dir = os.path.join(split_dir, 'images')
    gt_dir = os.path.join(split_dir, 'ground-truth')

    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split made from it) reproducible.
    for img_file in sorted(os.listdir(img_dir)):
        # splitext only strips the real extension, unlike str.replace which
        # would also rewrite a '.jpg' occurring in the middle of the name.
        stem, ext = os.path.splitext(img_file)
        if ext != '.jpg':
            continue

        gt_path = os.path.join(gt_dir, f'GT_{stem}.mat')
        if not os.path.exists(gt_path):
            continue

        image, gt_coords = load_sample(os.path.join(img_dir, img_file), gt_path)
        data.append((image, gt_coords))

    return data

load data and get validation set

In [None]:
# Load both provided splits, then hold out 20% of the training samples
# as a validation set (fixed seed so the split is repeatable).
base_path = 'ShanghaiTechTest/part_A'

train_data = load_dataset(base_path, 'train')
test_data = load_dataset(base_path, 'test')

# Each sample is an (image, gt_coords) pair; split images and annotations
# in parallel so they stay matched.
train_images, val_images, train_gts, val_gts = train_test_split(
    [image for image, _ in train_data],
    [coords for _, coords in train_data],
    test_size=0.2,
    random_state=42,
)

## Data prep

#### grayscale

In [4]:
def to_grayscale(image):
    """Convert each BGR image in ``image`` to grayscale and return them as a list."""
    gray_images = []
    for frame in image:
        gray_images.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    return gray_images

#### normalize

In [5]:
def normalize(image):
    """Divide each image in ``image`` by 255.0 and return the results as a list."""
    scaled_images = []
    for frame in image:
        scaled_images.append(frame / 255.0)
    return scaled_images

## Data augmentation

Randomly change contrast, brightness, or rotation (up to ±15 degrees).
Note: when applying a rotation, transform the head coordinates as well so they stay aligned with the image.

In [None]:
def augment_img(img, gt_coords):
    """Randomly jitter contrast/brightness and rotate an image with its annotations.

    Args:
        img: Image array; its first two dimensions are taken as (height, width).
        gt_coords: Sequence of (x, y) head coordinates for ``img``.

    Returns:
        ``(augmented_img, augmented_coords)`` where ``augmented_coords`` is an
        (N, 2) float array holding the head positions after the rotation.
        Coordinates rotated outside the frame are kept, not dropped.
    """
    alpha = random.uniform(0.8, 1.2)  # contrast gain
    beta = random.uniform(-20, 20)  # brightness offset in 8-bit intensity units
    angle = random.uniform(-15, 15)  # rotation in degrees

    # Photometric jitter. The +-20 offset is only meaningful on the 0..255
    # scale, so it is applied to 8-bit images only; other dtypes are passed
    # on to the rotation unchanged.
    if img.dtype == np.uint8:
        jittered = img.astype(np.float32) * alpha + beta
        img = np.clip(np.rint(jittered), 0, 255).astype(np.uint8)

    # Rotate around the image centre, keeping the original frame size.
    h, w = img.shape[:2]
    matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
    img = cv2.warpAffine(img, matrix, (w, h), borderMode=cv2.BORDER_REFLECT)

    # Move the annotations with the very same 2x3 affine matrix that moved the
    # pixels, so both turn in the same direction and stay aligned.
    # reshape(-1, 2) also lets an empty annotation list pass through.
    coords = np.asarray(gt_coords, dtype=np.float64).reshape(-1, 2)
    coords = coords @ matrix[:, :2].T + matrix[:, 2]
    return img, coords

## Generate density map

Counting each person one by one in an image is tricky because of heavy overlap. A density map instead estimates how much 'mass' lies in each area: the brighter the heatmap, the more people there are.

In [None]:
def generate_density_map(gt_coords, img_shape, sigma=4.0):
    """Convert head coordinates into a density map that sums to the head count.

    Args:
        gt_coords: Iterable of (x, y) head coordinates.
        img_shape: Shape of the image; only the first two entries
            (height, width) are used.
        sigma: Standard deviation of the Gaussian used to spread each head.

    Returns:
        A float32 array of shape (height, width). Its sum equals the number
        of heads that fall inside the image; heads outside the frame are
        ignored. If no head is inside the frame the map is all zeros.
    """
    height, width = img_shape[:2]
    density_map = np.zeros((height, width), dtype=np.float32)

    # Accumulate one unit per head, so that several heads rounding to the
    # same pixel are all counted instead of overwriting each other.
    for x, y in gt_coords:
        col, row = int(round(x)), int(round(y))
        if 0 <= col < width and 0 <= row < height:
            density_map[row, col] += 1.0

    head_count = float(density_map.sum())
    if head_count == 0.0:
        # Nothing to blur or rescale; also avoids dividing by zero below.
        return density_map

    # Spread every point annotation into a Gaussian blob.
    density_map = cv2.GaussianBlur(density_map, (15, 15), sigma)

    # Rescale so the map integrates to the head count again
    # (factor is count / sum, not the other way round).
    blurred_sum = float(density_map.sum())
    if blurred_sum > 0.0:
        density_map *= head_count / blurred_sum
    return density_map

## Extract features

#### HOG

HOG is good for detecting human silhouettes.

In [22]:
def hog_features(img, orientations=9, px_per_cell=(4, 4), cells_per_block=(2, 2)):
    """Compute the HOG descriptor of ``img`` together with its visualisation.

    Args:
        img: Image passed straight to ``skimage.feature.hog``.
        orientations: Number of orientation bins.
        px_per_cell: Cell size in pixels; smaller cells capture finer detail.
        cells_per_block: Cells per normalisation block; larger blocks take in
            more spatial context.

    Returns:
        ``(features, hog_img)`` -- the descriptor and the visualisation image
        returned by ``hog`` when called with ``visualize=True``.
    """
    hog_settings = {
        'orientations': orientations,
        'pixels_per_cell': px_per_cell,
        'cells_per_block': cells_per_block,
        'visualize': True,
        'block_norm': 'L2-Hys',  # block normalisation scheme
    }
    descriptor, visualisation = hog(img, **hog_settings)
    return descriptor, visualisation

#### LBP

LBP captures texture, which helps separate people from the environment: clothing or hair has a different texture from concrete or a car.

In [23]:
def lbp_features(img, radius=4, n_points=24, method='uniform'):
    """Compute the local binary pattern map of ``img`` and its histogram.

    Args:
        img: Image passed straight to ``skimage.feature.local_binary_pattern``.
        radius: Radius of the circle on which neighbours are sampled.
        n_points: Number of neighbour points compared with the centre pixel.
        method: LBP variant forwarded to ``local_binary_pattern``.

    Returns:
        ``(lbp, hist)`` -- the texture map and a density-normalised histogram
        of its values using ``n_points + 2`` bins over ``[0, n_points + 2)``.
    """
    texture_map = local_binary_pattern(img, R=radius, P=n_points, method=method)

    n_bins = n_points + 2
    pattern_hist, _edges = np.histogram(
        texture_map,
        bins=n_bins,
        range=(0, n_bins),
        density=True,
    )
    return texture_map, pattern_hist

#### Multi-scale features

Compute each of the previous features on several scales of the image (1, 1/2, 1/4), so the model can learn to detect small, normal, and large people in the same image. This helps with big crowds, where some people are far away and others are close to the camera.

In [24]:
def gaussian_pyramid(img, scales=(1.0, 0.5, 0.25)):
    """Return ``img`` resized to each of the requested scales.

    Args:
        img: Source image.
        scales: Scale factors, applied equally to width and height. A scale
            of exactly 1.0 yields the original image object, not a copy.

    Returns:
        A list with one image per entry of ``scales``, in the same order.

    Note:
        Despite the name no Gaussian blur is applied here; downscaling uses
        ``cv2.INTER_AREA`` interpolation.
    """
    # The default is a tuple so it cannot be mutated between calls.
    return [
        img
        if scale == 1.0
        else cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        for scale in scales
    ]

def multi_scale_features(img, scales=(1.0, 0.5, 0.25)):
    """Extract HOG and LBP features from ``img`` at several resolutions.

    Args:
        img: Source image, forwarded to ``gaussian_pyramid``.
        scales: Scale factors at which the features are computed.

    Returns:
        A dict with keys ``'hog'`` and ``'lbp'``. Each value is a list with
        one entry per scale, in the order of ``scales``: the HOG descriptor
        and the LBP histogram of the rescaled image respectively.
    """
    features = {
        'hog': [],
        'lbp': [],
    }

    # Iterate over the images themselves; wrapping the pyramid in enumerate()
    # would hand (index, image) tuples to the feature extractors.
    for scaled_img in gaussian_pyramid(img, scales):
        hog_feats, _ = hog_features(scaled_img)
        features['hog'].append(hog_feats)

        _, lbp_hist = lbp_features(scaled_img)
        features['lbp'].append(lbp_hist)
    return features

## Classification (SVR)

#### train predictions

#### predict

## Results (MAE)