# 1. Setup & Imports
This segment installs any required dependencies and imports the necessary libraries.

In [1]:
# Install necessary libraries if not already installed
# !pip install tensorflow numpy opencv-python-headless matplotlib scikit-learn

import os
import cv2
import gc
import numpy as np
import h5py
import json
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score
import xml.etree.ElementTree as ET

import torch
from torch import nn
from torch.autograd import Variable
from torchvision import transforms
from torchvision.models import resnet50
import torch.nn.functional as F

In [2]:
# Set paths
DATASET_PATH = "dataset/personpath22/raw_data/pathtrack/pathtrack_release/"
TRAIN_PATH = os.path.join(DATASET_PATH, "train")
TEST_PATH = os.path.join(DATASET_PATH, "test")
TRAIN_HDF5_FILE = os.path.join("dataset", "personpath22", "pathtrack_matched", "train", "merged_data.h5")
TEST_HDF5_FILE = os.path.join("dataset", "personpath22", "pathtrack_matched", "test", "merged_data.h5")
TRAIN_INDIVIDUAL_H5_DIR = os.path.join("dataset", "personpath22", "pathtrack_matched", "train")
TEST_INDIVIDUAL_H5_DIR = os.path.join("dataset", "personpath22", "pathtrack_matched", "test")
FEATURES_DIR = os.path.join("dataset", "personpath22", "features")

# Create directories if they don't exist
os.makedirs(TRAIN_INDIVIDUAL_H5_DIR, exist_ok=True)
os.makedirs(TEST_INDIVIDUAL_H5_DIR, exist_ok=True)
os.makedirs(FEATURES_DIR, exist_ok=True)

# 2. Path and Data Loading Configuration
This part helps you configure paths to your dataset.

In [3]:
# Function to check if a sequence exists in an HDF5 file
def sequence_exists(h5_file_path, sequence_name):
    if os.path.exists(h5_file_path):
        with h5py.File(h5_file_path, 'r') as h5f:
            return sequence_name in h5f
    return False

# Function to parse info.xml and extract relevant data
def parse_info_xml(info_xml_path):
    tree = ET.parse(info_xml_path)
    root = tree.getroot()
    info_data = {elem.tag: elem.text for elem in root.find('doc')}
    return info_data

# Function to save frames, annotations, and info.xml data to HDF5 files
def save_frames_and_annotations_to_hdf5(data_path, h5_file_path, individual_h5_dir, batch_size=10):
    sequence_count = 0

    with h5py.File(h5_file_path, 'a') as h5f:
        for dir_name in os.listdir(data_path):
            dir_path = os.path.join(data_path, dir_name)

            if os.path.isdir(dir_path):
                sequence_name = os.path.basename(dir_path)
                individual_h5_file = os.path.join(individual_h5_dir, f"{dir_name}.h5")

                if sequence_exists(h5_file_path, sequence_name) and sequence_exists(individual_h5_file, sequence_name):
                    print(f"Skipping '{sequence_name}' as it already exists.")
                    continue

                with h5py.File(individual_h5_file, 'a') as h5_individual:
                    for subdir_name in os.listdir(dir_path):
                        subdir_path = os.path.join(dir_path, subdir_name)

                        if subdir_name == "img1":
                            frame_dataset = h5f.create_dataset(
                                f"{sequence_name}/frames", shape=(0, 224, 224, 3),
                                maxshape=(None, 224, 224, 3), chunks=True, dtype='uint8'
                            )
                            ind_frame_dataset = h5_individual.create_dataset(
                                f"{sequence_name}/frames", shape=(0, 224, 224, 3),
                                maxshape=(None, 224, 224, 3), chunks=True, dtype='uint8'
                            )

                            for img_name in sorted(os.listdir(subdir_path)):
                                img_path = os.path.join(subdir_path, img_name)
                                img = cv2.imread(img_path)
                                if img is not None:
                                    resized_img = cv2.resize(img, (224, 224))
                                    frame_dataset.resize(frame_dataset.shape[0] + 1, axis=0)
                                    frame_dataset[-1] = resized_img
                                    ind_frame_dataset.resize(ind_frame_dataset.shape[0] + 1, axis=0)
                                    ind_frame_dataset[-1] = resized_img

                            annotations_path = os.path.join(dir_path, "gt", "path_annots.json")
                            if os.path.exists(annotations_path):
                                with open(annotations_path, 'r') as f:
                                    annotations = json.load(f)
                                h5f[f"{sequence_name}/annotations"] = json.dumps(annotations).encode('utf-8')
                                h5_individual[f"{sequence_name}/annotations"] = json.dumps(annotations).encode('utf-8')

                            info_xml_path = os.path.join(dir_path, "info.xml")
                            if os.path.exists(info_xml_path):
                                info_data = parse_info_xml(info_xml_path)
                                for key, value in info_data.items():
                                    h5f[f"{sequence_name}"].attrs[key] = value
                                    h5_individual[f"{sequence_name}"].attrs[key] = value

                            print(f"Saved sequence '{sequence_name}'.")

                    sequence_count += 1
                    if sequence_count % batch_size == 0:
                        print(f"Clearing RAM after processing {sequence_count} sequences...")
                        gc.collect()

    print("All sequences have been processed.")

# Save training and test data
save_frames_and_annotations_to_hdf5(TRAIN_PATH, TRAIN_HDF5_FILE, TRAIN_INDIVIDUAL_H5_DIR)
save_frames_and_annotations_to_hdf5(TEST_PATH, TEST_HDF5_FILE, TEST_INDIVIDUAL_H5_DIR)

Skipping '-bnYpCiwV2Q_301_308' as it already exists.
Skipping '-cKE8pyfcZc_38_46' as it already exists.
Skipping '-DGzHCfmv5k_23_30' as it already exists.
Skipping '-DGzHCfmv5k_92_97' as it already exists.
Skipping '-fe32cvcpDI_378_388' as it already exists.
Skipping '-kX6T3DkExg_85_94' as it already exists.
Skipping '-LRxcWBl7P8_112_124' as it already exists.
Skipping '0-ENiq-1R5M_226_240' as it already exists.
Skipping '00np___nE5s_314_322' as it already exists.
Skipping '00np___nE5s_394_404' as it already exists.
Skipping '00np___nE5s_465_471' as it already exists.
Skipping '00np___nE5s_481_506' as it already exists.
Skipping '03tAll3Rnb8_145_160' as it already exists.
Skipping '08u_l3VMtFM_204_243' as it already exists.
Skipping '08u_l3VMtFM_848_861' as it already exists.
Skipping '08u_l3VMtFM_93_116' as it already exists.
Skipping '08xgGFk5fds_739_749' as it already exists.
Skipping '08xgGFk5fds_762_768' as it already exists.
Skipping '08xgGFk5fds_905_915' as it already exists.
Sk

In [None]:
# # Function to process a sequence
# def process_sequence(sequence_name):
#     individual_h5_file = os.path.join(TRAIN_INDIVIDUAL_H5_DIR, f"{sequence_name}.h5")
    
#     if not os.path.exists(individual_h5_file):
#         print(f"Error: Individual HDF5 file for '{sequence_name}' does not exist.")
#         return

#     with h5py.File(individual_h5_file, 'r') as h5_individual:
#         frames = h5_individual[sequence_name]['frames'][:]
        
#         # Load detections from det_rcnn.txt
#         det_rcnn_path = os.path.join(TRAIN_PATH, sequence_name, "det", "det_rcnn.txt")
#         detections = []

#         if os.path.exists(det_rcnn_path):
#             with open(det_rcnn_path, 'r') as f:
#                 det_data = f.readlines()
#                 for line in det_data:
#                     parts = line.strip().split(',')
#                     if len(parts) >= 7:
#                         detections.append({
#                             "frame_number": int(float(parts[0])),  # Convert to float first then to int
#                             "class_id": int(float(parts[1])),      # Convert to float first then to int
#                             "score": float(parts[2]),
#                             "bbox": [float(part) for part in parts[3:7]]
#                         })

#         # Extract features from frames using the CNN model
#         train_features = extract_features_from_frames(frames, cnn_model)
        
#         # Save features in the specified directory
#         np.save(os.path.join(FEATURES_DIR, f"{sequence_name}_features.npy"), train_features)
#         print(f"Extracted features for sequence '{sequence_name}': {train_features.shape}")

#         # Clear memory after processing each sequence
#         del frames
#         del detections
#         gc.collect()  # Force garbage collection

# # Process each sequence in the training set
# for dir_name in os.listdir(TRAIN_PATH):
#     sequence_name = os.path.basename(dir_name)
#     process_sequence(sequence_name)

# print("All sequences have been processed.")

# 3. CNN Model Definition and Feature Extraction
This section defines the ResNet-50 model for feature extraction.

In [7]:
# Define the ResNet model for feature extraction
def create_resnet_model():
    model = resnet50(pretrained=True)
    model = nn.Sequential(*(list(model.children())[:-1]))
    model.eval()
    return model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Function to preprocess frames and extract features
def extract_features_from_frames(frames, model):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    
    features = []
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with torch.no_grad():
        for frame in frames:
            input_tensor = transform(frame).unsqueeze(0).to(device)
            feature = model(input_tensor).squeeze().cpu().numpy()
            features.append(feature)

    return np.array(features)

# Initialize the CNN model
cnn_model = create_resnet_model()

# Function to process a sequence
def process_sequence(sequence_name):
    individual_h5_file = os.path.join(TRAIN_INDIVIDUAL_H5_DIR, f"{sequence_name}.h5")
    
    if not os.path.exists(individual_h5_file):
        print(f"Error: Individual HDF5 file for '{sequence_name}' does not exist.")
        return

    with h5py.File(individual_h5_file, 'r') as h5_individual:
        frames = h5_individual[sequence_name]['frames'][:]
        
        # Load detections from det_rcnn.txt
        det_rcnn_path = os.path.join(TRAIN_PATH, sequence_name, "det", "det_rcnn.txt")
        detections = []

        if os.path.exists(det_rcnn_path):
            with open(det_rcnn_path, 'r') as f:
                det_data = f.readlines()
                for line in det_data:
                    parts = line.strip().split(',')
                    if len(parts) >= 7:
                        detections.append({
                            "frame_number": int(float(parts[0])),  # Convert to float first then to int
                            "class_id": int(float(parts[1])),      # Convert to float first then to int
                            "score": float(parts[2]),
                            "bbox": [float(part) for part in parts[3:7]]
                        })

        # Extract features from frames using the CNN model
        train_features = extract_features_from_frames(frames, cnn_model)
        
        # Save features in the specified directory
        np.save(os.path.join(FEATURES_DIR, f"{sequence_name}_features.npy"), train_features)
        print(f"Extracted features for sequence '{sequence_name}': {train_features.shape}")

        # Clear memory after processing each sequence
        del frames
        del detections
        gc.collect()  # Force garbage collection

# Process each sequence in the training set
for dir_name in os.listdir(TRAIN_PATH):
    sequence_name = os.path.basename(dir_name)
    process_sequence(sequence_name)

print("All sequences have been processed.")

Extracted features for sequence '-bnYpCiwV2Q_301_308': (174, 2048)
Extracted features for sequence '-cKE8pyfcZc_38_46': (239, 2048)
Extracted features for sequence '-DGzHCfmv5k_23_30': (174, 2048)
Extracted features for sequence '-DGzHCfmv5k_92_97': (125, 2048)
Extracted features for sequence '-fe32cvcpDI_378_388': (239, 2048)
Extracted features for sequence '-kX6T3DkExg_85_94': (269, 2048)
Extracted features for sequence '-LRxcWBl7P8_112_124': (299, 2048)
Extracted features for sequence '0-ENiq-1R5M_226_240': (335, 2048)
Extracted features for sequence '00np___nE5s_314_322': (238, 2048)
Extracted features for sequence '00np___nE5s_394_404': (298, 2048)
Extracted features for sequence '00np___nE5s_465_471': (179, 2048)
Extracted features for sequence '00np___nE5s_481_506': (750, 2048)
Extracted features for sequence '03tAll3Rnb8_145_160': (449, 2048)
Extracted features for sequence '08u_l3VMtFM_204_243': (934, 2048)
Extracted features for sequence '08u_l3VMtFM_848_861': (312, 2048)
Ext

# 4. LSTM Model Definition
This defines the LSTM model that will take the CNN’s output features as input.

In [13]:
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        x = x.contiguous()  # Ensure the input tensor is contiguous
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return torch.sigmoid(out)

# Initialize the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize LSTM model
input_size = 2048  # ResNet-50 feature size
hidden_size = 128
num_layers = 2
output_size = 1
lstm_model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

# Training setup
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

# Load extracted features and labels
train_features = []
train_labels = []

for dir_name in os.listdir(TRAIN_PATH):
    sequence_name = os.path.basename(dir_name)
    feature_path = os.path.join(FEATURES_DIR, f"{sequence_name}_features.npy")
    label_path = os.path.join(TRAIN_PATH, sequence_name, "gt", "labels.npy")  # Assuming labels are saved as NumPy arrays

    if os.path.exists(feature_path) and os.path.exists(label_path):
        features = np.load(feature_path)
        labels = np.load(label_path)
        
        train_features.append(features)
        train_labels.append(labels)

# Convert lists to tensors properly
train_features_np = np.array(train_features, dtype=np.float32)  # Convert list to NumPy array first
train_labels_np = np.array(train_labels, dtype=np.float32)

# Check the shape of train_features_np before conversion
print(f"Shape before conversion: {train_features_np.shape}")  # Debugging

# Ensure it's at least 3D (batch_size, seq_len, input_size)
if train_features_np.ndim == 2:  # (batch_size, input_size)
    train_features_np = np.expand_dims(train_features_np, axis=1)  # Add sequence dimension
elif train_features_np.ndim == 1:  # (input_size)
    train_features_np = np.expand_dims(train_features_np, axis=0)  # Add batch dimension
    train_features_np = np.expand_dims(train_features_np, axis=1)  # Add sequence dimension

train_features_tensor = torch.tensor(train_features_np).to(device)
labels_tensor = torch.tensor(train_labels_np).float().unsqueeze(1).to(device)  # Ensure (batch_size, 1)

print(f"Train Features Shape after conversion: {train_features_tensor.shape}")  # Debugging

# Training loop
for epoch in range(NUM_EPOCHS):
    lstm_model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = lstm_model(train_features_tensor)
    loss = criterion(outputs, labels_tensor)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {loss.item():.4f}")


# Real-time tracking function
def run_real_time_tracking(model, cnn_model):
    model.eval()
    model.to(device)

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Unable to read frame from webcam.")
            break

        frame_resized = cv2.resize(frame, (224, 224))
        frame_tensor = torch.tensor(frame_resized, dtype=torch.float32).permute(2, 0, 1) / 255.0
        frame_tensor = frame_tensor.unsqueeze(0).to(device)

        with torch.no_grad():
            features = cnn_model(frame_tensor)
            features = features.unsqueeze(1)  # Add time dimension
            features = features.contiguous()  # Ensure the tensor is contiguous
            prediction = model(features)  # Forward pass through FC model

        label = "Person" if prediction.item() > 0.5 else "No Person"
        cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow("Real-Time Tracking", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

# Run the tracking function
run_real_time_tracking(lstm_model, cnn_model)

Using device: cuda
Shape before conversion: (0,)
Train Features Shape after conversion: torch.Size([1, 1, 0])


RuntimeError: input.size(-1) must be equal to input_size. Expected 2048, got 0

# 5. Training with Checkpoints
Include checkpoints to save the best-performing model during training.

In [41]:
# Training setup
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(NUM_EPOCHS):
    lstm_model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = lstm_model(train_features_tensor.to(device))
    loss = criterion(outputs, labels_tensor.to(device))

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {loss.item():.4f}")

RuntimeError: cuDNN error: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguous input.

# 6. Evaluation and Metrics
Calculate the IoU and MOTA metrics for model evaluation

In [42]:
def calculate_iou(box1, box2):
    xA = max(box1[0], box2[0])
    yA = max(box1[1], box2[1])
    xB = min(box1[2], box2[2])
    yB = min(box1[3], box2[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)
    box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    
    return interArea / float(box1Area + box2Area - interArea)

# Example usage
iou = calculate_iou([50, 50, 100, 100], [60, 60, 110, 110])
print(f"IoU: {iou:.2f}")

IoU: 0.47


# 7. Real-Time Tracking with Webcam/Video Input
This part enables real-time tracking using your webcam or video input.

In [47]:
def run_real_time_tracking(model, cnn_model):
    model.eval()
    model.to(device)

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Unable to read frame from webcam.")
            break

        frame_resized = cv2.resize(frame, (224, 224))
        frame_tensor = torch.tensor(frame_resized, dtype=torch.float32).permute(2, 0, 1) / 255.0
        frame_tensor = frame_tensor.unsqueeze(0).to(device)

        with torch.no_grad():
            features = cnn_model(frame_tensor)
            features = features.unsqueeze(1)
            prediction = model(features)

        label = "Person" if prediction.item() > 0.5 else "No Person"
        cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow("Real-Time Tracking", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

# Run the tracking function
run_real_time_tracking(lstm_model, cnn_model)

Using device: cuda


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x128 and 2048x128)

# 8. Save and Load Model
This part ensures that you save and reload your trained model.

In [None]:
# Save model
torch.save(lstm_model.state_dict(), "lstm_tracking_model.pth")

# Load model for future use
# lstm_model.load_state_dict(torch.load("lstm_tracking_model.pth"))