In [None]:
video_path = '/content/ball.mp4'
import cv2
import numpy as np

!pip install supervision
import supervision as sv

byte_tracker = sv.ByteTrack()
cap = cv2.VideoCapture(video_path)

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter('output.mp4', fourcc, 30.0, (1280,  720))

in_car = 0
out_car = 0

while cap.isOpened():

    cap = cv2.VideoCapture(video_path)

    # Read the first frame
    ret, first_frame = cap.read()
    if not ret:
        print("Failed to load the video.")
        exit()

    # Select the bounding box
    detection = cv2.selectROI(first_frame)

    # # convert to Detections
    # detections = sv.Detections.from_ultralytics(results)
    # # only consider class id from selected_classes define above
    # detections = detections[np.isin(detections.class_id, selected_classes)]

    detections = byte_tracker.update_with_detections(detection)
    frame = box_anotator(frame, detections)


    out.write(frame)

# Release everything if job is finished
cap.release()
out.release()
cv2.destroyAllWindows()

In [3]:
import torch
import torch.nn as nn
import torchvision.models as models

class ResNetBackbone(nn.Module):
    def __init__(self, pretrained=True):
        super(ResNetBackbone, self).__init__()
        resnet = models.resnet50(pretrained=pretrained)
        # Extract layers up to conv4
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        return self.feature_extractor(x)

class Neck(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Neck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        return self.relu(x)

class SiamRPNHead(nn.Module):
    def __init__(self, in_channels, num_anchors):
        super(SiamRPNHead, self).__init__()
        self.cls_head = nn.Conv2d(in_channels, num_anchors * 2, kernel_size=1)
        self.reg_head = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1)

    def forward(self, x):
        cls_output = self.cls_head(x)  # Classification output
        reg_output = self.reg_head(x)  # Regression output
        return cls_output, reg_output

class SiamRPNPP(nn.Module):
    def __init__(self, backbone, neck, head):
        super(SiamRPNPP, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    def forward(self, template, search):
        # Extract features from template and search images
        template_feat = self.backbone(template)
        search_feat = self.backbone(search)

        # Pass features through neck
        template_feat = self.neck(template_feat)
        search_feat = self.neck(search_feat)

        # Correlation (or cross-correlation)
        corr_feat = self.cross_correlation(template_feat, search_feat)

        # Pass through classification and regression heads
        cls_output, reg_output = self.head(corr_feat)
        return cls_output, reg_output

    def cross_correlation(self, template, search):
        # Perform depthwise cross-correlation
        batch_size, c, h, w = search.size()
        template = template.view(batch_size, c, -1).permute(0, 2, 1)
        search = search.view(batch_size, c, -1)
        corr = torch.matmul(template, search)  # Batch matrix multiplication
        corr = corr.view(batch_size, 1, h, w)
        return corr

# Instantiate the components
backbone = ResNetBackbone(pretrained=True)
neck = Neck(in_channels=2048, out_channels=256)
head = SiamRPNHead(in_channels=256, num_anchors=5)

# Create the SiamRPN++ model
model = SiamRPNPP(backbone, neck, head)

# # Example usage
# template = torch.randn(1, 3, 127, 127)  # Template image
# search = torch.randn(1, 3, 255, 255)    # Search image
# cls_output, reg_output = model(template, search)
# print("Classification output shape:", cls_output.shape)
# print("Regression output shape:", reg_output.shape)


In [2]:
# Example usage
template = torch.randn(1, 3, 127, 127)  # Template image
search = torch.randn(1, 3, 255, 255)    # Search image
cls_output, reg_output = model(template, search)
print("Classification output shape:", cls_output.shape)
print("Regression output shape:", reg_output.shape)


RuntimeError: shape '[1, 1, 8, 8]' is invalid for input of size 1024

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

# Backbone with ResNet
class ResNetBackbone(nn.Module):
    def __init__(self):
        super(ResNetBackbone, self).__init__()
        resnet = models.resnet50(pretrained=True)  # Load pretrained ResNet-50
        self.features = nn.Sequential(*list(resnet.children())[:-2])  # Remove fully connected layer and avgpool

    def forward(self, x):
        return self.features(x)

# Neck (e.g., Adjust Layer)
class Neck(nn.Module):
    def __init__(self, in_channels=2048, out_channels=256):
        super(Neck, self).__init__()
        self.adjust = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        return self.adjust(x)

# RPN Head
class RPNHead(nn.Module):
    def __init__(self, in_channels=256, anchor_num=5):
        super(RPNHead, self).__init__()
        self.cls = nn.Conv2d(in_channels, anchor_num * 2, kernel_size=1)  # Classification
        self.reg = nn.Conv2d(in_channels, anchor_num * 4, kernel_size=1)  # Regression

    def forward(self, x):
        cls_out = self.cls(x)
        reg_out = self.reg(x)
        return cls_out, reg_out

# SiamRPN++ Model
class SiamRPNPlusPlus(nn.Module):
    def __init__(self):
        super(SiamRPNPlusPlus, self).__init__()
        self.backbone = ResNetBackbone()
        self.neck = Neck()
        self.head = RPNHead()

    def forward(self, z, x):
        z_feat = self.backbone(z)
        x_feat = self.backbone(x)

        z_feat = self.neck(z_feat)
        x_feat = self.neck(x_feat)

        response = self.cross_correlation(z_feat, x_feat)
        cls_out, reg_out = self.head(response)
        return cls_out, reg_out

    @staticmethod
    def cross_correlation(z, x):
        N, C, H, W = x.size()
        z = z.view(N, C, -1).permute(0, 2, 1)  # [N, H*W, C]
        x = x.view(N, C, -1)  # [N, C, H*W]
        response = torch.matmul(z, x).view(N, H, W, H, W)  # [N, H, W, H, W]
        return response

# Webcam-based Tracking Application
import cv2

def select_roi_and_track(model, device):
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Cannot open webcam.")
        return

    _, frame = cap.read()
    roi = cv2.selectROI("Select ROI", frame, fromCenter=False)
    x, y, w, h = map(int, roi)

    template = frame[y:y+h, x:x+w]
    template_tensor = preprocess(template).to(device)

    model.eval()
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        search_tensor = preprocess(frame).to(device)
        with torch.no_grad():
            cls_out, reg_out = model(template_tensor, search_tensor)

        # Use cls_out and reg_out to calculate object position (simplified for this example)
        cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
        cv2.imshow("Tracking", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

def preprocess(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.resize(frame, (255, 255))
    frame = torch.tensor(frame).permute(2, 0, 1).unsqueeze(0).float() / 255.0
    return frame

# Main execution
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SiamRPNPlusPlus().to(device)

    # Optionally load pretrained weights
    pretrained_weights_path = "siamrpnplusplus.pth"
    model.load_state_dict(torch.load(pretrained_weights_path, map_location=device))

    select_roi_and_track(model, device)

if __name__ == "__main__":
    main()


In [None]:
import cv2

# Open webcam
cap = cv2.VideoCapture(0)

# Read the first frame
ret, frame = cap.read()
if not ret:
    print("Failed to grab frame")
    exit()

# Select ROI
roi = cv2.selectROI("Select ROI", frame, fromCenter=False, showCrosshair=True)
cv2.destroyWindow("Select ROI")

# Crop the ROI as the template
x, y, w, h = roi
template = frame[y:y+h, x:x+w]

# Resize the template to 127x127
template = cv2.resize(template, (127, 127))


In [None]:
import torch
from torchvision.transforms import functional as F

while True:
    # Read a new frame
    ret, frame = cap.read()
    if not ret:
        break

    # Define a search region around the last known position
    search_x, search_y = max(x - w, 0), max(y - h, 0)
    search_w, search_h = 3 * w, 3 * h  # Enlarge the search area
    search = frame[search_y:search_y+search_h, search_x:search_x+search_w]

    # Resize the search region to 255x255
    search = cv2.resize(search, (255, 255))

    # Convert template and search images to tensors
    template_tensor = F.to_tensor(template).unsqueeze(0).cuda()
    search_tensor = F.to_tensor(search).unsqueeze(0).cuda()

    # Forward pass through the model
    with torch.no_grad():
        cls_output, reg_output = model(template_tensor, search_tensor)

    # Decode the model's outputs to get the new bounding box
    # (You will need to implement the decoding logic based on your regression output)
    x, y, w, h = decode_bbox(cls_output, reg_output)

    # Update the bounding box and draw it on the frame
    cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

    # Display the frame
    cv2.imshow("Tracking", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
