# Importing Modules

In [1]:
import torch
import timm
import einops
import tqdm
import cv2 as cv2
import numpy as np
import time
import os
import sys
from visualizer import Visualizer
import cv2
import time
from IPython.display import display, clear_output

## Helper Functions

In [2]:
def record_video(duration=5, video_name="output.mp4", fps=20.0):
    """Record from the default webcam into a video file, previewing frames inline.

    Args:
        duration: Recording length in seconds of wall-clock time.
        video_name: Path of the video file to write.
        fps: Frame rate declared to the video writer.

    Returns:
        ``video_name``, unchanged, so it can be handed straight to a loader.

    Raises:
        RuntimeError: If the webcam or the output file cannot be opened.
    """
    # Local import: the notebook's import cell only brings in
    # `display` and `clear_output` from IPython.display.
    from IPython.display import Image

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError("Could not open the default webcam (device 0)")

    # VideoWriter expects frames of exactly the size it was created with, so
    # ask the camera what it delivers instead of assuming 640x480. The old
    # fixed size is kept as a fallback when the backend reports nothing.
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 640
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 480

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(video_name, fourcc, fps, (width, height))

    try:
        if not out.isOpened():
            raise RuntimeError(f"Could not open {video_name!r} for writing")

        start_time = time.time()
        while True:
            ret, frame = cap.read()

            if ret:
                out.write(frame)

                # Show the frame as an actual image. Passing raw bytes to
                # display() only prints their repr, and ndarray.tostring()
                # is no longer available in current NumPy releases.
                encoded, jpeg = cv2.imencode('.jpg', frame)
                if encoded:
                    clear_output(wait=True)
                    display(Image(data=jpeg.tobytes()))

            # Stop once the requested duration has elapsed, even if the
            # camera is currently failing to deliver frames.
            if time.time() - start_time > duration:
                break
    finally:
        # Always free the camera and finalize the file, including on errors
        # or a manual interrupt. No OpenCV GUI window is opened in this
        # function, so only the capture and the writer need releasing.
        cap.release()
        out.release()

    return video_name

In [3]:
def load_video(pth):
    """Read every frame of a video into memory.

    Args:
        pth: Path (or other source accepted by ``cv2.VideoCapture``) to read.

    Returns:
        A ``(video, buffer)`` tuple: the still-open ``cv2.VideoCapture`` and
        the list of frames in the order they were read. Frames are stored
        exactly as OpenCV returns them (no color conversion is applied here).

    Raises:
        IOError: If the source cannot be opened.
    """
    video = cv2.VideoCapture(pth)
    if not video.isOpened():
        # Fail here with a clear message instead of returning an empty
        # buffer that breaks later when it is turned into a tensor.
        video.release()
        raise IOError(f"Could not open video: {pth!r}")

    # NOTE(review): these look like capture-device settings; a file-backed
    # source presumably ignores them and keeps its native size — confirm.
    video.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    video.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    video.set(cv2.CAP_PROP_FPS, 30)

    buffer = []
    while True:
        ret, frame = video.read()
        if not ret:
            # read() reports failure once no further frame is available.
            break
        buffer.append(frame)
    return video, buffer

## Prepare Your Video
An ideal length is around 10 seconds, though this also depends on your computer's performance.

In [4]:
# Uncomment the next line to record a 5-second video from the webcam:
#vid = record_video(5, "my_video.mp4")
vid = "my_video.mp4"
# ...or set the `vid` variable to the path of an existing video file.

In [5]:
# Fetch the "cotracker_w8" entry point of the facebookresearch/co-tracker
# repository through torch.hub (the local hub cache is reused when present).
cotracker = torch.hub.load("facebookresearch/co-tracker", "cotracker_w8")

Using cache found in /home/velocitatem/.cache/torch/hub/facebookresearch_co-tracker_main


In [6]:
# Decode the clip, then stack its frames into one float tensor laid out as
# (batch, frames, channels, height, width).
video, buffer = load_video(vid)
video_tensor = (
    torch.from_numpy(np.array(buffer))  # (frames, height, width, channels)
    .permute(0, 3, 1, 2)                # channels moved ahead of height/width
    .unsqueeze(0)                       # leading batch axis of size 1
    .float()
)

In [11]:
# Inspect the layout of the input tensor before handing it to the model.
video_tensor.shape

torch.Size([1, 145, 3, 480, 640])

**Important:** the dimensions in this explanation may differ from those of your video.

In this example the `video_tensor` object has a shape of `torch.Size([1, 145, 3, 480, 640])`, which breaks down as follows:

- The first dimension `[0]` is the batch dimension (added with `[None]` when the tensor was built); it has a size of 1 because we process a single video.
- The second dimension `[1]` indicates there are 145 frames in the video.
- The third dimension `[2]` represents the 3 color channels (OpenCV decodes frames in BGR order, so they are B, G, R here unless you convert them).
- The fourth `[3]` and fifth `[4]` dimensions specify the video dimensions, which are 480 pixels in height and 640 pixels in width.

So, for each of the 145 frames, you have a 3-channel color image with dimensions 480x640.

In [8]:
# Move both the model and its input to the GPU when one is available, so
# they live on the same device; otherwise everything stays on the CPU.
if torch.cuda.is_available():
    cotracker = cotracker.cuda()
    # This previously referenced the undefined name `video_torch`, which
    # raised NameError on every machine with CUDA.
    video_tensor = video_tensor.cuda()

## Running the Model
In the following code you can modify how many points you want to track in your video; they are laid out in a regular grid.

In [9]:
# Track a regular grid of points through the clip. The call returns the
# per-frame point coordinates (`tracks`) and a per-frame flag for each
# point (`visibility`).
tracks, visibility = cotracker(
    video_tensor,
    grid_size=4, # points per grid side: 4 gives the 4x4 = 16 tracked points
    grid_query_frame=0, # index of the frame from which the points are tracked
    backward_tracking=False
)

KeyboardInterrupt: 

In [26]:
# Show the dimensions of both model outputs, one per line.
print(tracks.shape, visibility.shape, sep="\n")

torch.Size([1, 145, 16, 2])
torch.Size([1, 145, 16])


The `tracks` tensor has a shape of `torch.Size([1, 145, 16, 2])`: a batch of 1, the 145 frames of the video, 16 tracked points, and 2 coordinates. We set a 4x4 grid, which gives 16 points (4^2). For each of the 145 frames and each of the 16 points we store a 2D coordinate (x, y), held in the last dimension of the tensor. The `visibility` tensor has a shape of `torch.Size([1, 145, 16])`: it holds a single visible/not-visible flag (0 or 1) per point per frame, so it needs no extra coordinate dimension.

In [27]:
# Persist both outputs as .npy files in the working directory so they can be
# reloaded without re-running the tracker. `.cpu()` brings each tensor back
# to host memory before the NumPy conversion.
np.save("tracks.npy", tracks.cpu().numpy()) # point coordinates per frame
np.save("visibility.npy", visibility.cpu().numpy()) # visibility flag per point per frame

In [28]:
from visualizer import Visualizer

In [29]:
# Configure the renderer that draws the tracked points over the video.
# NOTE(review): fps=30 here, while record_video writes its file at 20 fps —
# confirm the playback speed of the rendered video is as intended.
vis = Visualizer(
    save_dir="./", # directory where the video with the overlaid points is written
    grayscale=False,
    pad_value=100,
    fps=30,
    linewidth=2,
    show_first_frame=5,
    tracks_leave_trace=0
)

In [30]:
# Base name for the rendered video (no extension).
output_filename="my_video_tracked"
# Draw the tracked points onto the video and save the result in save_dir.
file = vis.visualize(
    video_tensor,
    tracks=tracks,
    visibility=visibility,
    filename=output_filename, # base name; the saved file is "<filename>_pred_track.mp4"
    query_frame=0,
    )

Video saved to ./my_video_tracked_pred_track.mp4


In [31]:
from IPython.display import Video
# Embed the rendered video in the notebook; the path mirrors the
# "<output_filename>_pred_track.mp4" name reported when it was saved.
Video(f"./{output_filename}_pred_track.mp4")