# Apply Transforms to Video datasets

Good demo here (using Kinetics):
https://pytorch.org/vision/stable/auto_examples/plot_video_api.html#sphx-glr-auto-examples-plot-video-api-py 

In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
# torch.manual_seed(69)

import cv2
import numpy as np
# import skvideo.io
import time
import random
from operator import itemgetter
import sys

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Imports 
# from lhotse import CutSet, RecordingSet, align_with_torchaudio, annotate_with_whisper
# from tqdm import tqdm
# from pprint import pprint
# from dataclasses import asdict
# import torch
# from os import path
# from pydub import AudioSegment
# import sys
# sys.path.append('../../video-pretrained-transformer/')
from whisper_audio.CaptionPreprocessing import CaptionPreprocessing
import subprocess
import os
# Pranav's CLIP Video Pre-Processing

# We want to extract EXACTLY n frames from a video given a start and end time
# We also want to use random sampling if possible (1-30 fps for now)
# def get_frames(video_path, n_frames = 60, words = 15, threshold_words = 30)
#   frame_rate = get_frame_rate(video_path)
#   list_of_start_end_dict = CaptionPreprocessing(video_path)
#   for start_end_dict in list_of_start_end_dict:
#       start, end = start_end_dict['start'], start_end_dict['end']
#       if end - start < n_frames/frame_rate:
#           return ERROR
#       Get random frames

class YoutubeDataPreprocessor:
    """Pre-process videos: resample frame partitions and cut caption-aligned segments.

    Attributes:
        data_path: Directory scanned by ``process_video_dataset``.
        extension: File suffix a file must end with to be processed.
        debug: When true, extra progress information is printed.
        caption_preprocessor: ``CaptionPreprocessing`` instance used to obtain
            the start/end timestamps of each segment.
    """

    def __init__(self, data_path, extension=".mp4", debug=False):
        """Store the configuration and create the caption preprocessor."""
        self.data_path = data_path
        self.extension = extension
        self.debug = debug
        self.caption_preprocessor = CaptionPreprocessing()

    @staticmethod
    def _sample_partitions(num_frames, frames_per_partition, num_partitions, frame_separation):
        """Pick up to ``num_partitions`` random, non-overlapping frame-index lists.

        Each partition holds ``frames_per_partition`` indices spaced
        ``frame_separation + 1`` apart. Fewer partitions (possibly none) are
        returned when the video is too short to fit them all.

        Returns:
            A list of index lists, sorted by their first frame index.

        Raises:
            ValueError: If ``frames_per_partition < 1`` or ``frame_separation < 0``.
        """
        if frames_per_partition < 1:
            raise ValueError("frames_per_partition must be at least 1")
        if frame_separation < 0:
            raise ValueError("frame_separation must be non-negative")

        stride = frame_separation + 1
        # Number of consecutive source frames one partition spans
        # (equal to (frames_per_partition-1)*frame_separation + frames_per_partition).
        span = (frames_per_partition - 1) * stride + 1

        # Same candidate range as before: starts in [0, num_frames - span - 1).
        free_starts = set(range(max(num_frames - span - 1, 0)))

        partitions = []
        for _ in range(num_partitions):
            if not free_starts:
                break

            # TODO: ADD IN CHECK FOR NUMBER OF WORDS IN SEGMENT
            start = random.choice(tuple(free_starts))
            partitions.append(list(range(start, start + span, stride)))

            # Drop every start whose span would intersect the one just taken,
            # so that later partitions can never overlap it.
            free_starts.difference_update(range(start - span + 1, start + span))

        partitions.sort(key=itemgetter(0))
        return partitions

    def write_resampled_video(self, video_reader, resample_name, frames_per_partition=64, num_partitions=5, frame_separation=3, extension=".mp4"):
        """Write randomly placed, sub-sampled partitions of a video to separate files.

        Args:
            video_reader: Reader exposing ``getShape()`` (a 4-tuple whose first
                item is the frame count) and ``nextFrame()`` (an iterable of
                frames), e.g. the ``skvideo.io.FFmpegReader`` created in
                ``process_video_old``.
            resample_name: Output path prefix; partition ``k`` is written to
                ``f"{resample_name}_{k}{extension}"``.
            frames_per_partition: Number of frames written per partition.
            num_partitions: Number of partitions requested.
            frame_separation: Number of source frames skipped between two
                written frames.
            extension: Suffix of the output files.

        Returns:
            None. A warning is printed when fewer than ``num_partitions``
            partitions fit into the video; nothing is written when none fit.
        """
        # The top-of-file import of skvideo is commented out, so import it here.
        import skvideo.io

        num_frames, _height, _width, _channels = video_reader.getShape()
        partitions = self._sample_partitions(num_frames, frames_per_partition, num_partitions, frame_separation)

        if len(partitions) < num_partitions:
            print(f"[WARNING] {resample_name} has less than {num_partitions} (number of partitions: {len(partitions)})")

        if not partitions:
            return

        partition_idx = 0
        wanted = set(partitions[partition_idx])
        last_wanted = partitions[partition_idx][-1]
        writer = None

        try:
            for frame_idx, frame in enumerate(video_reader.nextFrame()):
                if frame_idx not in wanted:
                    continue

                # Open the output lazily so no empty file is left behind.
                if writer is None:
                    out_name = f"{resample_name}_{partition_idx}{extension}"
                    writer = skvideo.io.FFmpegWriter(out_name)
                    if self.debug:
                        print(out_name)

                writer.writeFrame(frame)

                # Only move on AFTER the last frame of the partition is written.
                if frame_idx == last_wanted:
                    writer.close()
                    writer = None
                    partition_idx += 1
                    if partition_idx == len(partitions):
                        break
                    wanted = set(partitions[partition_idx])
                    last_wanted = partitions[partition_idx][-1]
        finally:
            # Covers a reader that ends early as well as errors while writing.
            if writer is not None:
                writer.close()

        return

    def process_video_old(self, video_name):
        """Resample ``video_name`` into ``./test_<k>.mp4`` partitions (legacy path).

        The elapsed time is printed when ``self.debug`` is set.
        """
        # The top-of-file import of skvideo is commented out, so import it here.
        import skvideo.io

        video_reader = skvideo.io.FFmpegReader(video_name)
        try:
            start = time.time()
            self.write_resampled_video(video_reader, "./test", frames_per_partition=64, num_partitions=5, frame_separation=3, extension=".mp4")
            end = time.time() - start

            if self.debug:
                print(f"[INFO] video took {end} s for resampling")
        finally:
            video_reader.close()

    # Planned replacement (pseudo-code):
    # def get_frames(video_path, n_frames = 60, words = 15, threshold_words = 30)
    #   frame_rate = get_frame_rate(video_path)
    #   list_of_start_end_dict = CaptionPreprocessing(video_path)
    #   for start_end_dict in list_of_start_end_dict:
    #       start, end = start_end_dict['start'], start_end_dict['end']
    #       if end - start < n_frames/frame_rate:
    #           return ERROR
    #       Get random frames

    def get_frame_segment(self, video_name, segment, segment_start, segment_end):
        """Dump the frames of one time segment as PNGs and as an mp4 clip.

        Every frame whose position lies strictly between ``segment_start`` and
        ``segment_end`` (both in seconds) is written to
        ``./segment_videos/frame_<n>.png`` (``n`` is the 1-based read counter)
        and appended to ``./segment_video.mp4``. Both outputs use fixed paths,
        so each call overwrites the results of the previous one.

        Args:
            video_name: Path of the video to read.
            segment: The segment record; currently unused.
            segment_start: Segment start time in seconds.
            segment_end: Segment end time in seconds.

        Returns:
            None. The video path and the min/max timestamps seen (in
            milliseconds) are printed.
        """
        print(video_name)

        # Make sure the output directory for the per-frame PNGs exists.
        os.makedirs("./segment_videos", exist_ok=True)

        cap = cv2.VideoCapture(video_name)
        result = None
        timestamps = []

        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            size = (
                int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            )

            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            result = cv2.VideoWriter('./segment_video.mp4', fourcc, fps, size)

            timestamps.append(cap.get(cv2.CAP_PROP_POS_MSEC))
            cnt = 0

            while cap.isOpened():
                cnt += 1
                frame_exists, curr_frame = cap.read()
                if not frame_exists:
                    break

                curr_timestamp = cap.get(cv2.CAP_PROP_POS_MSEC)
                timestamps.append(curr_timestamp)

                # ensure curr_frame is within curr_segment (position is in ms)
                if segment_start < curr_timestamp / 1000.0 < segment_end:
                    cv2.imwrite(f"./segment_videos/frame_{cnt}.png", curr_frame)
                    result.write(curr_frame)

                    # todo: make python list of CV2 frames per segment.
                    # todo: Add clip model for embeddings.
        finally:
            # Always hand the capture and the writer back, even on errors.
            cap.release()
            if result is not None:
                result.release()

        # Planned per-video output record (sketch, not produced yet):
        # {
        #     "youtube_name": "2ZokZgnjrYs",
        #     "filename": video_name,
        #     "youtube_id": "Hawaii Vlog",
        #     "segment_length_seconds": 15.0,
        #     "captions": "Daniel's chosen format here",
        #     "segment_start_time": [<segment_start_times_list>],
        #     "segment_end_time": [<segment_end_times_list>],
        #     "frame_embeddings": [<frame_embeddings_list>],
        #     "audio_embeddings": [<audio_embeddings_list>],
        #     "text_caption_embeddings": [<text_embeddings_list>],
        #     "scene_graph_embeddings": [<scene_graph_embeddings_list>],
        #     # "labels": [<labels_list>],
        # }

        print(min(timestamps), max(timestamps))

    def process_video(self, video_name):
        """Run caption preprocessing on a video and extract every segment's frames.

        Returns:
            The segments returned by
            ``self.caption_preprocessor.get_segments_thresholded()``; each is
            indexed with ``'start'`` and ``'end'``.
        """
        self.caption_preprocessor.process_mp4(video_name)

        segment_timestamps = self.caption_preprocessor.get_segments_thresholded()

        for segment_timestamp in segment_timestamps:
            self.get_frame_segment(video_name, segment_timestamp, segment_timestamp['start'], segment_timestamp['end'])

        return segment_timestamps

    def process_video_dataset(self):
        """Process every file in ``self.data_path`` that ends with ``self.extension``."""
        for f in os.listdir(self.data_path):
            if f.endswith(self.extension):
                self.process_video(os.path.join(self.data_path, f))



In [None]:
# Run the full preprocessing pipeline once on a single test video.
# `path` and `result` are reused by later cells.
# path = "../../data/massive_youtube_data"
path = "/home/kastan/thesis/data/simple_test_data"
data_preprocessor = YoutubeDataPreprocessor(path)
test_file = '/home/kastan/thesis/data/simple_test_data/rick_roll.mp4'
# test_file = "/home/kastan/thesis/data/whisper_directory/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4"

# import cProfile
# cProfile.run('data_preprocessor.process_video(test_file)')

# process_video returns the segments it extracted frames for
result = data_preprocessor.process_video(test_file)
# print(result)
# data_preprocessor.process_video(test_file)


In [4]:
# Danny test cell
# Runs the same test video through a freshly constructed preprocessor 30 times,
# printing the iteration index after each successful run.
for i in range(30):
    # path = "../../data/massive_youtube_data"
    
    path = "/home/kastan/thesis/data/simple_test_data"
    data_preprocessor = YoutubeDataPreprocessor(path)
    test_file = '/home/kastan/thesis/data/simple_test_data/rick_roll.mp4'
    # test_file = "/home/kastan/thesis/data/whisper_directory/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4"

    # import cProfile
    # cProfile.run('data_preprocessor.process_video(test_file)')
    result = data_preprocessor.process_video(test_file)
    # print(result)
    # data_preprocessor.process_video(test_file)
    print(i)


0


Scanning audio files (*.wav): 1it [00:00, 889.38it/s]


AssertionError: We don't support forced alignment of cuts with overlapping supervisions (cut ID: 'rick_roll')

In [18]:
# Extract the frames of the first segment returned by process_video.
data_preprocessor = YoutubeDataPreprocessor(path)
first_segment = result[0]
# get_frame_segment takes (video_name, segment, segment_start, segment_end);
# the segment record itself has to be passed as the second argument.
data_preprocessor.get_frame_segment(
    '/home/kastan/thesis/data/simple_test_data/rick_roll.mp4',
    first_segment,
    first_segment['start'],
    first_segment['end'],
)

/home/kastan/thesis/data/simple_test_data/rick_roll.mp4
0.0 211560.0


# TODO:(Pranav, Kastan) Define Dataset class

This doesn't work yet; it is just example code from the PyTorch docs!

In [None]:
class VPT_Dataset(Dataset):
    """Placeholder dataset, still based on the image/landmarks example from the docs.

    TODO: Adapt this data class to video data. Until then it reads a CSV of
    annotations whose first column is an image file name and whose remaining
    columns are landmark coordinates.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        # TODO: Adapt this
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        # __len__ and __getitem__ read these two attributes, so they have to
        # be set for the class to be usable at all.
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        # Placeholders for the planned video-based implementation (unused so far).
        self.video_file = []
        self.img_embeddings = []
        self.transform = transform

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        """Return ``{'image': ..., 'landmarks': ...}`` for row ``idx``.

        The sample is passed through ``self.transform`` when one was given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # todo: adapt this to our data
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        # One (x, y) pair per row
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}

        if self.transform:
            sample = self.transform(sample)

        return sample

# Define Dataset transformations (using Compose)

### Transformations used in X-CLIP

We could probably steal their implementation, too: https://github.com/microsoft/VideoX/blob/master/X-CLIP/datasets/rand_augment.py

```python
_RAND_CHOICE_WEIGHTS_0 = {
    "Rotate": 0.3,
    "ShearX": 0.2,
    "ShearY": 0.2,
    "TranslateXRel": 0.1,
    "TranslateYRel": 0.1,
    "Color": 0.025,
    "Sharpness": 0.025,
    "AutoContrast": 0.025,
    "Solarize": 0.005,
    "SolarizeAdd": 0.005,
    "Contrast": 0.005,
    "Brightness": 0.005,
    "Equalize": 0.005,
    "Posterize": 0,
    "Invert": 0,
}
```

In [None]:
# Torchscript (torch.jit.script) an optimizing JIT runtime compiler for PyTorch. 
# Compiled to C++, faster. I've read data augmentation is CPU-intensive, so this might help.

# 📜 ⭐️ Docs on all transforms: https://pytorch.org/vision/stable/transforms.html 

# Kastan's suggestions (helped by Copilot): 
# RandomRotation(degrees[, interpolation, …])
# RandomHorizontalFlip(p=0.5)
# RandomVerticalFlip(p=0.5)
# RandomResizedCrop(size, scale=(0.08, 1.0), ratio=(0.75, 1.3333333333333333), interpolation=2)
# RandomCrop(size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant')
# RandomAffine(degrees, translate=None, scale=None, shear=None, resample=False, fillcolor=0)
# RandomPerspective(distortion_scale=0.5, p=0.5, interpolation=3, fill=0)
# RandomApply(transforms, p=0.5)
# RandomChoice(transforms)
# RandomOrder(transforms) 
# ColorJitter(brightness=0, contrast=0, saturation=0, hue=0)
# Grayscale(num_output_channels=1)
# Pad(padding, fill=0, padding_mode='constant')
# LinearTransformation(transformation_matrix, mean_vector)
# Normalize(mean, std, inplace=False)
# Resize(size, interpolation=2)

# Keep the pipeline under its own name: assigning it to `transforms` would
# shadow the torchvision `transforms` module and break re-running this cell.
sequential_transforms = torch.nn.Sequential(
    transforms.CenterCrop(10),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
)
# TorchScript-compiled version of the pipeline above
scripted_transforms = torch.jit.script(sequential_transforms)

In [None]:
from torchvision import transforms

# Per-frame pipeline: PIL image -> tensor, random horizontal flip (p=0.2),
# resize to 224x224, then convert the dtype to float.
_video_transform_steps = [
    transforms.PILToTensor(),
    transforms.RandomHorizontalFlip(p=0.2),
    transforms.Resize((224, 224)),
    transforms.ConvertImageDtype(torch.float),
]
video_transforms = transforms.Compose(_video_transform_steps)

# TODO:(Pranav, Kastan) Instantiate Dataset

In [None]:
from torch.utils.data import DataLoader

# NOTE(review): VPT_Dataset.__init__ in this notebook takes
# (csv_file, root_dir, transform=None); it has no `epoch_size` parameter and
# `root_dir` is not passed here, so this call still needs adapting.
dataset = VPT_Dataset("./dataset", epoch_size=None, transform=video_transforms)


# TODO: This is just inspiration from the docs, pytorch datasets. THIS DOESN'T WORK yet
loader = DataLoader(dataset, batch_size=12)
# Per-sample metadata collected across all batches
data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
# assumes every batch is a dict with 'path', 'start', 'end' and 'video' entries — TODO confirm
for batch in loader:
    for i in range(len(batch['path'])):
        data['video'].append(batch['path'][i])
        data['start'].append(batch['start'][i].item())
        data['end'].append(batch['end'][i].item())
        data['tensorsize'].append(batch['video'][i].size())
print(data)

# Data Preprocessing Class

In [None]:
#Testing captionpreprocessing
process = CaptionPreprocessing()
process.load_mp4_to_wav("/home/kastan/thesis/video-pretrained-transformer/data_preprocessing/test_4.mp4")

In [None]:
path = "/home/kastan/thesis/video-pretrained-transformer/data_preprocessing/test_4.wav"
# Directory containing the .wav file; named `wav_dir` so the builtin `dir` is not shadowed.
wav_dir = os.path.dirname(path)


In [None]:
# Show the thresholded caption segments; process_video indexes each one with 'start' and 'end'.
process.get_segments_thresholded()

In [None]:
# Testing videoreader: display every decoded frame of one video
import skvideo.io
video_name = '/mnt/storage_hdd/thesis/yt_1b_dataset/yt_1b_train/parallel_0/zoyplKPGXr8_Bill Shuttic_546_Help For Stroke 10x10x10 Challenge.webm'
video_reader = skvideo.io.FFmpegReader(video_name)

try:
    for frame in video_reader.nextFrame():
        display(frame)
finally:
    # Close the reader even if the cell is interrupted part-way through
    video_reader.close()

In [7]:
# test cv2 videoloader: open the video and read its frame rate
import cv2
print(video_name)
cap = cv2.VideoCapture(video_name)
try:
    fps = cap.get(cv2.CAP_PROP_FPS)
finally:
    # Release the capture so the file handle is not kept open by the kernel
    cap.release()

/mnt/storage_hdd/thesis/yt_1b_dataset/yt_1b_train/parallel_0/zoyplKPGXr8_Bill Shuttic_546_Help For Stroke 10x10x10 Challenge.webm


TypeError: 'cv2.VideoCapture' object is not iterable