In [1]:
import torch
from torch import nn
from config import IMAGE_TRANSFORM, VIDEO_TRANSFORM
from torch.utils.data import Dataset, DataLoader
import PIL
from PIL import Image
import os
from preprocessing import uniform_crop, Spacial_Crop, get_clip_timepoints
import matplotlib.pyplot as plt
import clip
from typing import Tuple

from pytorchvideo.data.clip_sampling import ConstantClipsPerVideoSampler
from pytorchvideo import transforms as pv_transforms
from pytorchvideo.data.encoded_video import EncodedVideo
import av

import pexelsapi
from pexelsapi.pexels import Pexels
import dotenv
import requests
import cv2
import numpy as np
from tqdm import tqdm



In [2]:
def load_and_transform_text(texts, device="cpu"):
    if texts is None: return None
    tokens = [clip.tokenize(text).unsqueeze(0).to(device) for text in texts]
    tokens = torch.cat(tokens, dim=0)
    tokens = tokens.squeeze(dim = 1)
    return tokens

In [3]:
class ImageText_DataLoader(Dataset):
    def __init__(self, image_paths, transform, device = "cpu"):
        super().__init__()
        
        if image_paths is None: return None
        
        self.image_outputs = []
        self.text_outputs = []

        for image_path in os.listdir(image_paths):
            if image_path.endswith(".jpg") or image_path.endswith(".jpeg") or image_path.endswith(".png"):
                with open(os.path.join(image_paths, image_path), "rb") as im:
                    image = Image.open(im).convert("RGB")
                    image = transform(img = image)
                    self.image_outputs.append(image)
                    self.text_outputs.append(image_path.split('.')[0])
        self.text_outputs = load_and_transform_text(self.text_outputs)
        # return torch.stack(image_outputs, dim=0)

    def __len__(self):
        return len(self.image_outputs) or len(self.text_outputs)
    
    def __getitem__(self, index):
        return (self.image_outputs[index], self.text_outputs[index])

ds = ImageText_DataLoader(image_paths = "Data/Image_Data", transform = IMAGE_TRANSFORM)
dl = DataLoader(ds, batch_size=5)
next(iter(dl))[1].shape

torch.Size([5, 77])

In [94]:
class VideoText_DataLoader(Dataset):
    def __init__(self, video_paths: str, transform, clip_duration=2, clips_per_video=5, device = "cpu"):
        super().__init__()

        self.video_outputs = []
        self.text_outputs = []
        
        video_transform = transform

        clip_sampler = ConstantClipsPerVideoSampler(
            clip_duration=clip_duration, clips_per_video=clips_per_video, 
        )

        frame_sampler = pv_transforms.UniformTemporalSubsample(num_samples=2)
        for video_path in os.listdir(video_paths): 
            if video_path.endswith(".mp4"):
                encoded_video = EncodedVideo.from_path(
                    file_path=f"Data/Video_Data/{video_path}",
                    decode_audio=False,
                    decoder="pyav"
                )

                all_frames = []

                clip_time_points = get_clip_timepoints(clip_sampler=clip_sampler, duration=encoded_video.duration)

                for clip_time_point in clip_time_points:
                    clip = encoded_video.get_clip(clip_time_point[0], clip_time_point[1])
                    if clip is None: ValueError("No Clip Found") 
                    frames = frame_sampler(clip["video"]) / 255.0

                    all_frames.append(frames)
                all_videos = [video_transform(frame) for frame in all_frames]
                self.video_outputs.append(Spacial_Crop(crop_size = 224, num_crops = 3)(all_videos))
                self.text_outputs.append(video_path.split('.')[0])
        self.text_outputs = load_and_transform_text(self.text_outputs)

    def __len__(self):
        return len(self.video_outputs)
    
    def __getitem__(self, index):
        return self.video_outputs[index], self.text_outputs[index]

ds = VideoText_DataLoader(video_paths = "Data/Video_Data", transform = VIDEO_TRANSFORM)
dl = DataLoader(ds, batch_size=5)
next(iter(dl))[0].shape

torch.Size([5, 15, 3, 2, 224, 224])

In [95]:
next(iter(dl))[1].shape

torch.Size([5, 77])

In [96]:
PEXEL = dotenv.dotenv_values()['PEXEL']

pexel = Pexels(PEXEL)
search_videos = pexel.search_videos(query='ocean', orientation='', size='', color='', locale='', page=1, per_page=15)
search_videos

{'page': 1,
 'per_page': 15,
 'videos': [{'id': 1918465,
   'width': 3840,
   'height': 2160,
   'duration': 15,
   'full_res': None,
   'tags': [],
   'url': 'https://www.pexels.com/video/bird-s-eye-view-of-ocean-waves-1918465/',
   'image': 'https://images.pexels.com/videos/1918465/free-video-1918465.jpg?auto=compress&cs=tinysrgb&fit=crop&h=630&w=1200',
   'avg_color': None,
   'user': {'id': 574687,
    'name': 'Ruvim Miksanskiy',
    'url': 'https://www.pexels.com/@digitech'},
   'video_files': [{'id': 9228888,
     'quality': 'uhd',
     'file_type': 'video/mp4',
     'width': 3840,
     'height': 2160,
     'fps': 23.979999542236328,
     'link': 'https://videos.pexels.com/video-files/1918465/1918465-uhd_3840_2160_24fps.mp4',
     'size': 48292630},
    {'id': 9228993,
     'quality': 'sd',
     'file_type': 'video/mp4',
     'width': 960,
     'height': 540,
     'fps': 23.979999542236328,
     'link': 'https://videos.pexels.com/video-files/1918465/1918465-sd_960_540_24fps.mp4',

In [89]:
def download_direct_video(video_url, output_path="downloaded_video.mp4"):
    response = requests.get(video_url, stream=True)
    total = int(response.headers.get('content-length', 0))
    with open(output_path, 'wb') as file, tqdm(
        desc="Downloading video",
        total=total,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)
    return output_path

def resize_with_pad(image: np.array, 
                    new_shape: Tuple[int, int], 
                    padding_color: Tuple[int] = (0, 0, 0)) -> np.array:
    original_shape = (image.shape[1], image.shape[0])
    ratio = float(max(new_shape))/max(original_shape)
    new_size = tuple([int(x*ratio) for x in original_shape])
    image = cv2.resize(image, new_size)
    delta_w = new_shape[0] - new_size[0]
    delta_h = new_shape[1] - new_size[1]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=padding_color)
    return image

def pad_to_square(frame):
    h, w, _ = frame.shape
    size = max(h, w)
    top = (size - h) // 2
    bottom = size - h - top
    left = (size - w) // 2
    right = size - w - left
    padded = cv2.copyMakeBorder(frame, top, bottom, left, right,
                                 cv2.BORDER_CONSTANT, value=[0, 0, 0])  # black padding
    return padded

def process_and_save_video(input_path, output_path_video, output_path_image, name, new_shape=(256, 256), fps=30):
    cap = cv2.VideoCapture(input_path)
    output_path_video = output_path_video + name + '.mp4'

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Use 'XVID' or 'avc1' for AVI/MOV

    out = cv2.VideoWriter(output_path_video, fourcc, fps, new_shape)
    
    frame_count = 0
    frame_flag = False
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        processed = resize_with_pad(frame, new_shape)
        out.write(processed)
        if not frame_flag: plt.imsave(output_path_image + name + '.jpg', cv2.resize(pad_to_square(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)), (256, 256)))
        frame_flag = True
        if frame_count == 180: break
        frame_count += 1

    cap.release()
    out.release()
    print(f"Saved processed video to {output_path_video}")
    os.remove(input_path)

queries = ["cat", "dog", "ocean", "city", "sunset", "baseball"]

for query in queries:
    search_videos = pexel.search_videos(query=query, orientation='', size='', color='', locale='', page=1, per_page=5)

    for idx in range(len(search_videos['videos'])):
        video_info = search_videos['videos'][idx]
        video_url = video_info['video_files'][-1]['link']
        video_path = download_direct_video(video_url)

        url = video_info['url']
        name = ' '.join(url.split('/')[-2].split('-')[:-1])

        process_and_save_video(video_path, output_path_video = f"Data/Video_Data/", output_path_image = "Data/Image_Data/", name = name)

Downloading video: 100%|██████████| 3.77M/3.77M [00:00<00:00, 21.9MB/s]


Saved processed video to Data/Video_Data/video of funny cat.mp4


Downloading video: 100%|██████████| 3.39M/3.39M [00:00<00:00, 20.7MB/s]


Saved processed video to Data/Video_Data/video of a tabby cat.mp4


Downloading video: 100%|██████████| 2.74M/2.74M [00:00<00:00, 20.2MB/s]


Saved processed video to Data/Video_Data/little kitten playing his toy mouse.mp4


Downloading video: 100%|██████████| 5.32M/5.32M [00:00<00:00, 20.9MB/s]


Saved processed video to Data/Video_Data/cute cat.mp4


Downloading video: 100%|██████████| 905k/905k [00:00<00:00, 19.0MB/s]


Saved processed video to Data/Video_Data/the full facial features of a pet cat.mp4


Downloading video: 100%|██████████| 545k/545k [00:00<00:00, 17.4MB/s]


Saved processed video to Data/Video_Data/boy playing with his dog.mp4


Downloading video: 100%|██████████| 11.3M/11.3M [00:00<00:00, 21.4MB/s]


Saved processed video to Data/Video_Data/a dog fights with his reflection in the mirror.mp4


Downloading video: 100%|██████████| 2.40M/2.40M [00:00<00:00, 19.4MB/s]


Saved processed video to Data/Video_Data/dog eating.mp4


Downloading video: 100%|██████████| 4.67M/4.67M [00:00<00:00, 19.3MB/s]


Saved processed video to Data/Video_Data/brown dog relaxing on a sofa.mp4


Downloading video: 100%|██████████| 9.07M/9.07M [00:00<00:00, 21.4MB/s]


Saved processed video to Data/Video_Data/close up of a brown and white pet dog.mp4


Downloading video: 100%|██████████| 4.60M/4.60M [00:00<00:00, 20.9MB/s]


Saved processed video to Data/Video_Data/bird s eye view of ocean waves.mp4


Downloading video: 100%|██████████| 1.64M/1.64M [00:00<00:00, 20.4MB/s]


Saved processed video to Data/Video_Data/waves crashing.mp4


Downloading video: 100%|██████████| 142M/142M [00:07<00:00, 21.2MB/s] 


Saved processed video to Data/Video_Data/waves rushing to the shore.mp4


Downloading video: 100%|██████████| 2.03M/2.03M [00:00<00:00, 34.7MB/s]


Saved processed video to Data/Video_Data/view of the horizon.mp4


Downloading video: 100%|██████████| 523k/523k [00:00<00:00, 18.7MB/s]


Saved processed video to Data/Video_Data/big waves in an ocean.mp4


Downloading video: 100%|██████████| 25.2M/25.2M [00:00<00:00, 28.6MB/s]


Saved processed video to Data/Video_Data/traffic on an intersection road in a city.mp4


Downloading video: 100%|██████████| 359k/359k [00:00<00:00, 25.3MB/s]


Saved processed video to Data/Video_Data/view of city in timelapse mode.mp4


Downloading video: 100%|██████████| 1.72M/1.72M [00:00<00:00, 19.7MB/s]


Saved processed video to Data/Video_Data/aeerial view of city with tall buildings.mp4


Downloading video: 100%|██████████| 4.95M/4.95M [00:00<00:00, 28.8MB/s]


Saved processed video to Data/Video_Data/clouds moving through and above the city.mp4


Downloading video: 100%|██████████| 30.1M/30.1M [00:01<00:00, 27.3MB/s]


Saved processed video to Data/Video_Data/view of the city at dusk.mp4


Downloading video: 100%|██████████| 11.5M/11.5M [00:00<00:00, 28.4MB/s]


Saved processed video to Data/Video_Data/beach waves and sunset.mp4


Downloading video: 100%|██████████| 388k/388k [00:00<00:00, 9.71MB/s]


Saved processed video to Data/Video_Data/sunset.mp4


Downloading video: 100%|██████████| 1.46M/1.46M [00:00<00:00, 38.8MB/s]


Saved processed video to Data/Video_Data/wheat with view of sunset.mp4


Downloading video: 100%|██████████| 1.50M/1.50M [00:00<00:00, 36.2MB/s]


Saved processed video to Data/Video_Data/ocean sunset view.mp4


Downloading video: 100%|██████████| 1.75M/1.75M [00:00<00:00, 32.7MB/s]


Saved processed video to Data/Video_Data/ground level footage of waves breaking on the shore with the sun setting in the horizon.mp4


Downloading video: 100%|██████████| 6.90M/6.90M [00:00<00:00, 24.4MB/s]


Saved processed video to Data/Video_Data/a baseball game in a stadium.mp4


Downloading video: 100%|██████████| 15.5M/15.5M [00:00<00:00, 28.7MB/s]


Saved processed video to Data/Video_Data/close up view of person playing with baseball.mp4


Downloading video: 100%|██████████| 2.14M/2.14M [00:00<00:00, 36.4MB/s]


Saved processed video to Data/Video_Data/aerial view of a baseball field.mp4


Downloading video: 100%|██████████| 608k/608k [00:00<00:00, 30.0MB/s]


Saved processed video to Data/Video_Data/baseball player hitting the baseball.mp4


Downloading video: 100%|██████████| 694k/694k [00:00<00:00, 28.7MB/s]


Saved processed video to Data/Video_Data/aerial view of a baseball field.mp4
