In [1]:
!pip install yt-dlp ffmpeg-python

Collecting yt-dlp
  Downloading yt_dlp-2024.11.18-py3-none-any.whl.metadata (172 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/172.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading yt_dlp-2024.11.18-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: yt-dlp, ffmpeg-python
Successfully installed ffmpeg-python-0.2.0 yt-dlp-2024.11.18


In [2]:
import cv2
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
from PIL import Image
import subprocess
import ffmpeg
import json
import sys
import time
from googleapiclient.discovery import build
from urllib.parse import urlparse, parse_qs
from google.colab import userdata
from typing import Dict, List, Tuple

## Top-N frame extraction: find the video frames that best match a text prompt

In [None]:
class StreamingVideoFrameExtractor:
    """Find the frames of a YouTube video that best match a text prompt.

    ``yt-dlp`` resolves a direct stream URL, ``ffmpeg`` decodes that stream to
    raw RGB frames on a pipe, and sampled frames are scored against the prompt
    with CLIP (cosine similarity between the text and image embeddings).
    """

    def __init__(self):
        """Load the CLIP model and processor; move the model to CUDA when available."""
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def get_video_stream(self, youtube_url):
        """Resolve a direct stream URL and basic metadata for a YouTube video.

        Runs ``yt-dlp`` as a subprocess, asking for the best format of at most
        720 pixels in height.

        Args:
            youtube_url: URL of the video to resolve.

        Returns:
            Tuple ``(stream_url, metadata)`` where ``metadata`` is a dict with
            the keys ``width``, ``height`` and ``fps``.

        Raises:
            Exception: if yt-dlp exits with a non-zero status, prints fewer
                lines than expected, or does not report the frame dimensions.
        """
        cmd = [
            'yt-dlp',
            '-f', 'best[height<=720]',
            '--get-url',
            '--print',
            # "|null" is yt-dlp's literal default for a missing field. Without
            # it a missing value is printed as NA, which is not valid JSON.
            '{"width": %(width|null)s, "height": %(height|null)s, "fps": %(fps|null)s}',
            youtube_url
        ]

        completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if completed.returncode != 0:
            error_text = completed.stderr.decode(errors='replace')
            raise Exception(f"Error getting video stream: {error_text}")

        lines = [line.strip()
                 for line in completed.stdout.decode(errors='replace').splitlines()
                 if line.strip()]
        if len(lines) < 2:
            raise Exception(f"Unexpected yt-dlp output (expected metadata and URL): {lines!r}")

        # The metadata line is read first and the stream URL second.
        metadata = json.loads(lines[0])
        stream_url = lines[1]

        # The raw-frame reader needs both dimensions to size its reads.
        if not metadata.get('width') or not metadata.get('height'):
            raise Exception(f"yt-dlp did not report the frame size: {metadata!r}")

        return stream_url, metadata

    def create_ffmpeg_pipe(self, stream_url):
        """Start ffmpeg decoding ``stream_url`` to raw RGB24 frames on its stdout.

        Returns:
            The running ffmpeg subprocess; ``stdout`` carries the frame bytes.
        """
        process = (
            ffmpeg
            .input(stream_url)
            .output('pipe:', format='rawvideo', pix_fmt='rgb24')
            # stderr is piped but never read while frames are consumed, so keep
            # ffmpeg quiet: its periodic progress stats could otherwise fill
            # the pipe buffer and stall decoding.
            .global_args('-loglevel', 'error', '-nostats')
            .overwrite_output()
            .run_async(pipe_stdout=True, pipe_stderr=True)
        )
        return process

    def update_top_frames(self, top_frames: Dict[int, Tuple[np.ndarray, float]],
                         frame: np.ndarray, score: float, frame_num: int,
                         max_frames: int = 3) -> Dict[int, Tuple[np.ndarray, float]]:
        """Insert a scored frame and return the ``max_frames`` best entries.

        Args:
            top_frames: Current mapping ``{frame_number: (frame, score)}``.
                The new entry is also added to this dict in place.
            frame: Frame pixels; a copy is stored.
            score: Similarity score of the frame.
            frame_num: Key under which the frame is stored.
            max_frames: Number of entries to keep.

        Returns:
            A new dict holding the highest-scoring entries, ordered from the
            highest score to the lowest.
        """
        # Add new frame
        top_frames[frame_num] = (frame.copy(), score)

        # Sort by score and keep only the top max_frames
        ranked = sorted(top_frames.items(),
                        key=lambda entry: entry[1][1],  # Sort by score
                        reverse=True)
        return dict(ranked[:max_frames])

    @staticmethod
    def _iter_sampled_frames(stream, frame_size, sample_rate):
        """Yield ``(frame_number, frame_bytes)`` for one frame per chunk read.

        Reads up to ``sample_rate`` frames at a time from ``stream`` and yields
        the last complete frame of each chunk. ``frame_number`` counts the
        complete frames read so far, so a short final chunk is numbered by the
        frames it really contains. Trailing bytes that do not form a complete
        frame are ignored.
        """
        chunk_size = frame_size * sample_rate
        frames_seen = 0
        while True:
            chunk = stream.read(chunk_size)
            complete = len(chunk) // frame_size
            if complete == 0:
                # End of stream, or only a truncated partial frame is left.
                break
            frames_seen += complete
            end = complete * frame_size
            yield frames_seen, chunk[end - frame_size:end]

    def extract_top_frames_from_stream(self, youtube_url, text_prompt, max_frames=3, sample_rate=30):
        """Score sampled frames of a video against a prompt and return the best ones.

        Args:
            youtube_url: URL of the video to process.
            text_prompt: Text the frames are compared with.
            max_frames: Number of top-scoring frames to return.
            sample_rate: One frame is scored for every ``sample_rate`` frames
                decoded.

        Returns:
            List of ``(frame, score, frame_number)`` tuples sorted by score,
            highest first.

        Raises:
            ValueError: if ``sample_rate`` is smaller than 1.
            Exception: if resolving, decoding or scoring the stream fails; the
                original error is attached as the cause.
        """
        if sample_rate < 1:
            raise ValueError("sample_rate must be at least 1")

        try:
            # Get video stream URL and metadata
            stream_url, metadata = self.get_video_stream(youtube_url)
            width = metadata['width']
            height = metadata['height']
            frame_size = width * height * 3  # bytes per rgb24 frame

            # Gradients are never needed here; no_grad avoids building an
            # autograd graph for every scored frame.
            with torch.no_grad():
                # Encode the text prompt before ffmpeg starts, so a failure
                # here cannot leave a decoder process running.
                text_inputs = self.processor(
                    text=text_prompt,
                    return_tensors="pt",
                    padding=True
                ).to(self.device)
                text_features = self.model.get_text_features(**text_inputs)

                # Create FFmpeg process
                process = self.create_ffmpeg_pipe(stream_url)

                # Dictionary to store top frames: {frame_number: (frame, score)}
                top_frames = {}

                try:
                    sampled = self._iter_sampled_frames(process.stdout, frame_size, sample_rate)
                    for frame_num, frame_bytes in sampled:
                        # Convert raw bytes to numpy array
                        frame = np.frombuffer(frame_bytes, np.uint8)
                        frame = frame.reshape([height, width, 3])

                        # Process frame with CLIP
                        image = Image.fromarray(frame)
                        image_inputs = self.processor(
                            images=image,
                            return_tensors="pt",
                            padding=True
                        ).to(self.device)
                        image_features = self.model.get_image_features(**image_inputs)

                        # Calculate similarity score
                        similarity = torch.nn.functional.cosine_similarity(
                            text_features, image_features
                        ).item()

                        # Update top frames
                        top_frames = self.update_top_frames(
                            top_frames, frame, similarity, frame_num, max_frames
                        )
                finally:
                    process.stdout.close()
                    process.stderr.close()
                    # If the loop stopped early ffmpeg may still be running;
                    # stop it so wait() cannot hang.
                    if process.poll() is None:
                        process.terminate()
                    process.wait()

            # Convert to list of (frame, score, frame_number)
            results = [(frame, score, frame_num)
                      for frame_num, (frame, score) in top_frames.items()]
            return sorted(results, key=lambda x: x[1], reverse=True)

        except Exception as e:
            raise Exception(f"Error processing video stream: {str(e)}") from e

    def save_frames(self, frames, base_output_path, video_index):
        """Write frames to JPEG files and return the paths that were written.

        Args:
            frames: Iterable of ``(frame, score, frame_number)`` tuples with
                RGB frames.
            base_output_path: Prefix of every output file name.
            video_index: Index of the video, embedded in the file name.

        Returns:
            List of paths of the files that were written successfully.
        """
        saved_paths = []
        if not frames:
            print("No frames to save.")
            return saved_paths

        print("Frames extracted successfully!")
        for i, (frame, score, frame_number) in enumerate(frames):
            output_path = f"{base_output_path}_video-{video_index}_top-{i}.jpg"
            # OpenCV writes images in BGR channel order.
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            # imwrite reports failure through its return value, not an exception.
            if cv2.imwrite(output_path, frame_bgr):
                saved_paths.append(output_path)
            else:
                print(f"Warning: could not write {output_path}")
        return saved_paths


def search_youtube_videos(query, api_key, max_results=5):
    """
    Search for YouTube videos matching a query and return their titles and URLs.

    Parameters:
    query (str): Search term
    api_key (str): YouTube Data API key
    max_results (int): Maximum number of results to request (default: 5)

    Returns:
    list: One dictionary per video with the keys 'title' and 'url'.
          An empty list is returned when the search request fails.
    """
    # Local import keeps this function self-contained.
    import html

    # Initialize YouTube API client
    youtube = build('youtube', 'v3', developerKey=api_key)
    try:
        # Perform the search
        search_response = youtube.search().list(
            q=query,
            part='id,snippet',
            maxResults=max_results,
            type='video',  # Only search for videos, not playlists or channels
            order='relevance'    # Specify the order of results
        ).execute()

        # Process results
        videos = []
        for item in search_response.get('items', []):
            if item['id']['kind'] != 'youtube#video':
                continue
            video_id = item['id']['videoId']
            videos.append({
                # Titles arrive HTML-escaped (e.g. "&#39;" for an apostrophe).
                'title': html.unescape(item['snippet']['title']),
                'url': f'https://www.youtube.com/watch?v={video_id}',
            })

        return videos

    except Exception as e:
        # Best effort: report the problem and let the caller see "no results".
        print(f"An error occurred: {str(e)}")
        return []

In [33]:
def main(food='lat phat thoke', max_results=15, max_frames=3,
         text_prompt="A person with the food"):
    """Search YouTube for mukbang videos of a dish and save the best-matching frames.

    Args:
        food: Dish name appended to the search query.
        max_results: Maximum number of videos to request from the search.
        max_frames: Number of top-scoring frames to save per video.
        text_prompt: Text the video frames are compared with.
    """
    search_query = f'Burmese mukbang {food}'
    output_path = search_query.replace(" ", "_")

    # Retrieve the API key from the Colab secret store. The lookup can raise
    # (for example when the secret is missing), so treat any failure as "no key".
    try:
        api_key = userdata.get('YOUTUBE_API_KEY')
    except Exception as e:
        print(f"Could not read the YOUTUBE_API_KEY secret: {e}")
        api_key = None
    if not api_key:
        print("Please set your YouTube API key as the YOUTUBE_API_KEY secret in Colab")
        return

    # Retrieve up to max_results YouTube video URLs for the query
    search_results = search_youtube_videos(search_query, api_key, max_results)
    if not search_results:
        print("No matching videos found")
        return

    # Load the model only once there is something to process.
    extractor = StreamingVideoFrameExtractor()

    # Extract the top max_frames frames from each video
    for i, video in enumerate(search_results):
        print(f"Processing video {i+1}/{len(search_results)}")
        print(f"Title: {video['title']} | url: {video['url']}")
        try:
            results = extractor.extract_top_frames_from_stream(video['url'], text_prompt, max_frames)
        except Exception as e:
            # One unavailable or broken video must not abort the whole batch.
            print(f"Skipping video {i+1}: {e}")
            continue
        extractor.save_frames(results, output_path, i)


main()

Processing video 1/10
Title: BURMESE TRADITIONAL FOOD||Burmese Fermented Tea Leaf Salad * Myanmar Lahpet Thoke * | url: https://www.youtube.com/watch?v=TbHMxB3yE1M
Resolution: 640x360
Frame extracted successfully!
Frame number: 18510 | Confidence score: 0.2897
Frame number: 22260 | Confidence score: 0.2880
Frame number: 11970 | Confidence score: 0.2878
Processing video 2/10
Title: Lahpet Thoke - Eating BURMESE TEA LEAF Salad on the Streets of Yangon, Myanmar! | url: https://www.youtube.com/watch?v=Al1lmhbKK0U
Resolution: 640x360
Frame extracted successfully!
Frame number: 1620 | Confidence score: 0.2890
Frame number: 990 | Confidence score: 0.2847
Frame number: 5970 | Confidence score: 0.2837
Processing video 3/10
Title: Chickpea Tofu and Pickled Tea Leaves: Myanmar&#39;s Unreal Cuisine, and Why It&#39;s So Hard to Find | url: https://www.youtube.com/watch?v=tuoNvzSltmI
Resolution: 640x360
Frame extracted successfully!
Frame number: 7710 | Confidence score: 0.2995
Frame number: 12810 |