In [4]:
import os
import re
from googleapiclient.discovery import build
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv("YouTube_API_KEY")

def get_video_metadata(api_key, video_id):
    """
    Retrieves metadata for a given YouTube video ID.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)
    
    # Request to get video details
    request = youtube.videos().list(
        part='snippet',
        id=video_id
    )
    
    # Execute the request
    response = request.execute()
    
    # Check if the response contains items
    if 'items' in response and len(response['items']) > 0:
        return response['items'][0]  # Return the metadata of the first item (video)
    else:
        return None  # No video found

def extract_timecodes_and_descriptions(description):
    """
    Extracts timecodes and their associated descriptions from the video description using regex.
    """
    # Regex pattern to match timecodes in the format MM:SS or HH:MM:SS
    timecode_pattern = r"(\d{1,2}:\d{2}(?::\d{2})?)\s+(.+)"
    
    # Find all matches
    matches = re.findall(timecode_pattern, description)
    
    # Extracted timecodes and descriptions
    timecodes = [(match[0], match[1].strip()) for match in matches]
    
    return timecodes

def create_timestamp_dicts(video_id, video_metadata, timecodes):
    """
    Creates a list of dictionaries for each timecode, including title, timecode, text, description, and link.
    """
    base_url = f"https://www.youtube.com/watch?v={video_id}"
    
    timestamp_dicts = []

    for time_str, text in timecodes:
        # Convert timecode to seconds
        parts = time_str.split(":")
        if len(parts) == 2:
            minutes, seconds = map(int, parts)
            time_in_seconds = minutes * 60 + seconds
        elif len(parts) == 3:
            hours, minutes, seconds = map(int, parts)
            time_in_seconds = hours * 3600 + minutes * 60 + seconds
        else:
            continue  # Skip if timecode format is not recognized
        
        # Create a clickable link
        link = f"{base_url}&t={time_in_seconds}s"
        print(text)
        # Create a dictionary for each timestamp
        timestamp_dict = {
            'title': video_metadata['title'],
            'timecode': time_str,
            'text': text,
            'description': video_metadata['description'].split('\n\n')[0],
            'link': link
        }
        
        timestamp_dicts.append(timestamp_dict)
    
    return timestamp_dicts

def get_video_info_and_timestamps(api_key, video_id):
    """
    Main function that retrieves video metadata and creates timestamp dictionaries.
    """
    # Fetch video metadata
    video_metadata_response = get_video_metadata(api_key, video_id)
    
    if not video_metadata_response:
        print("No video found or API error occurred.")
        return []
    
    # Extract relevant metadata
    video_metadata = {
        'title': video_metadata_response['snippet']['title'],
        'description': video_metadata_response['snippet']['description']
    }

    # Extract timecodes and descriptions dynamically from the video description
    timecodes = extract_timecodes_and_descriptions(video_metadata['description'])
    
    # Generate dictionaries for each timecode
    timestamp_dicts = create_timestamp_dicts(video_id, video_metadata, timecodes)
    
    return timestamp_dicts

# Example usage
api_key = API_KEY  # Replace with your actual API key
video_id = 'Q75JgLEXMsM'  # Replace with your actual video ID

# Get video information and timestamp dictionaries
timestamp_dicts = get_video_info_and_timestamps(api_key, video_id)

# Print the result
# for timestamp_dict in timestamp_dicts:
#     print(timestamp_dict)


Introduction to LLM Zoomcamp
Understanding LLMs
Exploring RAG


In [9]:
timestamp_dicts

[{'title': 'LLM Zoomcamp 1.1 - Introduction to LLM and RAG',
  'timecode': '00:00',
  'text': 'Introduction to LLM Zoomcamp',
  'description': "Welcome to the first module of our course, LLM Zoomcamp! We cover the applications of LLM, focusing on RAG: retrieval augmented generation. Throughout the course, we will build a Q&A system using the FAQ data from our courses. We don't cover the theory behind LLMs, but we will learn how to utilize them effectively.",
  'link': 'https://www.youtube.com/watch?v=Q75JgLEXMsM&t=0s'},
 {'title': 'LLM Zoomcamp 1.1 - Introduction to LLM and RAG',
  'timecode': '04:03',
  'text': 'Understanding LLMs',
  'description': "Welcome to the first module of our course, LLM Zoomcamp! We cover the applications of LLM, focusing on RAG: retrieval augmented generation. Throughout the course, we will build a Q&A system using the FAQ data from our courses. We don't cover the theory behind LLMs, but we will learn how to utilize them effectively.",
  'link': 'https://ww

In [6]:
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

  from tqdm.autonotebook import tqdm, trange
2024-08-30 09:17:56.824141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-30 09:17:56.932850: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-30 09:17:56.933464: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-30 09:17:57.113091: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
