<a href="https://colab.research.google.com/github/Kavya-sri-05/genai/blob/main/youtube_summariser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install youtube-transcript-api langchain faiss-cpu sentence-transformers google-generativeai requests
import os
import re
import time
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Import google.generativeai correctly
import google.generativeai as genai

# Configure the Gemini client.
# The key is taken from the GOOGLE_API_KEY environment variable; when that is
# unset the user is prompted for it, so no credential is stored in the notebook.
# NOTE: any key that was ever committed in this file must be treated as
# compromised and revoked in the Google AI Studio / Cloud console.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms include ``youtu.be/<id>``, ``youtube.com/watch?v=<id>``,
    ``youtube.com/embed/<id>``, ``youtube.com/v/<id>`` and, additionally,
    ``youtube.com/shorts/<id>`` and ``youtube.com/live/<id>``.

    Args:
        url: The URL (or any text containing one) to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string
        or no ID can be found in it.
    """
    # re.search raises TypeError on None / non-string input; report "no ID"
    # instead so callers can handle it like any other invalid URL.
    if not isinstance(url, str):
        return None
    match = re.search(
        r'(?:youtu\.be\/|youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?|shorts|live)\/|\S*?[?&]v=))([a-zA-Z0-9_-]{11})',
        url,
    )
    return match.group(1) if match else None

def get_video_title(video_id, timeout=10):
    """Look up a YouTube video's title via the oEmbed endpoint.

    This is best-effort: any failure is reported on stdout and a placeholder
    title is returned instead of raising.

    Args:
        video_id: The YouTube video ID.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The 'title' field of the oEmbed JSON response, or "Unknown Title"
        when the request fails, the status is not 200, or the field is absent.
    """
    try:
        # Let requests build the query string so the nested watch URL (which
        # itself contains '?' and '=') is percent-encoded correctly.
        response = requests.get(
            "https://www.youtube.com/oembed",
            params={
                "url": f"http://www.youtube.com/watch?v={video_id}",
                "format": "json",
            },
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json().get('title', 'Unknown Title')
    except Exception as e:
        # Deliberately broad: the title is optional metadata only.
        print(f"Error fetching video title: {e}")
    return "Unknown Title"

def _fetch_transcript_entries(video_id, languages=('en',)):
    """Fetch transcript entries as a list of dicts that each carry a 'text' key.

    Older releases of youtube-transcript-api expose a static
    ``get_transcript()``; newer releases provide an instance-level ``fetch()``
    instead, whose result converts to the same list-of-dicts shape through
    ``to_raw_data()``. Both are supported so that an unpinned install works.
    """
    wanted = list(languages)
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        return YouTubeTranscriptApi.get_transcript(video_id, languages=wanted)
    return YouTubeTranscriptApi().fetch(video_id, languages=wanted).to_raw_data()


def load_and_process_transcript(video_url):
    """Download a video's English transcript and split it into chunks.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        A list of ``Document`` chunks (each carrying the source URL and video
        title as metadata), or ``None`` when the URL is invalid, no transcript
        is found, or any error occurs while processing. Progress and errors
        are reported on stdout.
    """
    video_id = extract_video_id(video_url)

    if not video_id:
        print("Invalid YouTube URL or could not extract video ID.")
        return None

    try:
        # Get the transcript through whichever API the installed library offers
        transcript_list = _fetch_transcript_entries(video_id, languages=['en'])

        if not transcript_list:
            print("Could not find transcript for this video.")
            return None

        # Get video title for additional context
        video_title = get_video_title(video_id)

        # Combine transcript entries into a single text
        full_transcript = " ".join(entry['text'] for entry in transcript_list)

        # Create a Document object with metadata
        doc = Document(
            page_content=full_transcript,
            metadata={"source": video_url, "title": video_title}
        )

        # Split transcript into overlapping chunks so retrieval keeps context
        # that straddles a chunk boundary
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ". ", " ", ""]
        )
        docs = text_splitter.split_documents([doc])
        print(f"Successfully processed transcript for '{video_title}'")
        print(f"Transcript split into {len(docs)} chunks.")
        return docs

    except Exception as e:
        # Deliberately broad: missing captions, private videos and network
        # failures are all reported the same way to the caller (None).
        print(f"Error processing video: {str(e)}")
        return None

def create_vector_db(docs):
    """Build a FAISS vector store over the given document chunks.

    The chunks are embedded with the
    "sentence-transformers/all-MiniLM-L6-v2" model, loaded on the CPU.

    Args:
        docs: Document chunks to index.

    Returns:
        The FAISS store built from ``docs``.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    return FAISS.from_documents(docs, embedding_model)

def get_response_from_gemini(query, context, max_retries=3, retry_delay=2):
    """Ask Gemini to answer a question from a transcript excerpt.

    Rate-limit errors (recognised by "TooManyRequests" or "429" in the error
    text) are retried with exponential backoff; any other error is returned
    immediately as an error message.

    Args:
        query: The user's question.
        context: Transcript text the answer must be based on.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the function always makes an attempt and returns a string.
        retry_delay: Base delay in seconds; doubled after each rate-limited
            attempt.

    Returns:
        The model's answer text, or a human-readable message describing why
        no answer could be produced. Never ``None``.
    """
    prompt_content = f"""
    You are a helpful assistant that answers questions about YouTube videos based on their transcript.

    Answer the following question using only the information provided in the transcript excerpt.
    If the transcript doesn't contain relevant information to answer the question, say "I don't have enough information to answer that question."

    Question: {query}

    Transcript excerpt:
    {context}

    Your answer:
    """

    # Initialize the Gemini model
    model = genai.GenerativeModel("gemini-1.5-pro-latest")

    # Always try at least once; otherwise the loop would be skipped and the
    # function would silently return None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            response = model.generate_content(prompt_content)
            return response.text
        except Exception as e:
            message = str(e)
            if "TooManyRequests" not in message and "429" not in message:
                return f"Error generating response: {message}"
            if attempt == attempts - 1:
                break  # out of retries; fall through to the apology below
            wait_time = retry_delay * (2 ** attempt)  # Exponential backoff
            print(f"Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    return "I'm sorry, I'm experiencing high demand right now. Please try again later."

def answer_question(video_url, question):
    """Answer a question about a YouTube video from its transcript.

    Pipeline: load and chunk the transcript, index the chunks in a vector
    store, retrieve the 4 chunks most similar to the question, and pass them
    to Gemini as context.

    Args:
        video_url: URL of the YouTube video.
        question: The question to answer.

    Returns:
        The generated answer, or a failure message when the transcript could
        not be processed.
    """
    chunks = load_and_process_transcript(video_url)
    if not chunks:
        return "Failed to process the video transcript. The video might not have available captions or might be private/restricted."

    # Index the chunks, then pull back the ones closest to the question.
    vector_store = create_vector_db(chunks)
    top_matches = vector_store.similarity_search(question, k=4)

    # Retrieved excerpts are separated by blank lines in the prompt context.
    excerpt = "\n\n".join(match.page_content for match in top_matches)

    return get_response_from_gemini(question, excerpt)

# Example usage: run the full pipeline on one video and print the answer.
if __name__ == "__main__":
    # Try with a video known to have English subtitles
    video_url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'  # Rick Astley - Never Gonna Give You Up (known to have captions)

    # A broad question; the answer is built only from the retrieved transcript chunks.
    question = "Can you summarize the video?"

    print(f"Processing video: {video_url}")
    # answer_question returns a string in both the success and the failure case.
    answer = answer_question(video_url, question)
    print("\nQuestion:", question)
    print("\nAnswer:", answer)

Processing video: https://www.youtube.com/watch?v=dQw4w9WgXcQ
Successfully processed transcript for 'Rick Astley - Never Gonna Give You Up (Official Music Video)'
Transcript split into 3 chunks.


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Question: Can you summarize the video?

Answer: The singer expresses their feelings for someone they've known for a long time. They acknowledge unspoken feelings between them and declare they will never give up, let down, run around, desert, make cry, say goodbye to, or lie to and hurt this person.  The lyrics repeat the lines "Never gonna give you up, never gonna let you down..." throughout the song, emphasizing the singer's commitment. They also allude to a game both are playing and urge the other person to be honest about their feelings.



In [26]:
# First, uninstall any existing versions to avoid conflicts
!pip uninstall -y google-generativeai
!pip uninstall -y genai

# Install the correct package
!pip install --upgrade google-generativeai

Found existing installation: google-generativeai 0.8.4
Uninstalling google-generativeai-0.8.4:
  Successfully uninstalled google-generativeai-0.8.4
[0mCollecting google-generativeai
  Downloading google_generativeai-0.8.4-py3-none-any.whl.metadata (4.2 kB)
Downloading google_generativeai-0.8.4-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-generativeai
Successfully installed google-generativeai-0.8.4
