In [16]:
import re
import requests
import logging
from youtube_transcript_api import YouTubeTranscriptApi
from rank_bm25 import BM25Okapi
import googleapiclient.discovery
import googleapiclient.errors
import google.generativeai as genai
from sentence_transformers import SentenceTransformer, util

In [17]:
# Initialize Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [18]:
import os
import dotenv
from dotenv import load_dotenv

In [19]:
# Your API key and Programmable Search Engine ID
GOOGLE_API_KEY = os.getenv('GOOGLE_CUSTOM_SEARCH_KEY')
SEARCH_ENGINE_ID = os.getenv('CUSTOM_SEARCH_ENGINE_ID')

print('key', GOOGLE_API_KEY, 'csid:', SEARCH_ENGINE_ID)

key AIzaSyCAV73EKedKhVm3Vslz389wY6_OB1z2aw0 csid: 74a9c6ca4ecd7403c


In [20]:
# Configure the Google Generative AI
load_dotenv()
genai.configure(api_key=os.getenv('GOOGLE_GEMINI_API_KEY'))

In [21]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [22]:
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

2025-03-26 23:40:33,035 - INFO - Use pytorch device_name: cpu
2025-03-26 23:40:33,036 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [23]:
# 1. Extract YouTube Video ID & Transcript
def extract_video_id(url):
    match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
    return match.group(1) if match else None

def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(entry['text'] for entry in transcript)
        return re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        return f"Error: {e}"

In [24]:
# 2. Fetch Web Search Results
def google_search(query, num_results=5):
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'q': query,
        'key': GOOGLE_API_KEY,
        'cx': SEARCH_ENGINE_ID,
        'num': num_results
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an error for HTTP issues
        results = response.json()
        snippets = [item['snippet'] for item in results.get("items", [])]

        print('snippets:', snippets)

        logging.info("Web search results fetched successfully.")
        return snippets
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching search results: {e}")
        return []

In [25]:
# 3. BM25-Based Ranking
def rank_snippets(query, snippets):
    if not snippets:
        return []  # Avoid ZeroDivisionError
    tokenized_corpus = [snippet.split() for snippet in snippets]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(query.split())
    ranked_snippets = sorted(zip(snippets, scores), key=lambda x: x[1], reverse=True)
    logging.info("BM25 ranking completed.")
    return [snippet for snippet, _ in ranked_snippets[:3]]

In [26]:
# 4. LLM-Based Filtering

def rank_relevance(query, snippets):
    if not snippets:
        return []
    scored_snippets = []
    for snippet in snippets:
        prompt = f"On a scale of 1-10, how relevant is this snippet to '{query}'? Reply with ONLY the number:\n\n{snippet}"
        try:
            response = model.generate_content(prompt)
            raw_score = response.text.strip()
            match = re.search(r'\b([1-9]|10)\b', raw_score)
            if match:
                score = int(match.group(1))
                scored_snippets.append((snippet, score))
        except Exception as e:
            logging.error(f"Error in LLM-based ranking: {e}")
            scored_snippets.append((snippet, 0))
    logging.info("LLM-based relevance filtering completed.")
    return sorted(scored_snippets, key=lambda x: x[1], reverse=True)[:3]

In [27]:
# 5. Embedding-Based Re-ranking
def embedding_rerank(query, snippets):
    if not snippets:
        return []
    snippets = [snippet if isinstance(snippet, str) else snippet[0] for snippet in snippets]  # Ensure pure strings

    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    snippet_embeddings = embedding_model.encode(snippets, convert_to_tensor=True)

    if snippet_embeddings.shape[0] == 0:
        logging.warning("No valid embeddings found for snippets.")
        return []  # Avoid empty matrix multiplication error
    
    similarities = util.pytorch_cos_sim(query_embedding, snippet_embeddings)[0]
    ranked_snippets = sorted(zip(snippets, similarities), key=lambda x: x[1], reverse=True)
    
    logging.info("Embedding-based ranking completed.")
    return [snippet for snippet, _ in ranked_snippets[:3]]

In [None]:
# 6. Generate Final Response
def generate_response(video_transcript, web_snippets, query):
    combined_content = f"""
    Below is relevant content extracted from a YouTube video transcript and web sources.

    === Video Transcript (Extract) ===
    {video_transcript[:2000]}  # Increased transcript length

    === Web Snippets ===
    {"\n\n".join(web_snippets)}

    === Task ===
    Provide a **detailed, structured response** to the query: "{query}". 
    - **Contextualize the information** and explain its significance.
    - **Compare different sources**, highlighting key points.
    - If applicable, **predict future trends or suggest actions**.
    - Ensure **clarity, coherence, and depth**.
    """

    response = model.generate_content(combined_content)
    logging.info("Final assistant-style response generated.")
    return response.text.strip()

In [29]:
# 🔥 Example Usage
url = "https://www.youtube.com/watch?v=UeTOW5exFmE"
query = "When will the event end and what would be the ticket pricing?"

video_id = extract_video_id(url)
video_transcript = get_transcript(video_id)
web_results = google_search(query)
bm25_ranked = rank_snippets(query, web_results)
llm_filtered = rank_relevance(query, bm25_ranked)
embedding_ranked = embedding_rerank(query, llm_filtered)
final_response = generate_response(video_transcript, embedding_ranked, query)

print("\n🚀 Final AI Response:\n", final_response)

2025-03-26 23:40:40,450 - INFO - Web search results fetched successfully.
2025-03-26 23:40:40,451 - INFO - BM25 ranking completed.


snippets: ["Dec 17, 2024 ... “The FTC's rule will put an end to junk fees around live event tickets ... The Junk Fees Rule will ensure that pricing information is\xa0...", 'The closer you book to the date of your visit, the higher the ticket price may be. ... All purchases for this event should be made below to ensure authentic\xa0...', "The ticket does not cover the cost of riding Muni's cable car lines. The ticket will be valid all day and expire at the end of the service day at 2:00 am after\xa0...", 'Nov 1, 2023 ... Note: If you do not set an End Time for your event, online registration will automatically close at 12:00am on the End Date. You will need to\xa0...', 'Lightning Lane Multi Pass is not available during the event. Parking fees are not included in the ticket price. Tickets must be purchased online and are not\xa0...']


2025-03-26 23:40:43,726 - INFO - LLM-based relevance filtering completed.
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.65it/s]
2025-03-26 23:40:43,820 - INFO - Embedding-based ranking completed.



🚀 Final AI Response:
 The provided text does not contain the end date of the FIFA World Cup 2026 or its ticket pricing.  The transcript only mentions that it will be held in 2026 and details about the format of the tournament.  The web snippets discuss ticket pricing in general terms (prices increase closer to the event date) but do not refer to FIFA World Cup 2026 ticket prices specifically.
