In [51]:
import re
import requests
import logging
from youtube_transcript_api import YouTubeTranscriptApi
from rank_bm25 import BM25Okapi
import googleapiclient.discovery
import googleapiclient.errors
import google.generativeai as genai
from sentence_transformers import SentenceTransformer, util

In [52]:
# Initialize Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [53]:
import os
import dotenv
from dotenv import load_dotenv

In [54]:
# Your API key and Programmable Search Engine ID
GOOGLE_API_KEY = os.getenv('GOOGLE_CUSTOM_SEARCH_KEY')
SEARCH_ENGINE_ID = os.getenv('CUSTOM_SEARCH_ENGINE_ID')

print('key', GOOGLE_API_KEY, 'csid:', SEARCH_ENGINE_ID)

key AIzaSyCAV73EKedKhVm3Vslz389wY6_OB1z2aw0 csid: 74a9c6ca4ecd7403c


In [55]:
# Configure the Google Generative AI
load_dotenv()
genai.configure(api_key=os.getenv('GOOGLE_GEMINI_API_KEY'))

In [56]:
model = genai.GenerativeModel('gemini-1.5-flash')

In [57]:
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

2025-03-26 23:30:39,390 - INFO - Use pytorch device_name: cpu
2025-03-26 23:30:39,391 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [58]:
# 1. Extract YouTube Video ID & Transcript
def extract_video_id(url):
    match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
    return match.group(1) if match else None

def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(entry['text'] for entry in transcript)
        return re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        return f"Error: {e}"

In [59]:
# 2. Fetch Web Search Results
def google_search(query, num_results=5):
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'q': query,
        'key': GOOGLE_API_KEY,
        'cx': SEARCH_ENGINE_ID,
        'num': num_results
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an error for HTTP issues
        results = response.json()
        snippets = [item['snippet'] for item in results.get("items", [])]
        logging.info("Web search results fetched successfully.")
        return snippets
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching search results: {e}")
        return []

In [60]:
# 3. BM25-Based Ranking
def rank_snippets(query, snippets):
    if not snippets:
        return []  # Avoid ZeroDivisionError
    tokenized_corpus = [snippet.split() for snippet in snippets]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(query.split())
    ranked_snippets = sorted(zip(snippets, scores), key=lambda x: x[1], reverse=True)
    logging.info("BM25 ranking completed.")
    return [snippet for snippet, _ in ranked_snippets[:3]]

In [61]:
# 4. LLM-Based Filtering

def rank_relevance(query, snippets):
    if not snippets:
        return []
    scored_snippets = []
    for snippet in snippets:
        prompt = f"On a scale of 1-10, how relevant is this snippet to '{query}'? Reply with ONLY the number:\n\n{snippet}"
        try:
            response = model.generate_content(prompt)
            raw_score = response.text.strip()
            match = re.search(r'\b([1-9]|10)\b', raw_score)
            if match:
                score = int(match.group(1))
                scored_snippets.append((snippet, score))
        except Exception as e:
            logging.error(f"Error in LLM-based ranking: {e}")
            scored_snippets.append((snippet, 0))
    logging.info("LLM-based relevance filtering completed.")
    return sorted(scored_snippets, key=lambda x: x[1], reverse=True)[:3]

In [62]:
# 5. Embedding-Based Re-ranking
def embedding_rerank(query, snippets):
    if not snippets:
        return []
    snippets = [snippet if isinstance(snippet, str) else snippet[0] for snippet in snippets]  # Ensure pure strings

    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    snippet_embeddings = embedding_model.encode(snippets, convert_to_tensor=True)

    if snippet_embeddings.shape[0] == 0:
        logging.warning("No valid embeddings found for snippets.")
        return []  # Avoid empty matrix multiplication error
    
    similarities = util.pytorch_cos_sim(query_embedding, snippet_embeddings)[0]
    ranked_snippets = sorted(zip(snippets, similarities), key=lambda x: x[1], reverse=True)
    
    logging.info("Embedding-based ranking completed.")
    return [snippet for snippet, _ in ranked_snippets[:3]]

In [63]:
# 6. Generate Final Response
def generate_response(video_transcript, web_snippets, query):
    combined_content = f"Relevant Video Transcript:\n{video_transcript[:1000]}\n\nRelevant Web Snippets:\n" + "\n".join(web_snippets)
    prompt = f"Answer the following query using the provided content:\nQuery: {query}\n\n{combined_content}"
    response = model.generate_content(prompt)
    return response.text.strip()

In [None]:
# 🔥 Example Usage
url = "https://www.youtube.com/watch?v=UeTOW5exFmE"
query = "FIFA World Cup 2026 latest updates"

video_id = extract_video_id(url)
video_transcript = get_transcript(video_id)
web_results = google_search(query)
bm25_ranked = rank_snippets(query, web_results)
llm_filtered = rank_relevance(query, bm25_ranked)
embedding_ranked = embedding_rerank(query, llm_filtered)
final_response = generate_response(video_transcript, embedding_ranked, query)

print("\n🚀 Final AI Response:\n", final_response)