In [1]:
!pip install gradio google-api-python-client youtube-transcript-api sentence-transformers scikit-learn pandas numpy




In [None]:
import gradio as gr
import pandas as pd
import numpy as np

from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm





In [10]:
import os

# Read the YouTube Data API key from the environment so it is never stored in
# the notebook. Surrounding whitespace (e.g. a trailing newline) is stripped.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '').strip()
# An unset, empty, or whitespace-only variable is unusable: fail here with a
# clear message instead of on the first API request.
if not API_KEY:
    raise RuntimeError('Set YOUTUBE_API_KEY in the environment before running this notebook')

# YouTube Data API v3 client used by get_videos().
youtube = build("youtube", "v3", developerKey=API_KEY)
# Sentence-embedding model used by semantic_search() for videos and the query.
model = SentenceTransformer("all-MiniLM-L6-v2")


In [11]:
def get_videos(search_query, max_results=10):
    """Search YouTube for videos matching a query.

    Args:
        search_query: Text sent as the ``q`` parameter of the search request.
        max_results: Value sent as ``maxResults`` (default 10).

    Returns:
        A ``(video_ids, titles)`` tuple of two parallel lists, in the order
        the API returned the items.
    """
    request = youtube.search().list(
        q=search_query,
        part="id,snippet",
        type="video",
        maxResults=max_results,
    )
    payload = request.execute()

    # Single pass over the items so each entry's id and title are read together.
    pairs = [
        (entry["id"]["videoId"], entry["snippet"]["title"])
        for entry in payload["items"]
    ]
    video_ids = [video_id for video_id, _ in pairs]
    titles = [title for _, title in pairs]
    return video_ids, titles


In [12]:
def _fetch_transcript_text(video_id):
    """Return the transcript of one video as a single space-joined string."""
    if hasattr(YouTubeTranscriptApi, "fetch"):
        # Newer youtube-transcript-api releases expose an instance API;
        # to_raw_data() yields the same list-of-dicts shape as the old call.
        segments = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
    else:
        # Older releases only provide the static get_transcript().
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(segment["text"] for segment in segments)


def get_transcripts(video_ids):
    """Fetch the transcript text for each video, best-effort.

    Args:
        video_ids: Iterable of YouTube video ids.

    Returns:
        A list of strings parallel to ``video_ids``. A video whose transcript
        cannot be fetched contributes an empty string, so one failure does
        not abort the whole batch.
    """
    import logging

    logger = logging.getLogger(__name__)
    transcripts = []
    for vid in video_ids:
        try:
            transcripts.append(_fetch_transcript_text(vid))
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate, and log the cause so a systematic
            # failure is visible instead of silently producing empty text.
            logger.warning("Transcript unavailable for %s: %r", vid, exc)
            transcripts.append("")
    return transcripts


In [13]:
def _render_result_markdown(video_id, title, score):
    """Render one ranked video as a Markdown section (heading, thumbnail, score, link)."""
    safe_title = title.replace("|", "-")
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    thumbnail = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
    return (
        f"\n### {safe_title}\n"
        f"[![Thumbnail]({thumbnail})]({video_url})\n"
        "\n"
        # The two spaces before the newline are a Markdown hard line break;
        # they are written explicitly so an editor cannot strip them.
        f"**Similarity:** `{score:.2f}`  \n"
        f"[Watch on YouTube]({video_url})\n"
        "\n"
        "---\n"
    )


def semantic_search(youtube_query, user_query):
    """Search YouTube and rank the results by semantic similarity to a question.

    Videos are fetched with ``get_videos(youtube_query)``; each one is embedded
    as "title + transcript" and compared with the embedding of ``user_query``
    by cosine similarity. A video whose transcript could not be fetched (empty
    string) is ranked on its title alone.

    Args:
        youtube_query: Topic used for the YouTube search.
        user_query: Question the results are ranked against.

    Returns:
        A Markdown string with one section per video for the five
        highest-scoring results, or a short message when the search returned
        no videos.
    """
    video_ids, titles = get_videos(youtube_query)
    if not video_ids:
        # Nothing to embed or score; return a message rather than failing on
        # empty arrays further down.
        return "No videos found for this search topic."

    transcripts = get_transcripts(video_ids)

    videos = pd.DataFrame({
        "video_id": video_ids,
        "title": titles,
        "transcript": transcripts
    })
    combined_text = (videos["title"] + " " + videos["transcript"]).tolist()

    video_embeddings = model.encode(combined_text)
    query_embedding = model.encode(user_query)

    # cosine_similarity expects 2-D inputs; row 0 holds the query's scores.
    scores = cosine_similarity(
        query_embedding.reshape(1, -1),
        video_embeddings
    )[0]

    top = (
        videos.assign(score=scores)
        .sort_values("score", ascending=False)
        .head(5)
    )

    return "".join(
        _render_result_markdown(vid, title, score)
        for vid, title, score in zip(top["video_id"], top["title"], top["score"])
    )


In [None]:
# Gradio UI: two text inputs (search topic, question) feed semantic_search(),
# whose Markdown return value is rendered as the result list.
interface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(label="YouTube Search Topic", placeholder="fastapi tutorial"),
        gr.Textbox(label="Your Question", placeholder="how to build backend for ML app")
    ],
    outputs=gr.Markdown(label="Top Matching Videos"),
    # The emoji was stored as mojibake ("üîç"); restored to the intended glyph.
    title="AI QueryTube 🔍",
    description="Semantic YouTube search using transcripts and transformer embeddings"
)

# share=True asks Gradio for a public share link in addition to the local URL.
interface.launch(share=True)



* Running on local URL:  http://127.0.0.1:7862

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
