<a href="https://colab.research.google.com/github/Joshika-Mentor/AI-Query-Tube/blob/Ruthika/finaloutput.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio pandas numpy google-api-python-client youtube-transcript-api sentence-transformers scikit-learn


Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.3-py3-none-any.whl.metadata (24 kB)
Downloading youtube_transcript_api-1.2.3-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.1/485.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.2.3


**Module 8: Final Deployment & Search Interface**

In [2]:
import gradio as gr
import pandas as pd
import numpy as np

from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity




In [3]:
import os
from getpass import getpass

# SECURITY: never commit an API key to a notebook. The key is read from the
# YOUTUBE_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed or stored in the notebook.
# NOTE(review): a literal key was previously committed in this cell; it should
# be revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")
if not API_KEY:
    raise ValueError(
        "A YouTube Data API key is required: set YOUTUBE_API_KEY or enter it at the prompt."
    )

# YouTube Data API v3 client shared by the search helpers below.
youtube = build("youtube", "v3", developerKey=API_KEY)

# Sentence-embedding model used to score videos against the semantic query.
model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
def get_videos(search_query, max_results=10):
    """Search YouTube for videos matching a keyword query.

    Args:
        search_query: Keyword query forwarded to the YouTube Data API
            ``search.list`` endpoint via the module-level ``youtube`` client.
        max_results: Number of results to request. Coerced to ``int`` (UI
            sliders may deliver floats) and clamped to the 0..50 range that
            the endpoint accepts.

    Returns:
        A tuple ``(video_ids, titles)`` of two equal-length lists, in the
        order the API returned the results.
    """
    max_results = max(0, min(int(max_results), 50))

    response = youtube.search().list(
        q=search_query,
        part="id,snippet",
        type="video",
        maxResults=max_results
    ).execute()

    video_ids = []
    titles = []

    # Tolerate a response without "items" and skip any entry lacking a
    # videoId, so both lists always stay aligned.
    for item in response.get("items", []):
        video_id = item.get("id", {}).get("videoId")
        if not video_id:
            continue
        video_ids.append(video_id)
        titles.append(item.get("snippet", {}).get("title", ""))

    return video_ids, titles


In [5]:
def get_transcripts(video_ids):
    """Fetch the transcript text for each video, best-effort.

    Works with both API generations of ``youtube-transcript-api``:
    releases that expose the instance method ``YouTubeTranscriptApi().fetch``
    (whose snippets carry a ``.text`` attribute) and older releases that only
    offer the static ``YouTubeTranscriptApi.get_transcript`` (a list of dicts
    with a ``"text"`` key).

    Args:
        video_ids: Iterable of YouTube video IDs.

    Returns:
        A list of strings aligned with ``video_ids``; each entry is the
        transcript segments joined by single spaces, or ``""`` when no
        transcript could be retrieved for that video.
    """
    # Prefer the instance-level fetch() API; calling the removed static
    # get_transcript() on a new release would fail for every single video.
    use_fetch = hasattr(YouTubeTranscriptApi, "fetch")
    api = YouTubeTranscriptApi() if use_fetch else None

    transcripts = []
    for vid in video_ids:
        try:
            if use_fetch:
                segments = [snippet.text for snippet in api.fetch(vid)]
            else:
                segments = [t["text"] for t in YouTubeTranscriptApi.get_transcript(vid)]
            transcripts.append(" ".join(segments))
        except Exception as exc:
            # Missing/disabled transcripts are expected; keep going, but say
            # why so systematic failures are not silently hidden.
            print(f"[get_transcripts] no transcript for {vid}: {type(exc).__name__}")
            transcripts.append("")
    return transcripts


In [6]:
def gradio_semantic_search(youtube_query, user_query, max_results):
    """Search YouTube by keyword, then re-rank the hits by semantic similarity.

    Args:
        youtube_query: Keyword query used to retrieve candidate videos.
        user_query: Natural-language query the candidates are ranked against.
            When blank, ``youtube_query`` is used for ranking as well.
        max_results: Number of candidate videos to retrieve; coerced to int
            because UI sliders may deliver floats.

    Returns:
        A Markdown string with one section per video (title, cosine-similarity
        score, watch link, thumbnail), best match first. A short plain message
        is returned instead when the keyword query is blank or the search
        yields no videos.
    """
    youtube_query = (youtube_query or "").strip()
    if not youtube_query:
        return "Please enter a YouTube search query."
    semantic_query = (user_query or "").strip() or youtube_query

    video_ids, titles = get_videos(youtube_query, int(max_results))
    if not video_ids:
        # Encoding/stacking an empty batch would raise, so stop early.
        return "No videos found for this search query."
    transcripts = get_transcripts(video_ids)

    results = pd.DataFrame({
        "video_id": video_ids,
        "title": titles,
        "transcript": transcripts
    })

    # Title and transcript are embedded together so that videos without a
    # transcript can still be ranked by their title.
    combined_text = (results["title"] + " " + results["transcript"]).tolist()
    doc_embeddings = np.asarray(model.encode(combined_text))
    query_embedding = np.asarray(model.encode(semantic_query)).reshape(1, -1)

    results["score"] = cosine_similarity(query_embedding, doc_embeddings)[0]
    results = results.sort_values(by="score", ascending=False)

    # Format output for Colab (Markdown)
    sections = []
    for row in results.itertuples(index=False):
        video_url = f"https://www.youtube.com/watch?v={row.video_id}"
        thumbnail = f"https://img.youtube.com/vi/{row.video_id}/0.jpg"
        sections.append(
            f"\n### {row.title}\n"
            f"**Semantic Score:** {row.score:.3f}\n"
            f"[▶️ Watch on YouTube]({video_url})\n"
            "\n"
            f"![Thumbnail]({thumbnail})\n"
            "\n"
            "---\n"
        )
    return "".join(sections)


In [7]:
# Build the Gradio UI. Components are laid out in the order they are created
# inside the Blocks context: header, two text inputs, slider, button, results.
with gr.Blocks() as demo:
    gr.Markdown("# 🌐 Global YouTube Semantic Search (AI QueryTube)")

    # Keyword query: passed as the first argument of gradio_semantic_search.
    yt_query = gr.Textbox(
        label="YouTube Search Query (keyword search)",
        placeholder="e.g. python tutorials"
    )

    # Semantic query: passed as the second argument of gradio_semantic_search.
    user_query = gr.Textbox(
        label="Semantic Query (meaning-based)",
        placeholder="e.g. beginner python basics"
    )

    # Number of candidate videos: third argument of gradio_semantic_search.
    max_results = gr.Slider(
        minimum=4, maximum=15, value=10, step=1,
        label="Number of YouTube videos to analyze"
    )

    search_btn = gr.Button("Search")

    # Markdown component that receives the string returned by the search.
    output = gr.Markdown()

    # Wire the button: inputs are passed positionally in the listed order.
    search_btn.click(
        fn=gradio_semantic_search,
        inputs=[yt_query, user_query, max_results],
        outputs=output
    )

# share=True requests a public gradio.live URL in addition to the local server.
# NOTE(review): anyone holding that link can trigger searches with the
# configured API key — confirm that public sharing is intended.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ef8a83223dfcf84f93.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


