# AI Query Tube

## 1. Setup and Imports

In [None]:
%pip install youtube-transcript-api sentence-transformers tf-keras

import os
import sys
import time
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## 2. YouTube Data Collection

In [None]:
# The API key is read from the environment so it never appears in the notebook
# source or its saved outputs. Surrounding whitespace (e.g. a trailing newline
# from a pasted secret) is stripped, and an empty value is treated the same as
# an unset one, so a misconfigured key is reported here instead of reaching the
# API request.
API_KEY = (os.environ.get('YOUTUBE_API_KEY') or '').strip()
if not API_KEY:
    raise RuntimeError('Set YOUTUBE_API_KEY in the environment before running this notebook')
# Channel whose search results are collected by the data-collection cell.
CHANNEL_ID = "UC4SVo0Ue36XCfOyb5Lh1viQ"

In [None]:
# Search endpoint and the query parameters sent with every page request.
# `params` is a plain mutable dict: the pagination loop adds a 'pageToken'
# entry to it for follow-up pages.
url = "https://www.googleapis.com/youtube/v3/search"
params = dict(
    key=API_KEY,
    channelId=CHANNEL_ID,
    part="snippet,id",
    order="date",
    maxResults=50,
    hl="en",
    regionCode="US",
)

def extract_video_fields(item):
    """Pull the identifying fields out of a single search-result item.

    Args:
        item: Nested mapping with an ``"id"`` entry containing ``"videoId"``
            and a ``"snippet"`` entry containing ``"title"`` and
            ``"publishedAt"``.

    Returns:
        A ``(video_id, title, published)`` tuple, in that order.

    Raises:
        KeyError: If one of the keys listed above is missing from a dict input.
    """
    # Lookups run left to right, so the video id is resolved before the snippet fields.
    return (
        item["id"]["videoId"],
        item["snippet"]["title"],
        item["snippet"]["publishedAt"],
    )

videos = []
next_page_token = None

# Upper bound on the number of result pages requested, so a pagination problem
# cannot keep the loop running forever. Raise it to cover a larger channel.
max_pages = 10
page_count = 0

while page_count < max_pages:
    # The first request carries no page token; later ones continue from the
    # token returned with the previous page.
    if next_page_token:
        params['pageToken'] = next_page_token

    try:
        resp = requests.get(url, params=params, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        # Best effort: report the failure and keep the pages collected so far.
        print('Request failed:', e)
        break

    # Keep only the items whose id carries a videoId; each becomes one
    # [video_id, title, published] row.
    videos.extend(
        list(extract_video_fields(item))
        for item in data.get("items", [])
        if "videoId" in item.get("id", {})
    )

    page_count += 1
    next_page_token = data.get("nextPageToken")
    if not next_page_token:
        # No further page was announced by the response.
        break

# One row per collected video; the same table is also written to disk as CSV.
df = pd.DataFrame(videos, columns=["video_id", "title", "published_date"])
df.to_csv("youtube_metadata.csv", index=False)
display(df.head())

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Basic info
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; passing its result to display() would render a stray "None".
df.info()
display(df.describe())

# Missing values
print("Missing values:")
print(df.isnull().sum())

# Publish date distribution
# Parse the timestamp strings so the .dt accessor can extract the year.
df['published_date'] = pd.to_datetime(df['published_date'])
videos_per_year = df['published_date'].dt.year.value_counts().sort_index()

if videos_per_year.empty:
    # No rows were collected (for example the metadata request failed), so
    # there is nothing to draw; skip the bar chart instead of attempting it.
    print("No videos available to plot.")
else:
    # Explicit figure/axes pair keeps the plot independent of pyplot's
    # implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 6))
    videos_per_year.plot(kind='bar', ax=ax)
    ax.set_title('Video Distribution by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Videos')
    plt.show()

## 4. Transcript Fetching

In [None]:
# Transcript download, following the reference notebook week3_transcripts.ipynb:
# fetch each video's transcript and flatten its snippets into one string.
ytt_api = YouTubeTranscriptApi()
DELAY = 1.0  # seconds to pause after every video, whether the fetch worked or not
transcripts = []

for vid in df["video_id"]:
    try:
        fetched = ytt_api.fetch(vid)
        snippet_texts = [piece.text for piece in fetched.snippets]
        transcripts.append(" ".join(snippet_texts))
        print(f"✅ Success: {vid}")
    except Exception as e:
        # Best effort: a video whose transcript cannot be fetched gets a None
        # placeholder, so the list stays aligned with the DataFrame rows.
        print(f"❌ Failed: {vid} | {type(e).__name__}: {e}")
        transcripts.append(None)

    time.sleep(DELAY)

# One entry per video, in the same order as df["video_id"].
df["transcript"] = transcripts
display(df.head())

## 5. Embeddings and Search

In [None]:
# Sentence-embedding model; the same instance must also encode the search
# query so that query and video vectors live in the same space.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One text per video: the title followed by the transcript, with an empty
# string standing in for videos whose transcript is missing.
df["text_for_embedding"] = df["title"] + " " + df["transcript"].fillna("")

# Encode every text in a single batched call instead of one encode() call per
# row; the returned array has one row per text and is split back into
# per-video vectors for the column.
# NOTE(review): the model presumably truncates long inputs, so only the
# beginning of a long transcript may influence its embedding - confirm
# against the model card.
embeddings = model.encode(df["text_for_embedding"].tolist())
df["embedding"] = pd.Series(list(embeddings), index=df.index)

In [None]:
# Example search: embed the query with the same model as the videos and rank
# the videos by cosine similarity to it.
query = "PHP tutorials"
query_embedding = model.encode(query)

# A single query is passed, so `scores` has one row with one similarity per video.
scores = cosine_similarity([query_embedding], list(df["embedding"]))
# Sort ascending, flip to descending, and keep the first five positions
# (best match first).
top_idx = scores[0].argsort()[::-1][:5]

# Title and id of the best matches, with their similarity attached as "score".
results = df.iloc[top_idx][["title", "video_id"]].assign(score=scores[0][top_idx])
display(results)