# **MILESTONE 1: YouTube Data Collection and API Mastery (Weeks 1-2)**


In [1]:
# Cell 1: Install Required Libraries
!pip install -q requests pandas youtube-transcript-api sentence-transformers scikit-learn gradio

In [3]:
# Cell 2: Import Libraries
import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


In [None]:
# Cell 3: Set API Key and Channel ID
API_KEY = '________________________' 
CHANNEL_ID = "UCBkOVp1Cqz4MR0LYR8vKpZg"  # YouTube channel ID

print(f"Channel ID: {CHANNEL_ID}")

Channel ID: UCBkOVp1Cqz4MR0LYR8vKpZg


In [33]:
url = "https://www.googleapis.com/youtube/v3/search"
params = {
    "key":API_KEY,
    "channelId":CHANNEL_ID,
    "part": "snippet,id",
    "order":"date",
    "maxResults":50
}

response = requests.get(url, params=params).json()

videos_data = []
for item in response["items"]:
    if "videoId" in item["id"]:
        video_id = item["id"]["videoId"]
        title = item["snippet"]["title"]
        published = item["snippet"]["publishedAt"]
        videos_data.append([video_id, title, published])

df = pd.DataFrame(videos_data, columns=["video_id", "title", "published_date"])
df.head()

Unnamed: 0,video_id,title,published_date
0,WJ8iTQpNxZA,Spherical Coordinate System Explained | Electr...,2026-01-04T05:06:29Z
1,c_uhZ-yl-Rs,Cylindrical Coordinate System Explained | Cart...,2025-12-25T18:51:09Z
2,QiJYv4hHcLY,What is Virtual Ground in Op-Amp ? Why Virtual...,2025-12-09T17:52:45Z
3,Fk49H5xS_h0,Interesting thing about XOR and XNOR gates you...,2025-11-30T16:38:52Z
4,tWyPuCbYLUg,"Solved Problems on Vectors, Vector Algebra and...",2025-11-06T15:10:54Z


In [34]:
print(f"\n‚úÖ Successfully fetched {len(df)} videos!")
print("\nDataFrame Info:")
print(df.info())
print("\nFirst 5 videos:")
df.head()


‚úÖ Successfully fetched 48 videos!

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   video_id        48 non-null     object
 1   title           48 non-null     object
 2   published_date  48 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB
None

First 5 videos:


Unnamed: 0,video_id,title,published_date
0,WJ8iTQpNxZA,Spherical Coordinate System Explained | Electr...,2026-01-04T05:06:29Z
1,c_uhZ-yl-Rs,Cylindrical Coordinate System Explained | Cart...,2025-12-25T18:51:09Z
2,QiJYv4hHcLY,What is Virtual Ground in Op-Amp ? Why Virtual...,2025-12-09T17:52:45Z
3,Fk49H5xS_h0,Interesting thing about XOR and XNOR gates you...,2025-11-30T16:38:52Z
4,tWyPuCbYLUg,"Solved Problems on Vectors, Vector Algebra and...",2025-11-06T15:10:54Z


In [35]:
# Cell 6: Basic EDA - Data Overview
print("=" * 60)
print("EXPLORATORY DATA ANALYSIS")
print("=" * 60)

print(f"\nüìä Total Videos: {len(df)}")
print(f"üìä Date Range: {df['published_date'].min()} to {df['published_date'].max()}")
print(f"üìä Unique Titles: {df['title'].nunique()}")
print(f"üìä Missing Values:\n{df.isnull().sum()}")

EXPLORATORY DATA ANALYSIS

üìä Total Videos: 48
üìä Date Range: 2024-06-26T16:45:10Z to 2026-01-04T05:06:29Z
üìä Unique Titles: 48
üìä Missing Values:
video_id          0
title             0
published_date    0
dtype: int64


In [36]:
# Cell 7: Convert Date and Time Analysis
# Convert published_date to datetime
df['published_date'] = pd.to_datetime(df['published_date'])
df['year'] = df['published_date'].dt.year
df['month'] = df['published_date'].dt.month
df['day_of_week'] = df['published_date'].dt.day_name()

print("\nüìÖ Videos by Year:")
print(df['year'].value_counts().sort_index())



üìÖ Videos by Year:
year
2024    19
2025    28
2026     1
Name: count, dtype: int64


In [37]:
# Cell 8: Save Initial Dataset
df.to_csv("youtube_metadata.csv", index=False)
print("‚úÖ Metadata saved to 'youtube_metadata.csv'")

‚úÖ Metadata saved to 'youtube_metadata.csv'


# **MILESTONE 2: Transcript Extraction and Data Cleaning (Weeks 3-4)**

In [38]:
# Cell 9: Extract Transcripts
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

def get_transcript(video_id):
    """
    Fetch transcript for a YouTube video
    
    Args:
        video_id: YouTube video ID
    
    Returns:
        Transcript text or None if unavailable
    """
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = " ".join([entry["text"] for entry in transcript_list])
        return transcript_text
    except TranscriptsDisabled:
        return None
    except NoTranscriptFound:
        return None
    except Exception as e:
        return None

In [51]:
def fetch_transcripts_from_df(df, output_file='youtube_metadata_with_transcripts.csv'):
    """
    Fetch transcripts for all videos in the dataframe
    
    Args:
        df: DataFrame with video_id column
        output_file: Path to save the output CSV
    
    Returns:
        DataFrame with transcripts added
    """
    # Instantiate API object (Critical fix)
    api = YouTubeTranscriptApi()
    
    transcripts = []
    failed_ids = []
    
    print(f"Processing {len(df)} videos...")
    
    for i, row in df.iterrows():
        vid = row['video_id']
        title = row['title']
        
        print(f"[{i+1}/{len(df)}] Fetching: {vid} - {title[:40]}...")
        
        try:
            # Correct instance method usage
            transcript_obj = api.fetch(vid)
            
            if hasattr(transcript_obj, 'snippets'):
                full_text = " ".join([s.text for s in transcript_obj.snippets])
                full_text = full_text.replace("\n", " ").strip()
                transcripts.append(full_text)
                print("  ‚úì Success")
            else:
                print("  ‚ö†Ô∏è  No snippets found")
                transcripts.append(None)
                failed_ids.append(vid)
                
        except Exception as e:
            print(f"  ‚úó Failed: {e}")
            transcripts.append(None)
            failed_ids.append(vid)
            
        # Rate limiting to be safe
        time.sleep(0.5)
    
    # Add transcripts to dataframe
    df['transcript'] = transcripts
    
    # Save to CSV
    try:
        df.to_csv(output_file, index=False)
        print(f"\n‚úÖ Successfully saved to {output_file}")
    except PermissionError:
        print(f"\n‚ö†Ô∏è  PermissionError: Could not save to {output_file}. Is it open?")
        alt_file = f"youtube_metadata_final_{int(time.time())}.csv"
        df.to_csv(alt_file, index=False)
        print(f"‚úÖ Saved to {alt_file} instead.")
    
    # Print summary
    print(f"\n{'='*60}")
    print("TRANSCRIPT EXTRACTION SUMMARY")
    print(f"{'='*60}")
    print(f"üìä Total Videos: {len(df)}")
    print(f"‚úÖ Videos with transcripts: {df['transcript'].notna().sum()}")
    print(f"‚ùå Videos without transcripts: {df['transcript'].isna().sum()}")
    
    if failed_ids:
        print(f"\n‚ö†Ô∏è  Failed video IDs ({len(failed_ids)} total):")
        print(f"   {failed_ids[:10]}...")  # Show first 10
    
    return df

# Execute transcript fetching
df = fetch_transcripts_from_df(df)


Processing 48 videos...
[1/48] Fetching: WJ8iTQpNxZA - Spherical Coordinate System Explained | ...
  ‚úì Success
[2/48] Fetching: c_uhZ-yl-Rs - Cylindrical Coordinate System Explained ...
  ‚úì Success
[3/48] Fetching: QiJYv4hHcLY - What is Virtual Ground in Op-Amp ? Why V...
  ‚úì Success
[4/48] Fetching: Fk49H5xS_h0 - Interesting thing about XOR and XNOR gat...
  ‚úì Success
[5/48] Fetching: tWyPuCbYLUg - Solved Problems on Vectors, Vector Algeb...
  ‚úì Success
[6/48] Fetching: jGoVQ6tDE3M - Vector Algebra | Vector Addition and Sub...
  ‚úì Success
[7/48] Fetching: FsS-FUdS4J4 - Introduction to Electromagnetic Theory...
  ‚úì Success
[8/48] Fetching: 0LSH-xH5LN4 - What is PCB Vias ? Types of Vias in PCB....
  ‚úì Success
[9/48] Fetching: Gu9M1auKVsk - Different Operating Regions of BJT #alla...
  ‚úì Success
[10/48] Fetching: XeZtJwiF4Oc - PCB Explained | What is PCB ? Types of P...
  ‚úì Success
[11/48] Fetching: GHQ8vbOZMnI - What is Zero PCB ? Why it is used? How Z...
  ‚úì Succe

In [52]:
# Cell 11: Data Cleaning
import re

def clean_text(text):
    """Clean and normalize text"""
    if pd.isna(text):
        return ""
    
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Convert to lowercase
    text = text.lower()
    
    return text

# Apply cleaning
df['title_clean'] = df['title'].apply(clean_text)
df['transcript_clean'] = df['transcript'].apply(clean_text)

# Create combined text for embedding
df['combined_text'] = df['title_clean'] + " " + df['transcript_clean']

print("‚úÖ Text cleaning complete!")
df[['title', 'title_clean', 'transcript_clean']].head()


‚úÖ Text cleaning complete!


Unnamed: 0,title,title_clean,transcript_clean
0,Spherical Coordinate System Explained | Electr...,spherical coordinate system explained electrom...,"hey friends, welcome to the youtube channel al..."
1,Cylindrical Coordinate System Explained | Cart...,cylindrical coordinate system explained cartes...,"hey friends, welcome to the youtube channel al..."
2,What is Virtual Ground in Op-Amp ? Why Virtual...,what is virtual ground in op-amp ? why virtual...,"in this short video, let us understand what is..."
3,Interesting thing about XOR and XNOR gates you...,interesting thing about xor and xnor gates you...,"in this short video, let us learn one interest..."
4,"Solved Problems on Vectors, Vector Algebra and...","solved problems on vectors, vector algebra and...","music hey friends, welcome to the youtube chan..."


In [53]:
# Cell 12: Create Evaluation Queries for Electronics YouTube Channel
evaluation_queries = [
    # Basic Electronics Concepts (10)
    "basic electronics tutorial",
    "what is voltage and current",
    "ohms law explained",
    "resistor color code",
    "capacitor working principle",
    "inductor basics",
    "diode tutorial",
    "transistor fundamentals",
    "LED circuit design",
    "breadboard basics",
    
    # Circuit Theory (10)
    "series and parallel circuits",
    "kirchhoff voltage law",
    "kirchhoff current law",
    "thevenin theorem",
    "norton theorem",
    "superposition theorem",
    "ac and dc circuits",
    "impedance calculation",
    "resonance in circuits",
    "filter circuit design",
    
    # Components and Devices (10)
    "types of resistors",
    "ceramic capacitor vs electrolytic",
    "MOSFET vs BJT",
    "op amp applications",
    "555 timer circuit",
    "voltage regulator tutorial",
    "relay working principle",
    "transformer basics",
    "how batteries work",
    "semiconductor devices",
    
    # Digital Electronics (10)
    "logic gates tutorial",
    "boolean algebra basics",
    "flip flop circuits",
    "counter circuits",
    "shift register explained",
    "analog to digital converter",
    "multiplexer and demultiplexer",
    "encoder and decoder",
    "microcontroller basics",
    "Arduino tutorial",
    
    # Power Electronics (8)
    "power supply design",
    "rectifier circuit",
    "voltage regulator design",
    "SMPS working principle",
    "inverter circuit",
    "battery charging circuit",
    "solar panel circuit",
    "power factor correction",
    
    # Measurement and Tools (8)
    "multimeter tutorial",
    "oscilloscope basics",
    "function generator usage",
    "soldering techniques",
    "PCB design tutorial",
    "circuit debugging tips",
    "how to use breadboard",
    "component testing methods",
    
    # Communication Electronics (7)
    "amplifier circuit design",
    "radio frequency basics",
    "antenna design",
    "modulation techniques",
    "wireless communication",
    "signal processing",
    "transmission line theory",
    
    # Embedded Systems (7)
    "microprocessor vs microcontroller",
    "8051 microcontroller",
    "PIC microcontroller tutorial",
    "embedded C programming",
    "sensor interfacing",
    "motor control circuit",
    "IoT electronics projects",
    
    # Practical Projects (10)
    "DIY electronics projects",
    "home automation circuit",
    "security alarm system",
    "LED chaser circuit",
    "audio amplifier project",
    "temperature sensor circuit",
    "speed controller circuit",
    "battery level indicator",
    "clap switch circuit",
    "automatic light controller",
]

print(f"‚úÖ Created {len(evaluation_queries)} evaluation queries for Electronics channel")
print(f"\nüìã Query Categories:")
print("  ‚Ä¢ Basic Electronics Concepts: 10")
print("  ‚Ä¢ Circuit Theory: 10")
print("  ‚Ä¢ Components and Devices: 10")
print("  ‚Ä¢ Digital Electronics: 10")
print("  ‚Ä¢ Power Electronics: 8")
print("  ‚Ä¢ Measurement and Tools: 8")
print("  ‚Ä¢ Communication Electronics: 7")
print("  ‚Ä¢ Embedded Systems: 7")
print("  ‚Ä¢ Practical Projects: 10")

print("\nüîç Sample queries from each category:")
print("\nBasic Electronics:")
print(f"  - {evaluation_queries[0]}")
print(f"  - {evaluation_queries[2]}")
print("\nCircuit Theory:")
print(f"  - {evaluation_queries[10]}")
print(f"  - {evaluation_queries[12]}")
print("\nDigital Electronics:")
print(f"  - {evaluation_queries[30]}")
print(f"  - {evaluation_queries[32]}")
print("\nPractical Projects:")
print(f"  - {evaluation_queries[70]}")
print(f"  - {evaluation_queries[72]}")


‚úÖ Created 80 evaluation queries for Electronics channel

üìã Query Categories:
  ‚Ä¢ Basic Electronics Concepts: 10
  ‚Ä¢ Circuit Theory: 10
  ‚Ä¢ Components and Devices: 10
  ‚Ä¢ Digital Electronics: 10
  ‚Ä¢ Power Electronics: 8
  ‚Ä¢ Measurement and Tools: 8
  ‚Ä¢ Communication Electronics: 7
  ‚Ä¢ Embedded Systems: 7
  ‚Ä¢ Practical Projects: 10

üîç Sample queries from each category:

Basic Electronics:
  - basic electronics tutorial
  - ohms law explained

Circuit Theory:
  - series and parallel circuits
  - kirchhoff current law

Digital Electronics:
  - logic gates tutorial
  - flip flop circuits

Practical Projects:
  - DIY electronics projects
  - security alarm system


In [54]:
# Cell 13: Save Cleaned Dataset
df.to_csv("youtube_data_with_transcripts.csv", index=False)
print("‚úÖ Cleaned dataset saved to 'youtube_data_with_transcripts.csv'")

‚úÖ Cleaned dataset saved to 'youtube_data_with_transcripts.csv'


# **MILESTONE 3: Sentence Transformer Evaluation (Weeks 5-6)**

In [49]:
# Cell 14: Load Sentence Transformer Models
from sentence_transformers import SentenceTransformer, util

# Load three models for comparison
models = {
    'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
    'paraphrase-MiniLM-L12-v2': SentenceTransformer('paraphrase-MiniLM-L12-v2'),
    'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2')
}

print("‚úÖ Models loaded successfully!")
for model_name in models.keys():
    print(f"  - {model_name}")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Models loaded successfully!
  - all-MiniLM-L6-v2
  - paraphrase-MiniLM-L12-v2
  - all-mpnet-base-v2


In [55]:
# Cell 15: Generate Embeddings for Videos
# Filter out videos without transcripts for better results
df_with_transcripts = df[df['transcript'].notna()].copy()

print(f"Generating embeddings for {len(df_with_transcripts)} videos...")

embeddings_dict = {}

for model_name, model in models.items():
    print(f"\nProcessing with {model_name}...")
    
    # Embed combined text
    embeddings = model.encode(
        df_with_transcripts['combined_text'].tolist(),
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    embeddings_dict[model_name] = embeddings
    print(f"  Shape: {embeddings.shape}")

print("\n‚úÖ Embeddings generated for all models!")


Generating embeddings for 31 videos...

Processing with all-MiniLM-L6-v2...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Shape: (31, 384)

Processing with paraphrase-MiniLM-L12-v2...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Shape: (31, 384)

Processing with all-mpnet-base-v2...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Shape: (31, 768)

‚úÖ Embeddings generated for all models!


In [56]:
# Cell 16: Similarity Metrics Comparison
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

def compute_similarities(query_embedding, corpus_embeddings, metric='cosine'):
    """
    Compute similarity between query and corpus
    
    Args:
        query_embedding: Query vector
        corpus_embeddings: Matrix of corpus vectors
        metric: 'cosine', 'euclidean', or 'manhattan'
    
    Returns:
        Array of similarity scores
    """
    query_embedding = query_embedding.reshape(1, -1)
    
    if metric == 'cosine':
        scores = cosine_similarity(query_embedding, corpus_embeddings)[0]
        # Higher is better
        return scores
    elif metric == 'euclidean':
        scores = euclidean_distances(query_embedding, corpus_embeddings)[0]
        # Lower is better, so negate
        return -scores
    elif metric == 'manhattan':
        scores = manhattan_distances(query_embedding, corpus_embeddings)[0]
        # Lower is better, so negate
        return -scores
    else:
        raise ValueError(f"Unknown metric: {metric}")


In [57]:
# Cell 17: Evaluate Models and Metrics
def evaluate_search(model, model_name, embeddings, queries, df, top_k=5):
    """
    Evaluate search performance
    """
    metrics_list = ['cosine', 'euclidean', 'manhattan']
    results = {}
    
    for metric in metrics_list:
        print(f"\n{model_name} - {metric}:")
        
        # Encode queries
        query_embeddings = model.encode(queries, convert_to_numpy=True)
        
        # For each query, find top matches
        for i, query in enumerate(queries[:5]):  # Show first 5 queries
            query_emb = query_embeddings[i]
            
            # Compute similarities
            scores = compute_similarities(query_emb, embeddings, metric)
            
            # Get top k indices
            top_indices = np.argsort(scores)[-top_k:][::-1]
            
            print(f"\nQuery: '{query}'")
            print("Top matches:")
            for idx in top_indices:
                video_title = df.iloc[idx]['title'][:60]
                print(f"  - {video_title}... (score: {scores[idx]:.4f})")
    
    return results


In [58]:
# Cell 18: Run Evaluation
# Use a subset of evaluation queries for testing
test_queries = evaluation_queries[:10]

for model_name, model in models.items():
    print("\n" + "=" * 70)
    print(f"EVALUATING: {model_name}")
    print("=" * 70)
    
    evaluate_search(
        model, 
        model_name, 
        embeddings_dict[model_name], 
        test_queries, 
        df_with_transcripts,
        top_k=3
    )



EVALUATING: all-MiniLM-L6-v2

all-MiniLM-L6-v2 - cosine:

Query: 'basic electronics tutorial'
Top matches:
  - Different Operating Regions of BJT #allaboutelectronics #bjt... (score: 0.2805)
  - Vector Algebra | Vector Addition and Subtraction | Concept o... (score: 0.2786)
  - Solved Problems on Vectors, Vector Algebra and Vector Fields... (score: 0.2624)

Query: 'what is voltage and current'
Top matches:
  - Vector Algebra | Vector Addition and Subtraction | Concept o... (score: 0.2874)
  - Zener Diode Solved Problems | Line and Load Regulation in Ze... (score: 0.2586)
  - 4 to 20 mA Standard Explained | Advantages of 4 to 20 mA Sta... (score: 0.2323)

Query: 'ohms law explained'
Top matches:
  - BiCMOS Logic Gates Explained | BiCMOS NAND gate and NOR gate... (score: 0.2650)
  - ECL Logic Explained | ECL OR and NOR gate explained... (score: 0.2634)
  - Different Operating Regions of BJT #allaboutelectronics #bjt... (score: 0.2096)

Query: 'resistor color code'
Top matches:
  - Serie

In [59]:
# Cell 19: Select Best Model and Create Final Index
# Based on evaluation, select the best model
BEST_MODEL_NAME = 'all-MiniLM-L6-v2'  # Adjust based on your evaluation
BEST_MODEL = models[BEST_MODEL_NAME]
BEST_EMBEDDINGS = embeddings_dict[BEST_MODEL_NAME]

print(f"Selected model: {BEST_MODEL_NAME}")
print(f"Embedding shape: {BEST_EMBEDDINGS.shape}")

# Add embeddings to dataframe
embedding_columns = [f'emb_{i}' for i in range(BEST_EMBEDDINGS.shape[1])]
embeddings_df = pd.DataFrame(BEST_EMBEDDINGS, columns=embedding_columns)

# Combine with original dataframe
df_final = pd.concat([df_with_transcripts.reset_index(drop=True), embeddings_df], axis=1)

print(f"\n‚úÖ Final dataframe shape: {df_final.shape}")
df_final.head()


Selected model: all-MiniLM-L6-v2
Embedding shape: (31, 384)

‚úÖ Final dataframe shape: (31, 394)


Unnamed: 0,video_id,title,published_date,year,month,day_of_week,transcript,title_clean,transcript_clean,combined_text,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,WJ8iTQpNxZA,Spherical Coordinate System Explained | Electr...,2026-01-04 05:06:29+00:00,2026,1,Sunday,"Hey friends, welcome to the YouTube channel AL...",spherical coordinate system explained electrom...,"hey friends, welcome to the youtube channel al...",spherical coordinate system explained electrom...,...,0.014898,-0.035594,0.079857,-0.028819,-0.028986,0.046605,-0.078655,0.086435,-0.018098,-0.030392
1,c_uhZ-yl-Rs,Cylindrical Coordinate System Explained | Cart...,2025-12-25 18:51:09+00:00,2025,12,Thursday,"Hey friends, welcome to the YouTube channel AL...",cylindrical coordinate system explained cartes...,"hey friends, welcome to the youtube channel al...",cylindrical coordinate system explained cartes...,...,-0.024495,0.007653,0.076006,-0.033852,0.043374,-0.015205,-0.055626,0.098105,-0.031456,-0.054638
2,QiJYv4hHcLY,What is Virtual Ground in Op-Amp ? Why Virtual...,2025-12-09 17:52:45+00:00,2025,12,Tuesday,"In this short video, let us understand what¬† i...",what is virtual ground in op-amp ? why virtual...,"in this short video, let us understand what is...",what is virtual ground in op-amp ? why virtual...,...,-0.050358,-0.009683,-0.014643,-0.015731,0.028828,0.035623,-0.050781,0.010314,0.063209,0.015653
3,Fk49H5xS_h0,Interesting thing about XOR and XNOR gates you...,2025-11-30 16:38:52+00:00,2025,11,Sunday,"In this short video, let us learn one interest...",interesting thing about xor and xnor gates you...,"in this short video, let us learn one interest...",interesting thing about xor and xnor gates you...,...,0.010095,-0.002567,-0.060339,0.021322,-0.054585,0.105726,0.045492,0.080638,0.002986,-0.03303
4,tWyPuCbYLUg,"Solved Problems on Vectors, Vector Algebra and...",2025-11-06 15:10:54+00:00,2025,11,Thursday,"[music] Hey friends, welcome to the YouTube ch...","solved problems on vectors, vector algebra and...","music hey friends, welcome to the youtube chan...","solved problems on vectors, vector algebra and...",...,-0.028506,0.019802,0.028519,-0.031301,0.01255,0.003249,-0.075965,0.057644,-0.068243,0.00566


In [63]:
# Cell 20: Save Final Index
# Save as CSV instead of parquet to avoid dependency issues
df_final.to_csv("youtube_video_index.csv", index=False)
print("‚úÖ Video index saved to 'youtube_video_index.csv'")

# Also save as CSV (without embeddings) for reference
df_final[['video_id', 'title', 'published_date', 'transcript']].to_csv(
    "youtube_video_metadata_final.csv", 
    index=False
)
print("‚úÖ Metadata saved to 'youtube_video_metadata_final.csv'")

‚úÖ Video index saved to 'youtube_video_index.csv'
‚úÖ Metadata saved to 'youtube_video_metadata_final.csv'


# **MILESTONE 4: Semantic Search Implementation (Weeks 7-8)**

In [64]:
# Cell 21: Load Saved Index and Model
# For a fresh session, load the saved data
df_index = pd.read_csv("youtube_video_index.csv")
search_model = SentenceTransformer(BEST_MODEL_NAME)

# Extract embeddings from dataframe
embedding_cols = [col for col in df_index.columns if col.startswith('emb_')]
corpus_embeddings = df_index[embedding_cols].values

print(f"‚úÖ Loaded index with {len(df_index)} videos")
print(f"‚úÖ Embedding dimension: {corpus_embeddings.shape[1]}")

‚úÖ Loaded index with 31 videos
‚úÖ Embedding dimension: 384


In [80]:
# Cell 22: Create Search Function
def search_videos(query, model, corpus_embeddings, df, top_k=5, metric='cosine', threshold=None):
    """
    Search for videos based on semantic similarity
    
    Args:
        query: Search query string
        model: SentenceTransformer model
        corpus_embeddings: Precomputed video embeddings
        df: DataFrame with video metadata
        top_k: Number of results to return
        metric: Similarity metric ('cosine', 'euclidean', 'manhattan')
        threshold: Minimum similarity threshold (only applied for cosine)
    
    Returns:
        DataFrame with top matching videos
    """
    # Encode query
    query_embedding = model.encode([query], convert_to_numpy=True)[0]
    
    # Compute similarities
    scores = compute_similarities(query_embedding, corpus_embeddings, metric)
    
    # Apply threshold only for cosine similarity
    if threshold is not None and metric == 'cosine':
        mask = scores >= threshold
        filtered_indices = np.where(mask)[0]
        filtered_scores = scores[mask]
        
        if len(filtered_indices) == 0:
            print("No results above threshold")
            return pd.DataFrame()
    else:
        # For euclidean and manhattan, don't use threshold filtering
        filtered_indices = np.arange(len(scores))
        filtered_scores = scores
    
    # Sort and get top k
    sorted_idx = np.argsort(filtered_scores)[-top_k:][::-1]
    top_indices = filtered_indices[sorted_idx]
    top_scores = filtered_scores[sorted_idx]
    
    # Create results dataframe
    results = df.iloc[top_indices][['video_id', 'title', 'published_date']].copy()
    results['similarity_score'] = top_scores
    results['rank'] = range(1, len(results) + 1)
    
    return results.reset_index(drop=True)

In [81]:
# Cell 23: Test Search Function
test_query = "How Transistors work"

print(f"üîç Searching for: '{test_query}'\n")
results = search_videos(
    test_query, 
    search_model, 
    corpus_embeddings, 
    df_index,
    top_k=5,
    metric='cosine'
)

print(results[['rank', 'title', 'similarity_score']])


üîç Searching for: 'How Transistors work'

   rank                                              title  similarity_score
0     1  What is Diode Connected Transistor? #allaboute...          0.487581
1     2  ECL Logic Explained | ECL OR and NOR gate expl...          0.456276
2     3        TTL Logic: TTL NAND and NOR gates Explained          0.410409
3     4  BiCMOS Logic Gates Explained | BiCMOS NAND gat...          0.395695
4     5  TTL Logic Explained | TTL Inverter Circuit | N...          0.359692


In [82]:
# Cell 24: Compare Different Metrics
def compare_metrics(query, model, embeddings, df, top_k=5):
    """Compare results across different metrics"""
    metrics = ['cosine', 'euclidean', 'manhattan']
    
    print(f"üîç Query: '{query}'\n")
    
    for metric in metrics:
        print(f"\n{'='*70}")
        print(f"Metric: {metric.upper()}")
        print('='*70)
        
        results = search_videos(query, model, embeddings, df, top_k, metric)
        
        for _, row in results.iterrows():
            print(f"{row['rank']}. {row['title'][:70]}")
            print(f"   Score: {row['similarity_score']:.4f}\n")
# Test comparison
compare_metrics("transistor basics tutorial", search_model, corpus_embeddings, df_index, top_k=3)

üîç Query: 'transistor basics tutorial'


Metric: COSINE
1. What is Diode Connected Transistor? #allaboutelectronics
   Score: 0.4484

2. BiCMOS Logic Gates Explained | BiCMOS NAND gate and NOR gates
   Score: 0.3777

3. TTL Logic: TTL NAND and NOR gates Explained
   Score: 0.3445


Metric: EUCLIDEAN
1. What is Diode Connected Transistor? #allaboutelectronics
   Score: -1.0503

2. BiCMOS Logic Gates Explained | BiCMOS NAND gate and NOR gates
   Score: -1.1156

3. TTL Logic: TTL NAND and NOR gates Explained
   Score: -1.1450


Metric: MANHATTAN
1. What is Diode Connected Transistor? #allaboutelectronics
   Score: -16.3069

2. BiCMOS Logic Gates Explained | BiCMOS NAND gate and NOR gates
   Score: -17.6623

3. TTL Logic: TTL NAND and NOR gates Explained
   Score: -17.7227



In [83]:
# Cell 25: Optimize Threshold
def find_optimal_threshold(queries, model, embeddings, df, metric='cosine'):
    """
    Find optimal threshold by testing multiple values
    Note: Threshold optimization only works for cosine similarity
    """
    if metric != 'cosine':
        print(f"‚ö†Ô∏è  Threshold optimization only supported for cosine similarity")
        print(f"   Using metric '{metric}' without threshold filtering")
        return None, None
    
    thresholds = np.arange(0.0, 0.5, 0.05)
    results_counts = []
    
    print(f"Testing thresholds for {metric} similarity...\n")
    
    for threshold in thresholds:
        total_results = 0
        for query in queries:
            results = search_videos(query, model, embeddings, df, top_k=5, 
                                   metric=metric, threshold=threshold)
            total_results += len(results)
        
        avg_results = total_results / len(queries)
        results_counts.append(avg_results)
        print(f"Threshold: {threshold:.2f} - Avg results per query: {avg_results:.1f}")
    
    # Find optimal threshold (one that returns ~3-5 results on average)
    optimal_idx = min(range(len(results_counts)), 
                     key=lambda i: abs(results_counts[i] - 4.0))
    optimal_threshold = thresholds[optimal_idx]
    
    print(f"\n‚úÖ Recommended threshold: {optimal_threshold:.2f}")
    print(f"   Average results: {results_counts[optimal_idx]:.1f}")
    
    return thresholds, results_counts

# Test with sample queries
print("Finding optimal threshold for cosine similarity...")
thresholds, counts = find_optimal_threshold(
    test_queries[:5], 
    search_model, 
    corpus_embeddings, 
    df_index,
    metric='cosine'  # Only use cosine for threshold optimization
)

Finding optimal threshold for cosine similarity...
Testing thresholds for cosine similarity...

Threshold: 0.00 - Avg results per query: 5.0
Threshold: 0.05 - Avg results per query: 5.0
Threshold: 0.10 - Avg results per query: 4.6
Threshold: 0.15 - Avg results per query: 3.6
No results above threshold
Threshold: 0.20 - Avg results per query: 2.8
No results above threshold
Threshold: 0.25 - Avg results per query: 2.0
No results above threshold
No results above threshold
No results above threshold
No results above threshold
No results above threshold
Threshold: 0.30 - Avg results per query: 0.0
No results above threshold
No results above threshold
No results above threshold
No results above threshold
No results above threshold
Threshold: 0.35 - Avg results per query: 0.0
No results above threshold
No results above threshold
No results above threshold
No results above threshold
No results above threshold
Threshold: 0.40 - Avg results per query: 0.0
No results above threshold
No results ab

In [84]:
# Cell 26: Create Gradio Interface
import gradio as gr

def gradio_search(query, top_k=5, metric='cosine', threshold=0.0):
    """
    Gradio-compatible search function
    """
    if not query.strip():
        return "Please enter a search query."
    
    results = search_videos(
        query, 
        search_model, 
        corpus_embeddings, 
        df_index,
        top_k=int(top_k),
        metric=metric,
        threshold=float(threshold)
    )
    
    if len(results) == 0:
        return "No results found. Try lowering the threshold or using a different query."
    
    # Format results with embedded videos
    output = f"## üîç Search Results for: '{query}'\n\n"
    
    for _, row in results.iterrows():
        video_id = row['video_id']
        title = row['title']
        score = row['similarity_score']
        date = row['published_date']
        
        output += f"### {row['rank']}. {title}\n"
        output += f"**Similarity Score:** {score:.4f} | **Published:** {str(date)[:10]}\n\n"
        
        # Embed YouTube video
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        output += f"[![Watch on YouTube](https://img.youtube.com/vi/{video_id}/0.jpg)]({video_url})\n\n"
        output += f"[üé¨ Watch on YouTube]({video_url})\n\n"
        output += "---\n\n"
    
    return output


In [85]:

# Cell 27: Launch Gradio Interface
interface = gr.Interface(
    fn=gradio_search,
    inputs=[
        gr.Textbox(
            label="Search Query", 
            placeholder="Enter your search query here...",
            lines=2
        ),
        gr.Slider(
            minimum=1, 
            maximum=10, 
            value=5, 
            step=1, 
            label="Number of Results"
        ),
        gr.Radio(
            choices=['cosine', 'euclidean', 'manhattan'], 
            value='cosine', 
            label="Similarity Metric"
        ),
        gr.Slider(
            minimum=0.0, 
            maximum=0.5, 
            value=0.0, 
            step=0.05, 
            label="Similarity Threshold"
        )
    ],
    outputs=gr.Markdown(label="Search Results"),
    title="üé• QueryTube AI - Semantic Video Search",
    description="""
    Search through YouTube videos using natural language queries. 
    The system uses AI-powered semantic search to find the most relevant videos.
    """,
    examples=[
        ["python programming tutorial", 5, "cosine", 0.0],
        ["machine learning basics", 3, "cosine", 0.1],
        ["web development guide", 5, "euclidean", 0.0],
    ],
    theme=gr.themes.Soft()
)

# Launch the interface
interface.launch(share=True, debug=True)

print("\n‚úÖ Gradio interface launched!")
print("üåê Access your search engine through the provided URL")


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://d3edfd16d8c4e0e77d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Created dataset file at: .gradio\flagged\dataset1.csv
No results above threshold
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://d3edfd16d8c4e0e77d.gradio.live

‚úÖ Gradio interface launched!
üåê Access your search engine through the provided URL


In [86]:
# Cell 28: Export Final Summary
print("\n" + "="*70)
print("PROJECT SUMMARY")
print("="*70)

summary = f"""
üìä QUERYTUBE AI - SEMANTIC SEARCH ENGINE

Dataset Statistics:
- Total Videos Collected: {len(df)}
- Videos with Transcripts: {len(df_with_transcripts)}
- Embedding Dimension: {corpus_embeddings.shape[1]}

Model Configuration:
- Selected Model: {BEST_MODEL_NAME}
- Similarity Metrics: Cosine, Euclidean, Manhattan
- Default Top-K: 5

Files Generated:
‚úì youtube_metadata.csv
‚úì youtube_data_with_transcripts.csv
‚úì youtube_video_index.parquet
‚úì youtube_video_metadata_final.csv

Features:
‚úì Semantic search using transformer embeddings
‚úì Multiple similarity metrics
‚úì Adjustable threshold filtering
‚úì Interactive Gradio interface
‚úì Embedded video previews

Next Steps:
- Fine-tune threshold values
- Add more evaluation queries
- Implement query expansion
- Add filters (date, duration, etc.)
"""

print(summary)

# Save summary
with open("project_summary.txt", "w") as f:
    f.write(summary)

print("\n‚úÖ Project complete! Summary saved to 'project_summary.txt'")


PROJECT SUMMARY

üìä QUERYTUBE AI - SEMANTIC SEARCH ENGINE

Dataset Statistics:
- Total Videos Collected: 48
- Videos with Transcripts: 31
- Embedding Dimension: 384

Model Configuration:
- Selected Model: all-MiniLM-L6-v2
- Similarity Metrics: Cosine, Euclidean, Manhattan
- Default Top-K: 5

Files Generated:
‚úì youtube_metadata.csv
‚úì youtube_data_with_transcripts.csv
‚úì youtube_video_index.parquet
‚úì youtube_video_metadata_final.csv

Features:
‚úì Semantic search using transformer embeddings
‚úì Multiple similarity metrics
‚úì Adjustable threshold filtering
‚úì Interactive Gradio interface
‚úì Embedded video previews

Next Steps:
- Fine-tune threshold values
- Add more evaluation queries
- Implement query expansion
- Add filters (date, duration, etc.)


‚úÖ Project complete! Summary saved to 'project_summary.txt'
