# Thamanya AI Discovery - Data Pipeline (Zero-Dollar)

This notebook handles:
1.  Downloading audio from a YouTube video URL.
2.  Transcribing using OpenAI Whisper (Free GPU on Colab).
3.  Chunking the transcript into meaningful segments (at most about 500 characters each).
4.  Generating Embeddings using Gemini API.
5.  Upserting data to Supabase (pgvector).

In [None]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q yt-dlp
!pip install -q google-generativeai
!pip install -q supabase
!pip install -q pandas

In [None]:
import whisper
import yt_dlp
import google.generativeai as genai
from supabase import create_client, Client
from google.colab import userdata
import os

# --- CONFIGURATION ---
# Make sure to set these secrets in Colab (Keys -> Add new)
# SUPABASE_URL, SUPABASE_KEY, GEMINI_API_KEY

try:
    SUPABASE_URL = userdata.get('SUPABASE_URL')
    SUPABASE_KEY = userdata.get('SUPABASE_KEY')
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed, and report the
    # underlying reason so a missing secret can be told apart from other
    # failures.
    print("Secrets not found. Please set SUPABASE_URL, SUPABASE_KEY, GEMINI_API_KEY in Colab secrets.")
    print(f"Reason: {exc!r}")
    # Placeholder for local testing if needed
    SUPABASE_URL = ""
    SUPABASE_KEY = ""
    GEMINI_API_KEY = ""

# Both clients are configured at import time with whatever values were
# resolved above (possibly the empty placeholders).
genai.configure(api_key=GEMINI_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

print("Libraries loaded & Configured")

## 1. Download Audio from YouTube

In [None]:
def download_audio(url, output_filename="audio.mp3"):
    """Download a video's audio track with yt-dlp and convert it to MP3.

    Args:
        url: Video URL handed to yt-dlp.
        output_filename: Desired name of the MP3 file. A trailing ``.mp3``
            is removed before the name is used as yt-dlp's output template.

    Returns:
        The video title reported by yt-dlp, or ``'Unknown Title'`` when the
        metadata has no ``title`` entry.
    """
    # Strip only the trailing extension. The previous str.replace() removed
    # every '.mp3' occurrence, mangling names such as 'show.mp3.part2.mp3'.
    suffix = '.mp3'
    if output_filename.endswith(suffix):
        output_base = output_filename[:-len(suffix)]
    else:
        output_base = output_filename

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # NOTE(review): the template is passed without the extension,
        # presumably because the FFmpegExtractAudio post-processor writes the
        # final file with the '.mp3' extension itself -- confirm against the
        # yt-dlp documentation.
        'outtmpl': output_base,
        'quiet': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        title = info.get('title', 'Unknown Title')
        print(f"Downloaded: {title}")
        return title

# Example usage: set VIDEO_URL to the episode to ingest, then download its
# audio to "episode.mp3" and keep the title returned by download_audio.
VIDEO_URL = "https://www.youtube.com/watch?v=VIDEO_ID_HERE" # REPLACE ME
EPISODE_TITLE = download_audio(url=VIDEO_URL, output_filename="episode.mp3")

## 2. Transcribe with Whisper

In [None]:
# Load the Whisper checkpoint, transcribe the downloaded audio file and keep
# the per-segment results for the chunking step.
# Use 'medium' if 'large' is too slow or OOM
model = whisper.load_model("large")
result = model.transcribe("episode.mp3")
segments = result["segments"]
print(f"Transcribed {len(segments)} segments.")

## 3. Chunking & Embedding

In [None]:
def chunk_segments(segments, max_chars=500):
    """Merge consecutive transcript segments into chunks of bounded length.

    Segment texts are stripped and joined with single spaces. A new chunk is
    started whenever adding the next segment would push the joined text past
    ``max_chars``. A single segment longer than ``max_chars`` is kept whole
    as its own chunk rather than being split.

    Args:
        segments: Iterable of mappings with ``'text'``, ``'start'`` and
            ``'end'`` entries. Segments whose text is empty after stripping
            are ignored.
        max_chars: Maximum length of a chunk's joined text.

    Returns:
        A list of dicts with ``'content'`` (joined text), ``'start_time'``
        (``'start'`` of the chunk's first segment) and ``'end_time'``
        (``'end'`` of its last segment). Empty input yields an empty list.
    """
    chunks = []
    parts = []        # stripped texts of the chunk currently being built
    length = 0        # len(" ".join(parts)), tracked incrementally
    start_time = 0
    end_time = 0

    for seg in segments:
        text = seg['text'].strip()
        if not text:
            continue

        # A separating space is only needed when the chunk already has text,
        # so the limit applies to the final content for every chunk alike.
        joined_length = length + len(text) + (1 if parts else 0)

        # Flush only a non-empty chunk: an oversized first segment must not
        # emit an empty chunk (the old code failed there because end_time
        # had not been assigned yet).
        if parts and joined_length > max_chars:
            chunks.append({
                'content': " ".join(parts),
                'start_time': start_time,
                'end_time': end_time
            })
            parts = []
            joined_length = len(text)

        if not parts:
            start_time = seg['start']
        parts.append(text)
        length = joined_length
        end_time = seg['end']

    # Last chunk
    if parts:
        chunks.append({
            'content': " ".join(parts),
            'start_time': start_time,
            'end_time': end_time
        })
    return chunks

# Group the transcribed segments into chunks using the default size limit.
chunks = chunk_segments(segments=segments)
print(f"Created {len(chunks)} chunks.")

def get_embedding(text):
    """Return the Gemini embedding for a single piece of text.

    Args:
        text: Content to embed; sent as a retrieval document titled
            "Podcast Segment".

    Returns:
        The ``'embedding'`` entry of the ``genai.embed_content`` response.
    """
    # text-embedding-004 is recommended for lower cost/high perf
    request = {
        'model': "models/text-embedding-004",
        'content': text,
        'task_type': "retrieval_document",
        'title': "Podcast Segment",
    }
    response = genai.embed_content(**request)
    return response['embedding']

# Embed every chunk sequentially, attaching the vector to the chunk dict
# in place. (Batch this in production to avoid rate limits, but for a POC
# sequential is fine or small batches.)
import time

processed_chunks = []
total_chunks = len(chunks)
for done, chunk in enumerate(chunks, start=1):
    chunk['embedding'] = get_embedding(chunk['content'])
    processed_chunks.append(chunk)

    # Count from 1 so the message reflects chunks actually encoded (the old
    # counter printed "Encoded 0/N" after the first chunk) and always report
    # the final chunk so completion is visible.
    if done % 10 == 0 or done == total_chunks:
        print(f"Encoded {done}/{total_chunks}")

    # Pause after every 10 requests, except when nothing is left to send.
    if done % 10 == 0 and done < total_chunks:
        time.sleep(0.5) # simple rate limit handling

## 4. Upsert to Supabase

In [None]:
# Insert the episode row. This is a plain insert: nothing here checks whether
# the episode already exists, so re-running the cell issues another insert.
# NOTE(review): the literal string 'now()' is sent as the value; presumably
# the database resolves it to the insert time -- confirm.
episode_data = {
    'title': EPISODE_TITLE,
    'url': VIDEO_URL,
    'published_at': 'now()', # Or extract from metadata
    'metadata': {}
}

res = supabase.table('episodes').insert(episode_data).execute()
episode_id = res.data[0]['id']
print(f"Inserted Episode ID: {episode_id}")

# Build one row per embedded chunk, each linked to the episode inserted above.
db_rows = [
    {
        'episode_id': episode_id,
        'content': chunk['content'],
        'start_time': chunk['start_time'],
        'end_time': chunk['end_time'],
        'embedding': chunk['embedding']
    }
    for chunk in processed_chunks
]

# Send all rows in a single request; a failure is reported, not re-raised.
try:
    res = supabase.table('chunks').insert(db_rows).execute()
    print("Inserted chunks successfully!")
except Exception as e:
    print(f"Error inserting chunks: {e}")