# Embed Academic Library with GTE-Qwen2-1.5B

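This notebook embeds your 1,260 library texts + 140 close readings with GTE-Qwen2-1.5B, a model with a 32K-token context window. To stay within GPU memory, texts longer than roughly 8K tokens are split into overlapping chunks whose embeddings are averaged.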

**Before running:**
1. Go to Runtime → Change runtime type → Select **A100 GPU** (or T4 if unavailable)
2. Run the SQL in `add_vector_search.sql` in your Supabase SQL Editor first

In [None]:
# Cell 1: Install dependencies + clear stale model cache
# transformers is constrained to >=4.44 and <5; sentence-transformers and torch are unpinned.
!pip install -q "transformers>=4.44,<5" sentence-transformers torch
# The Supabase client and its PostgREST layer are pinned to exact versions.
!pip install -q supabase==2.11.0 postgrest==0.19.0
# Remove cached dynamic-module code under the Alibaba* prefix so a stale copy
# is not reused.
!rm -rf ~/.cache/huggingface/modules/transformers_modules/Alibaba*
# Remove the cached hub entry for the model (it is re-downloaded when the model is loaded).
!rm -rf ~/.cache/huggingface/hub/models--Alibaba-NLP--gte-Qwen2-1.5B-instruct

In [None]:
# Cell 2: Configuration
import os
import sys
from getpass import getpass

# The project URL is not a secret; it can be overridden through the environment.
SUPABASE_URL = os.environ.get("SUPABASE_URL") or "https://zknmvifnbrycjwckkggy.supabase.co"

# SECURITY: a service_role key bypasses row-level security and must never be
# stored in a notebook. It is read from the SUPABASE_KEY environment variable,
# or entered through a hidden prompt when running interactively.
# The service_role key that used to be hard-coded in this cell must be treated
# as leaked: rotate it in the Supabase dashboard.
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "").strip()
_can_prompt = "ipykernel" in sys.modules or bool(
    getattr(sys.stdin, "isatty", lambda: False)()
)
if not SUPABASE_KEY and _can_prompt:
    try:
        SUPABASE_KEY = getpass("Supabase service_role key: ").strip()
    except (EOFError, OSError, NotImplementedError):
        # No usable stdin (e.g. headless execution): fall through to the warning.
        SUPABASE_KEY = ""
if not SUPABASE_KEY:
    print("⚠ SUPABASE_KEY is not set. Set the SUPABASE_KEY environment variable "
          "and re-run this cell before connecting to Supabase.")

MODEL_NAME = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
MAX_TOKENS = 32000  # Leave some buffer from 32768
EMBEDDING_DIM = 1536  # Must match the vector(1536) columns used for the embeddings

In [None]:
# Cell 3: Initialize Supabase client + fix vector column dimensions
import requests as http_requests
from supabase import create_client

# Build the client used by every later cell.
# NOTE(review): create_client presumably only constructs the client object, so
# the message below does not prove the URL/key are valid — confirm.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
print("Connected to Supabase")

# The embedding column was created for 768 dims but GTE-Qwen2-1.5B outputs 1536.
# Alter both tables to use the correct dimension.
# NOTE(review): the USING cast looks safe only while the existing embeddings are
# NULL; a stored 768-dim value presumably cannot be cast to vector(1536), so such
# rows would need to be set to NULL first — confirm against pgvector.
MIGRATION_SQL = """
ALTER TABLE library_texts
  ALTER COLUMN embedding TYPE vector(1536)
  USING embedding::vector(1536);

ALTER TABLE close_readings
  ALTER COLUMN embedding TYPE vector(1536)
  USING embedding::vector(1536);
"""

# Also need to recreate the search functions with the new dimension.
# Both functions return the rows closest to query_embedding, ordered by the
# `<=>` operator, and report `1 - (embedding <=> query_embedding)` as similarity.
# Rows whose embedding is NULL are excluded.
FUNCTION_SQL = """
CREATE OR REPLACE FUNCTION library_semantic_search(
  query_embedding vector(1536),
  match_count int DEFAULT 10
)
RETURNS TABLE (
  id uuid,
  file_name text,
  author text,
  title text,
  year int,
  category text,
  similarity float
)
LANGUAGE plpgsql
AS $$
BEGIN
  RETURN QUERY
  SELECT
    lt.id,
    lt.file_name,
    lt.author,
    lt.title,
    lt.year,
    lt.category,
    1 - (lt.embedding <=> query_embedding) AS similarity
  FROM library_texts lt
  WHERE lt.embedding IS NOT NULL
  ORDER BY lt.embedding <=> query_embedding
  LIMIT match_count;
END;
$$;

CREATE OR REPLACE FUNCTION readings_semantic_search(
  query_embedding vector(1536),
  match_count int DEFAULT 10
)
RETURNS TABLE (
  id uuid,
  file_name text,
  source_author text,
  source_title text,
  similarity float
)
LANGUAGE plpgsql
AS $$
BEGIN
  RETURN QUERY
  SELECT
    cr.id,
    cr.file_name,
    cr.source_author,
    cr.source_title,
    1 - (cr.embedding <=> query_embedding) AS similarity
  FROM close_readings cr
  WHERE cr.embedding IS NOT NULL
  ORDER BY cr.embedding <=> query_embedding
  LIMIT match_count;
END;
$$;
"""

print("Migrating vector columns from 768 → 1536 dimensions...")

# Both statements are sent to the same `exec_sql` RPC endpoint with the same
# credentials, so the request pieces are built once.
exec_sql_url = f"{SUPABASE_URL}/rest/v1/rpc/exec_sql"
exec_sql_headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
    "Content-Type": "application/json",
}

for sql in [MIGRATION_SQL, FUNCTION_SQL]:
    try:
        resp = http_requests.post(
            exec_sql_url,
            headers=exec_sql_headers,
            json={"query": sql},
            timeout=30,
        )
    except http_requests.RequestException as exc:
        # Network-level failure (DNS, timeout, TLS...): report it and still give
        # the user the SQL so the migration can be applied by hand.
        print(f"  Request failed: {exc}")
        print("\n⚠ If this failed, paste the SQL below into Supabase SQL Editor:\n")
        print(sql)
        continue

    if resp.status_code in (200, 204):
        print("  OK")
    elif resp.status_code == 404 or "does not exist" in resp.text:
        # exec_sql RPC doesn't exist — print SQL for manual execution.
        # A 404 is included because a missing function may be reported without
        # the words "does not exist" in the body.
        print("\n⚠ Cannot run SQL via API. Paste this into Supabase SQL Editor:\n")
        print(sql)
        print("=" * 60)
    else:
        # Any other non-success response is a real failure. The response body is
        # always shown: an error that merely mentions "1536" (for example a
        # dimension-mismatch message) must not be mistaken for success.
        print(f"  Response: {resp.status_code} {resp.text[:200]}")
        print("\n⚠ If this failed, paste the SQL below into Supabase SQL Editor:\n")
        print(sql)

In [None]:
# Cell 4: Load the embedding model using sentence-transformers
import torch
from sentence_transformers import SentenceTransformer

print(f"Loading {MODEL_NAME}...")
print("This may take a few minutes to download (~3GB)...")

# Decide where the model will live: the GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Qwen2 is natively supported in transformers 4.40+, so the model is loaded
# without executing any repository-provided code.
model = SentenceTransformer(MODEL_NAME, trust_remote_code=False)

# Cap the sequence length at the configured token budget, then place the model
# on the chosen device.
model.max_seq_length = MAX_TOKENS
model = model.to(device)

print(f"Model loaded on {device}")
print(f"Max sequence length: {model.max_seq_length}")

In [None]:
# Cell 5: Embedding function with mean-pooling for long texts
import numpy as np
import gc
import os

# Chunking parameters. A chunk is kept deliberately small so that it fits in
# A100 VRAM next to the model. Sizes are expressed in characters using the
# rule of thumb of 4 characters per token: 8K tokens -> 32K characters.
CHUNK_TOKENS = 8000
CHUNK_CHARS = 4 * CHUNK_TOKENS
# Number of characters shared between two consecutive chunks.
OVERLAP_CHARS = 1000

# Ask the PyTorch CUDA allocator to use expandable segments.
# NOTE(review): this is set after the model was already moved to the GPU in
# Cell 4, so the allocator may have read its configuration before this point —
# confirm whether it needs to be set before the first CUDA allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

def get_embedding(text: str) -> list:
    """Return a normalised embedding of ``text`` as a plain Python list.

    Texts estimated (at 4 characters per token) to fit within ``CHUNK_TOKENS``
    are encoded in a single call. Longer texts are cut into windows of
    ``CHUNK_CHARS`` characters that overlap by ``OVERLAP_CHARS``; each window
    is encoded on its own, the window embeddings are averaged, and the average
    is rescaled to unit length.
    """
    # Short-text fast path: one encode call, already normalised by the model.
    if len(text) // 4 <= CHUNK_TOKENS:
        return model.encode(text, normalize_embeddings=True).tolist()

    # Slide a fixed-size window over the text, stepping back by the overlap
    # each time, until a window reaches the end of the text.
    total_chars = len(text)
    pieces = []
    cursor = 0
    while cursor < total_chars:
        stop = min(cursor + CHUNK_CHARS, total_chars)
        pieces.append(text[cursor:stop])
        if stop >= total_chars:
            break
        cursor = stop - OVERLAP_CHARS

    # Encode the windows one by one (rather than as a batch) to avoid OOM,
    # releasing cached GPU memory between calls.
    vectors = []
    for piece in pieces:
        vectors.append(model.encode(piece, normalize_embeddings=True))
        torch.cuda.empty_cache()

    # Average the window embeddings, then rescale the result to unit length.
    pooled = np.stack(vectors, axis=0).mean(axis=0)
    pooled = pooled / np.linalg.norm(pooled)
    return pooled.tolist()

# Test: embed one short sentence and make sure the output size is what the
# database expects before any document is processed.
test_emb = get_embedding("This is a test sentence about poetry and cognition.")
print(f"Embedding dimension: {len(test_emb)}")
print(f"Chunk size: {CHUNK_TOKENS} tokens ({CHUNK_CHARS} chars)")

# EMBEDDING_DIM is the size the vector columns and search functions are
# declared with; a different size would make every later update fail.
if len(test_emb) != EMBEDDING_DIM:
    raise ValueError(
        f"Model returned {len(test_emb)}-dimensional embeddings but "
        f"EMBEDDING_DIM is {EMBEDDING_DIM}; the vector columns must match."
    )
print("Embedding function works!")

In [None]:
# Cell 6: Fetch all library texts
print("Fetching library texts...")

# Fetch in batches to avoid timeout. Only rows without an embedding are
# requested, so re-running the notebook resumes where it stopped.
all_library_texts = []
batch_size = 100
offset = 0

while True:
    # An explicit ordering is required for offset pagination: without it the
    # database may return rows in a different order for each page, which can
    # duplicate some rows and skip others.
    response = (
        supabase.table("library_texts")
        .select("id, file_name, content")
        .is_("embedding", "null")
        .order("id")
        .range(offset, offset + batch_size - 1)
        .execute()
    )

    # An empty page means every matching row has been read.
    if not response.data:
        break

    all_library_texts.extend(response.data)
    offset += batch_size
    print(f"  Fetched {len(all_library_texts)} texts...")

print(f"\nTotal library texts to embed: {len(all_library_texts)}")

In [None]:
# Cell 7: Fetch all close readings
print("Fetching close readings...")

# Only rows without an embedding are requested, so re-running the notebook
# resumes where it stopped.
all_readings = []
# Defined here as well so this cell does not depend on Cell 6 having been run.
batch_size = 100
offset = 0

while True:
    # An explicit ordering is required for offset pagination: without it the
    # database may return rows in a different order for each page, which can
    # duplicate some rows and skip others.
    response = (
        supabase.table("close_readings")
        .select("id, file_name, content")
        .is_("embedding", "null")
        .order("id")
        .range(offset, offset + batch_size - 1)
        .execute()
    )

    # An empty page means every matching row has been read.
    if not response.data:
        break

    all_readings.extend(response.data)
    offset += batch_size

print(f"Total close readings to embed: {len(all_readings)}")

In [None]:
# Cell 8: Embed library texts
from tqdm import tqdm
import gc

print(f"\nEmbedding {len(all_library_texts)} library texts...")
print("This will take a while. Progress is saved after each text.\n")

library_errors = []
# Counters so the final report accounts for every fetched row
# (embedded + skipped + errors == number of rows fetched).
library_embedded = 0
library_skipped = 0

for i, text_record in enumerate(tqdm(all_library_texts)):
    try:
        content = text_record.get("content", "")
        # Texts that are missing or shorter than 100 characters once stripped
        # are not embedded; they are counted so the skip is visible.
        if not content or len(content.strip()) < 100:
            library_skipped += 1
            continue

        # Get embedding
        embedding = get_embedding(content)

        # Update database immediately so an interrupted run loses no work.
        supabase.table("library_texts") \
            .update({"embedding": embedding}) \
            .eq("id", text_record["id"]) \
            .execute()
        library_embedded += 1

        # Clear GPU cache periodically
        if (i + 1) % 50 == 0:
            torch.cuda.empty_cache()
            gc.collect()

    except Exception as e:
        # Per-record boundary: one failing text (model or database error) is
        # recorded and reported, and the loop moves on to the next text.
        library_errors.append({"file": text_record.get("file_name"), "error": str(e)})
        print(f"\nError on {text_record.get('file_name')}: {e}")

print(f"\nLibrary embedding complete!")
print(f"Embedded: {library_embedded}")
print(f"Skipped (empty or under 100 characters): {library_skipped}")
print(f"Errors: {len(library_errors)}")
if library_errors:
    # Only the first ten errors are listed to keep the output readable.
    for err in library_errors[:10]:
        print(f"  - {err['file']}: {err['error']}")

In [None]:
# Cell 9: Embed close readings
print(f"\nEmbedding {len(all_readings)} close readings...\n")

reading_errors = []
# Counters so the final report accounts for every fetched row
# (embedded + skipped + errors == number of rows fetched).
readings_embedded = 0
readings_skipped = 0

for i, reading in enumerate(tqdm(all_readings)):
    try:
        content = reading.get("content", "")
        # Readings that are missing or shorter than 100 characters once
        # stripped are not embedded; they are counted so the skip is visible.
        if not content or len(content.strip()) < 100:
            readings_skipped += 1
            continue

        embedding = get_embedding(content)

        # Update database immediately so an interrupted run loses no work.
        supabase.table("close_readings") \
            .update({"embedding": embedding}) \
            .eq("id", reading["id"]) \
            .execute()
        readings_embedded += 1

        # Clear GPU cache periodically
        if (i + 1) % 50 == 0:
            torch.cuda.empty_cache()
            gc.collect()

    except Exception as e:
        # Per-record boundary: one failing reading (model or database error)
        # is recorded and reported, and the loop moves on to the next one.
        reading_errors.append({"file": reading.get("file_name"), "error": str(e)})
        print(f"\nError on {reading.get('file_name')}: {e}")

print(f"\nClose readings embedding complete!")
print(f"Embedded: {readings_embedded}")
print(f"Skipped (empty or under 100 characters): {readings_skipped}")
print(f"Errors: {len(reading_errors)}")
if reading_errors:
    # Only the first ten errors are listed to keep the output readable.
    for err in reading_errors[:10]:
        print(f"  - {err['file']}: {err['error']}")

In [None]:
# Cell 10: Verify embeddings
print("Verification:")

# Library texts: ask for an exact count of the rows whose embedding is set.
library_query = supabase.table("library_texts").select("id", count="exact")
lib_count = library_query.not_.is_("embedding", "null").execute()
print(f"Library texts with embeddings: {lib_count.count}")

# Close readings: same count, on the close_readings table.
readings_query = supabase.table("close_readings").select("id", count="exact")
read_count = readings_query.not_.is_("embedding", "null").execute()
print(f"Close readings with embeddings: {read_count.count}")

In [None]:
# Cell 11: Test semantic search
print("\nTesting semantic search...\n")

test_query = "the unity of thought and feeling in metaphysical poetry"

# This is an instruction-tuned retrieval model: search queries are meant to be
# encoded with the model's "query" prompt, while documents are encoded as-is.
# The prompt is used only when the loaded model actually defines it, so the
# cell still runs with a model that has no such prompt.
available_prompts = getattr(model, "prompts", None) or {}
query_prompt_name = "query" if "query" in available_prompts else None
query_embedding = model.encode(
    test_query,
    prompt_name=query_prompt_name,
    normalize_embeddings=True,
).tolist()

# Search library
results = supabase.rpc(
    "library_semantic_search",
    {"query_embedding": query_embedding, "match_count": 5}
).execute()

print(f"Query: '{test_query}'\n")
print("Top 5 library results:")
# `or []` keeps the loop safe when the RPC returns no rows.
for r in results.data or []:
    print(f"  [{r['similarity']:.3f}] {r['author']} - {r['title']}")

# Search readings
results = supabase.rpc(
    "readings_semantic_search",
    {"query_embedding": query_embedding, "match_count": 5}
).execute()

print("\nTop 5 close reading results:")
for r in results.data or []:
    print(f"  [{r['similarity']:.3f}] {r['source_author']} - {r['source_title']}")

## Done!

Your library and close readings now have semantic embeddings. 

Next: Update your MCP server to include semantic search tools that use these embeddings.