In [1]:
import os, json, math
import pandas as pd
from typing import List, Dict
from supabase import create_client, Client
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'sentence_transformers'

In [3]:
# Read variables from the dev .env file into the process environment.
# This must run before the os.environ lookups below.
# NOTE(review): presumably this file defines SUPABASE_URL / SUPABASE_KEY — confirm.
from dotenv import load_dotenv
load_dotenv(dotenv_path="../secrets/.env.dev")
# Relative path of the input CSV; not read in this cell.
CSV_PATH = "tech_law_violations.csv"
# Required settings: indexing os.environ raises KeyError if either variable is unset.
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
# Supabase client built from the URL and key read above.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


In [None]:
# ---------- CHUNKING ----------
def simple_sentence_split(text: str) -> List[str]:
    """Split ``text`` into sentences with a very light regex heuristic.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. The punctuation stays attached to its sentence,
    each piece is stripped, and empty pieces are dropped. Being a heuristic,
    it also splits after abbreviations such as "e.g."; swap in spacy / nltk
    if you need something more accurate.

    Args:
        text: Text to split. ``None`` or an empty string is treated as
            "no sentences".

    Returns:
        The sentences in their original order; ``[]`` when ``text`` is
        ``None``, empty, or whitespace only.
    """
    # Imported locally because the notebook's import cell never imports `re`;
    # without this the function raises NameError on a fresh kernel.
    import re

    if not text:
        return []
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

def chunk_text(text: str, target_tokens=300, overlap_sents=1, tokenizer=None) -> List[str]:
    """Greedily pack the sentences of ``text`` into chunks of roughly ``target_tokens``.

    Sentences come from ``simple_sentence_split``. They are appended to the
    current chunk while the running token count stays within
    ``target_tokens``. When the next sentence would overflow, the current
    chunk is emitted and a new one is started from the last ``overlap_sents``
    sentences of the emitted chunk plus the overflowing sentence.

    ``target_tokens`` is a soft limit: a single sentence longer than the
    target is emitted as its own chunk, and a chunk seeded with overlap
    sentences may exceed the target.

    Args:
        text: Text to chunk.
        target_tokens: Soft upper bound on tokens per chunk.
        overlap_sents: Number of trailing sentences repeated at the start of
            the next chunk. ``0`` (or a negative value) means no overlap.
        tokenizer: Optional object with a ``tokenize(str)`` method used to
            count tokens. When ``None``, whitespace-separated words are
            counted instead.

    Returns:
        Chunk strings in order, each made of sentences joined by one space.
    """
    def count_tokens(sentence: str) -> int:
        # crude token count if no tokenizer supplied
        if tokenizer is None:
            return len(sentence.split())
        return len(tokenizer.tokenize(sentence))

    # `cur[-0:]` is the WHOLE list, so a zero overlap must be special-cased;
    # otherwise overlap_sents=0 would repeat the entire previous chunk.
    keep = max(overlap_sents, 0)

    chunks: List[str] = []
    cur: List[str] = []          # sentences of the chunk being built
    cur_counts: List[int] = []   # token count of each sentence in `cur`
    cur_len = 0                  # always equal to sum(cur_counts)

    for s in simple_sentence_split(text):
        t = count_tokens(s)
        if cur_len + t <= target_tokens:
            cur.append(s)
            cur_counts.append(t)
            cur_len += t
        elif cur:
            chunks.append(" ".join(cur))
            # overlap some sentences, reusing their cached token counts
            # rather than tokenizing them a second time
            tail = cur[-keep:] if keep else []
            tail_counts = cur_counts[-keep:] if keep else []
            cur = tail + [s]
            cur_counts = tail_counts + [t]
            cur_len = sum(cur_counts)
        else:
            # a single sentence longer than the target becomes its own chunk
            chunks.append(s)

    if cur:
        chunks.append(" ".join(cur))
    return chunks

# ---------- EMBEDDING ----------
# Hugging Face model id used for both the tokenizer and the model loaded below.
MODEL_ID = "Qwen/Qwen3-Embedding-8B"  # HF
# Flag describing the vector width the database expects; it is not read in this cell.
# NOTE(review): presumably consumed later to pick 4096-dim vs 2000-dim vectors — confirm.
USE_4096 = True   # set False if your DB uses VECTOR(2000)

# Load model as per the model card
from transformers import AutoTokenizer, AutoModel
import torch, torch.nn.functional as F
# Pad on the left so the last position of every sequence holds its final token,
# which is the case last_token_pool checks for first.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
# No dtype or device arguments are passed, so the library defaults apply.
# NOTE(review): confirm the memory footprint of this model is acceptable on the target machine.
model = AutoModel.from_pretrained(MODEL_ID)

def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Pick, for every sequence in the batch, the hidden state of its last real token.

    Adapted from the model card usage example.

    Args:
        last_hidden_states: Hidden states indexed as (batch, position, ...).
        attention_mask: Mask indexed as (batch, position); non-zero marks a
            real token.

    Returns:
        One hidden-state vector per batch row.
    """
    # If the mask is set in the final column of every row, the batch is
    # left-padded (or unpadded) and the last position is the last token.
    if attention_mask[:, -1].sum() == attention_mask.shape[0]:
        return last_hidden_states[:, -1]

    # Right-padded batch: the last real token sits at (mask row sum - 1).
    last_real_pos = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, last_real_pos]

def embed_texts(texts: List[str], batch_size=16, max_length=8192) -> np.ndarray:
    """Embed ``texts`` with the module-level ``model`` / ``tokenizer``.

    Texts are tokenized in batches (padded, truncated to ``max_length``),
    run through the model without gradients, pooled with
    ``last_token_pool`` and L2-normalized, so a dot product between two rows
    is their cosine similarity.

    Args:
        texts: Strings to embed. A single bare string is treated as a
            one-element list, and any iterable of strings (tuple, pandas
            Series, generator) is accepted.
        batch_size: Number of texts per forward pass; must be >= 1.
        max_length: Maximum tokens per text; longer inputs are truncated.

    Returns:
        Array with one row per input text, in input order. For empty input
        an array of shape ``(0, hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string would otherwise be sliced character-wise by the batching
    # loop; materializing also lets non-list iterables be sliced by index.
    texts = [texts] if isinstance(texts, str) else list(texts)

    model.eval()
    if not texts:
        # np.vstack([]) raises ValueError, so return an empty matrix instead.
        # NOTE(review): float32 assumes the default-precision model load — adjust
        # if the model is loaded in another dtype.
        dim = getattr(model.config, "hidden_size", 0)
        return np.empty((0, dim), dtype=np.float32)

    vecs = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            toks = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            # move the inputs to whichever device the model lives on
            toks = {k: v.to(model.device) for k, v in toks.items()}
            out = model(**toks)
            pooled = last_token_pool(out.last_hidden_state, toks["attention_mask"])
            pooled = F.normalize(pooled, p=2, dim=1)  # cosine-friendly
            vecs.append(pooled.cpu().numpy())
    return np.vstack(vecs)
