In [1]:
%pip install sqlite-vec

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sqlite3
import sqlite_vec

# Open the cleaned publications database and register the sqlite-vec
# extension on this connection (provides the vec0 virtual table module and
# the vec_* SQL functions used in the cells below).
db = sqlite3.connect("../../data/cleaned.db")
db.enable_load_extension(True)
try:
    sqlite_vec.load(db)
finally:
    # Always switch extension loading back off, even when the load fails, so
    # the connection is never left able to load arbitrary shared libraries.
    db.enable_load_extension(False)

In [6]:
# Sanity check: ask the loaded extension for its version string.
version_cursor = db.execute("SELECT vec_version()")
version_cursor.fetchone()

('v0.1.6',)

In [None]:
# (Re)create the vector table: one 1024-dimensional float vector per
# publication, compared with cosine distance.
# NOTE: running this cell drops the table first, discarding every embedding
# already stored in publications_vec_bge_m3.
create_vec_table_sql = """
CREATE VIRTUAL TABLE publications_vec_bge_m3 USING vec0(
    publication_id INTEGER PRIMARY KEY,
    embedding FLOAT[1024] distance_metric=cosine
)
"""

for ddl_statement in (
    "DROP TABLE IF EXISTS publications_vec_bge_m3",
    create_vec_table_sql,
):
    db.execute(ddl_statement)

db.commit()

In [7]:
# Load the BGE-M3 model that is used to embed the publications
from FlagEmbedding import BGEM3FlagModel

# use_fp16=True speeds up computation with a slight performance (accuracy)
# degradation.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

print(model)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

<FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder object at 0x0000018E24F59B50>


In [12]:
# Embed every publication (title + abstract) with the BGE-M3 model and store
# the dense vector in publications_vec_bge_m3. Publications that already have
# a vector are skipped, so this cell can be re-run to resume an interrupted job.


def _publication_text(title, abstract):
    """Build the text to embed from whichever of title/abstract is present.

    Returns "title: abstract" when both are available, the single available
    field when only one is, and "" when both are missing (NULL or empty).
    This keeps the literal string "None" out of the embedded text.
    """
    fields = [str(field) for field in (title, abstract) if field is not None]
    return ": ".join(field for field in fields if field)


publications = db.execute("SELECT id, title, abstract FROM publications").fetchall()

total = len(publications)
progress_step = max(1, total // 100)  # report roughly every 1% of the rows
inserted = 0
already_present = 0
without_text = 0

for position, (pub_id, title, abstract) in enumerate(publications):
    # Progress is derived from the position in the result set rather than from
    # the id value, so it stays correct when ids have gaps or do not start at 1.
    if position % progress_step == 0:
        print(f"Processing publication {pub_id} ({(position / total * 100):.1f}%)")

    # Check if embedding already exists
    existing = db.execute(
        "SELECT 1 FROM publications_vec_bge_m3 WHERE publication_id = ?",
        (pub_id,)
    ).fetchone()
    if existing:
        already_present += 1
        continue

    text = _publication_text(title, abstract)
    if not text:
        # Neither a title nor an abstract: there is nothing meaningful to embed.
        without_text += 1
        continue

    # A single text is encoded, so take the first (only) dense vector.
    embedding = model.encode([text])['dense_vecs'][0]
    # The vec0 FLOAT[1024] column expects 32-bit floats. The model may hand
    # back half-precision values when use_fp16=True (TODO confirm for the
    # installed FlagEmbedding version); the cast leaves float32 data unchanged.
    embedding = embedding.astype("float32")

    # Insert into vector table
    db.execute(
        "INSERT INTO publications_vec_bge_m3 (publication_id, embedding) VALUES (?, ?)",
        (pub_id, embedding)
    )
    # Commit per row so an interruption loses at most the row in flight.
    db.commit()
    inserted += 1

# VACUUM cannot run inside an open transaction; make sure nothing is pending.
db.commit()

# vacuum
db.execute("VACUUM")
print(
    f"Inserted {inserted} embeddings into publications_vec_bge_m3 "
    f"({already_present} already present, {without_text} skipped for missing text)"
)

Processing publication 1 (0.0%)
Processing publication 455 (1.0%)
Processing publication 909 (2.0%)
Processing publication 1363 (3.0%)
Processing publication 1817 (4.0%)
Processing publication 2271 (5.0%)
Processing publication 2725 (6.0%)
Processing publication 3179 (7.0%)
Processing publication 3633 (8.0%)
Processing publication 4087 (9.0%)
Processing publication 4541 (10.0%)
Processing publication 4995 (11.0%)
Processing publication 5449 (12.0%)
Processing publication 5903 (13.0%)
Processing publication 6357 (14.0%)
Processing publication 6811 (15.0%)
Processing publication 7265 (16.0%)
Processing publication 7719 (17.0%)
Processing publication 8173 (18.0%)
Processing publication 8627 (19.0%)
Processing publication 9081 (20.0%)
Processing publication 9535 (21.0%)
Processing publication 9989 (22.0%)
Processing publication 10443 (22.9%)
Processing publication 10897 (23.9%)
Processing publication 11351 (24.9%)
Processing publication 11805 (25.9%)
Processing publication 12259 (26.9%)
Pr