In [1]:
%pip install sqlite-vec

Collecting sqlite-vec
  Downloading sqlite_vec-0.1.6-py3-none-win_amd64.whl.metadata (198 bytes)
Downloading sqlite_vec-0.1.6-py3-none-win_amd64.whl (281 kB)
Installing collected packages: sqlite-vec
Successfully installed sqlite-vec-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [5]:
import sqlite3
import sqlite_vec

# Location of the SQLite database, relative to the notebook's working directory.
DB_PATH = "../../data/cleaned.db"

# Open through a URI with mode=rw: a plain sqlite3.connect(path) silently creates
# an empty database when the file does not exist, which would only surface later
# as a confusing "no such table" error. With mode=rw a missing file raises
# sqlite3.OperationalError right here instead.
db = sqlite3.connect(f"file:{DB_PATH}?mode=rw", uri=True)

# Extension loading is enabled only for as long as it takes to load sqlite-vec,
# and is switched off again even if loading fails.
db.enable_load_extension(True)
try:
    sqlite_vec.load(db)
finally:
    db.enable_load_extension(False)

In [6]:
# Sanity check: ask the loaded sqlite-vec extension for its version. The query
# only succeeds if the extension was registered on this connection.
vec_version_row = db.execute("SELECT vec_version()").fetchone()
vec_version_row

('v0.1.6',)

In [7]:
# Load the BGE-M3 embedding model. No embeddings are produced in this cell; the
# model is used further down to encode the search query.
from FlagEmbedding import BGEM3FlagModel

MODEL_NAME = 'BAAI/bge-m3'

# use_fp16=True speeds up computation with a slight performance degradation.
model = BGEM3FlagModel(MODEL_NAME, use_fp16=True)

print(model)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

<FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder object at 0x000002BB90064800>


In [20]:
search_query = "generative ai"

# Tunables for this search.
TOP_K = 100           # number of nearest neighbours requested from the vector index
TOP_USERS_SHOWN = 20  # how many authors to list in the leaderboard

if search_query.strip():
    # Encode the query and keep its dense vector. sqlite-vec interprets a raw
    # BLOB as packed float32 values, so the vector is cast explicitly; the model
    # was created with use_fp16=True and may otherwise hand back float16.
    # NOTE(review): assumes 'dense_vecs' is a NumPy array -- confirm.
    query_embedding = model.encode([search_query])['dense_vecs'][0].astype('float32')

    # KNN search. `k = ?` is sqlite-vec's own way of bounding a KNN query and,
    # unlike LIMIT, does not depend on the SQLite version in use.
    matches = db.execute(
        '''
        SELECT publication_id, distance
        FROM publications_vec_bge_m3
        WHERE embedding MATCH ?
          AND k = ?
        ORDER BY distance
        ''',
        [query_embedding, TOP_K],
    ).fetchall()

    if matches:
        # Map each publication id to its distance. dicts preserve insertion
        # order, so the keys stay in nearest-first order; setdefault keeps the
        # closest hit should an id ever appear twice.
        distance_by_id = {}
        for matched_id, matched_distance in matches:
            distance_by_id.setdefault(matched_id, matched_distance)
        publication_ids = list(distance_by_id)

        # Only '?' placeholders are interpolated into the SQL text; the ids
        # themselves are always passed as bound parameters.
        placeholders = ','.join('?' * len(publication_ids))
        query = f'''
        SELECT p.id, p.title, GROUP_CONCAT(u.name, '; ') as authors
        FROM publications p
        LEFT JOIN publication_user_mapping pum ON p.id = pum.publication_id
        LEFT JOIN users u ON pum.user_id = u.id
        WHERE p.id IN ({placeholders})
        GROUP BY p.id, p.title
        '''
        rows_by_id = {row[0]: row for row in db.execute(query, publication_ids)}

        # Restore nearest-first order in Python instead of a generated
        # ORDER BY CASE; ids without a row in `publications` are skipped.
        results = [rows_by_id[pub_id] for pub_id in publication_ids if pub_id in rows_by_id]

        # Count how many of the matched publications each author appears on.
        user_counts = {}
        for _, _, users in results:
            if users:
                # Split on the same separator GROUP_CONCAT used above.
                for author in users.split('; '):
                    author = author.strip()
                    if author:
                        user_counts[author] = user_counts.get(author, 0) + 1

        # Most frequent first; the stable sort keeps first-seen order among ties.
        top_users = sorted(user_counts.items(), key=lambda item: item[1], reverse=True)

        # Report the real number of hits, which can be lower than TOP_K.
        print(f"Top {len(results)} matches for query: '{search_query}'")
        print("=" * 50)
        for i, (pub_id, title, users) in enumerate(results, 1):
            print(f"{i}. [{pub_id}] {title}")
            print(f"   Users: {users if users else 'No users found'}")
            print(f"   Distance: {distance_by_id[pub_id]:.4f}")
            print()

        print("\nTop Users in Results:")
        print("=" * 30)
        for i, (user, count) in enumerate(top_users[:TOP_USERS_SHOWN], 1):
            print(f"{i}. {user} ({count} publications)")

    else:
        print("No matches found for the query.")
else:
    print("Search query is empty. Please set the search_query variable.")

Top 100 matches for query: 'generative ai'
1. [32721] Kecerdasan buatan dengan metode ID3 finite state machine dalam turn-based strategy game
   Users: Leo Willyanto Santoso; Liliana
   Distance: 0.8765

2. [37428] Pengembangan aplikasi chatbot menggunakan LLAMA 2 dan framework Flutter sebagai tools dalam studi Alkitab
   Users: Henry Novianus Palit; Alexander Setiawan
   Distance: 0.8946

3. [35918] Generator musik menggunakan generative adversarial network untuk generasi musik klasik
   Users: Henry Novianus Palit; Kartika Gunadi
   Distance: 0.9039

4. [41694] Kecerdasan buatan dengan metode ID3 finite state machine dalam turn-based tactics game
   Users: Leo Willyanto Santoso
   Distance: 0.9126

5. [37914] Implementasi kecerdasan buatan pada tahap design thinking dalam perancangan interior. Studi kasus mahasiswa Universitas Kristen Petra program studi Desain Interior
   Users: No users found
   Distance: 0.9368

6. [24858] Pembuatan AI untuk turn-based strategy games menggunakan m