In [1]:
%pip install sqlite-vec

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sqlite3
import sqlite_vec

db = sqlite3.connect("../../data/cleaned_with_bge_m3.db")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

In [3]:
db.execute("SELECT vec_version()").fetchone()

('v0.1.6',)

In [3]:
# Get all tables in the database
tables = db.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

# Print the tables in a more readable format
print("Tables in the database:")
print("=" * 30)
for i, table in enumerate(tables, 1):
    print(f"{i}. {table[0]}")

Tables in the database:
1. users
2. publications
3. sqlite_sequence
4. publication_user_mapping
5. publications_vec_bge_m3_info
6. publications_vec_bge_m3_chunks
7. publications_vec_bge_m3_rowids
8. publications_vec_bge_m3_vector_chunks00
9. publications_vec_all_MiniLM_L6_v2_info
10. publications_vec_all_MiniLM_L6_v2_chunks
11. publications_vec_all_MiniLM_L6_v2_rowids
12. publications_vec_all_MiniLM_L6_v2_vector_chunks00
13. publications_vec_indobert_info
14. publications_vec_indobert_chunks
15. publications_vec_indobert_rowids
16. publications_vec_indobert_vector_chunks00
17. publications_vec_bge_m3
18. publications_vec_all_MiniLM_L6_v2
19. publications_vec_indobert


In [19]:
# tables = db.execute("SELECT publication_id FROM publications_vec_bge_m3;").fetchall()
# print(len(tables))
column_names = db.execute("PRAGMA table_info(publications_vec_bge_m3)").fetchall()
print(column_names)
tables = db.execute("SELECT * FROM publications_vec_bge_m3 LIMIT 1;").fetchall()
print(tables)
# tables = db.execute("SELECT * FROM users;").fetchall()
# print(len(tables))
# column_names = db.execute("PRAGMA table_info(users)").fetchall()
# print(column_names)
# for row in tables:
#     print(row)

[(0, 'publication_id', '', 1, None, 1), (1, 'embedding', '', 0, None, 0)]
[(1, b'\xe2\xc7f\xbc\x7f0\x889|W\x12\xbc\x15S\xe1<~\xf1*\xbc\xbb\x88\xca<a)\xcf;,\x93Q\xbc9\xc0\xc5\xbcP\xcb(<\x08\x9e\xc5\xbc\x82\x14\x97\xbb\x03{\xb1\xbc\t\xf8\xda\xba(\xcd0\xbd\xf9b\xb7<\xef\xce)=\xeb\xc6\xee\xbc\t\xe8*\xbd\x9a\xf71\xbd\xf4\x0eS<\xadrJ\xbc\x8a.\x93\xbb\xf0\x16`\xbd\xfa&\xb3\xbc\xd1\x18\xaf<\xdeN\x00\xbc\xb5\xb6\xcc\xbcS\xa3\x95\xbc\x82\xf2\x12\xbb<\xb8\x94\xbc\xfb\x85\x0f=\x99\xa23=o\x0f\xb1\xbbq\xa20\xbd\xd9\xe2!\xbdz\xf6\xcb:\x92^\x1b\xbd=\xed\xab\xbcX\xb29=\x0b\xbdw=\x0e\x12\x06\xbc\xfd$\xc2\xbb\xfe\x10\xcf\xbc\x85\x87\x81<\x93\xed\x81\xbd\x93\xa74<(\xb8\x10\xbdi.\xf9\xba\xb5\xd7\xdf<pCy;\xae\xe4\x85<S\x16\xca;q\x05y\xbdf\x18(\xbc`\x98S=\xe3d\xab\xbb\xaa\xc9j=\xcf\xddq\xbd]\xea\r<\xa0:\xb4\xbc\xcaCg<\x1e\xe0\xab\xbcR$\xa3;\xf9\xf9\x89<\xa3\xff\xcf;\xd8bj<I\xc0\xc1<\xe6\x01\x8c\xbc\xb4\xcd\xdb\xbc\xda\xbe\xbe;\xcb\'q=\xa0\x0f\xd7;&\xc6\xb1\xbcP\xffO\xbd\xbdE\xe8;\x8f\xf4\xe9\xbc\x80\xb4C\xbb

In [9]:
# Generate embeddings for the publications
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

print(model)

Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]


<FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder object at 0x000001ED3F9FEE50>


In [13]:
search_query = "generative ai"

# Generate embedding for the search query
if search_query.strip():
    query_embedding = model.encode([search_query])['dense_vecs'][0]
    
    # Perform KNN search to get top 100 matches
    cursor = db.execute('''
    SELECT publication_id, distance 
    FROM publications_vec_bge_m3 
    WHERE embedding MATCH ? 
    ORDER BY distance 
    LIMIT 100
    ''', [query_embedding])
    
    matches = cursor.fetchall()
    
    if matches:
        # Get publication details and authors for each match
        publication_ids = [str(match[0]) for match in matches]
        placeholders = ','.join(['?'] * len(publication_ids))
        
        query = f'''
        SELECT p.id, p.title, GROUP_CONCAT(u.name, '; ') as authors
        FROM publications p
        LEFT JOIN publication_user_mapping pum ON p.id = pum.publication_id
        LEFT JOIN users u ON pum.user_id = u.id
        WHERE p.id IN ({placeholders})
        GROUP BY p.id, p.title
        ORDER BY CASE p.id {' '.join([f'WHEN {pid} THEN {i}' for i, pid in enumerate(publication_ids)])} END
        '''
        
        cursor = db.execute(query, publication_ids)
        results = cursor.fetchall()
        
        # Compile top users
        user_counts = {}
        for _, _, users in results:
            if users:
                for author in users.split('; '):
                    author = author.strip()
                    if author:
                        user_counts[author] = user_counts.get(author, 0) + 1
        
        # Sort users by frequency
        top_users = sorted(user_counts.items(), key=lambda x: x[1], reverse=True)

        # Print results
        print(f"Top 100 matches for query: '{search_query}'")
        print("=" * 50)
        for i, (pub_id, title, users) in enumerate(results, 1):
            distance = next(match[1] for match in matches if match[0] == pub_id)
            print(f"{i}. [{pub_id}] {title}")
            print(f"   Users: {users if users else 'No users found'}")
            print(f"   Distance: {distance:.4f}")
            print()

        print("\nTop Users in Results:")
        print("=" * 30)
        for i, (user, count) in enumerate(top_users[:20], 1):
            print(f"{i}. {user} ({count} publications)")

    else:
        print("No matches found for the query.")
else:
    print("Search query is empty. Please set the search_query variable.")

Top 100 matches for query: 'generative ai'
1. [32721] Kecerdasan buatan dengan metode ID3 finite state machine dalam turn-based strategy game
   Users: Leo Willyanto Santoso; Liliana
   Distance: 0.8765

2. [37428] Pengembangan aplikasi chatbot menggunakan LLAMA 2 dan framework Flutter sebagai tools dalam studi Alkitab
   Users: Henry Novianus Palit; Alexander Setiawan
   Distance: 0.8946

3. [35918] Generator musik menggunakan generative adversarial network untuk generasi musik klasik
   Users: Henry Novianus Palit; Kartika Gunadi
   Distance: 0.9039

4. [41694] Kecerdasan buatan dengan metode ID3 finite state machine dalam turn-based tactics game
   Users: Leo Willyanto Santoso
   Distance: 0.9126

5. [37914] Implementasi kecerdasan buatan pada tahap design thinking dalam perancangan interior. Studi kasus mahasiswa Universitas Kristen Petra program studi Desain Interior
   Users: No users found
   Distance: 0.9368

6. [24858] Pembuatan AI untuk turn-based strategy games menggunakan m