In [1]:
%pip install sqlite-vec

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install deep-translator

Note: you may need to restart the kernel to use updated packages.


In [1]:
import sqlite3
import sqlite_vec

db = sqlite3.connect("../../data/cleaned_with_bge_m3.db")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

In [2]:
db.execute("SELECT vec_version()").fetchone()

('v0.1.6',)

In [5]:
# Get column information for the publications_vec_indobert table
cursor = db.execute("PRAGMA table_info(publications_vec_indobert)")
columns = cursor.fetchall()

# Print column information
print("Columns in publications_vec_indobert table:")
for col in columns:
    col_id, name, type_name, not_null, default_val, primary_key = col
    print(f"- {name} (Type: {type_name}, Primary Key: {'Yes' if primary_key else 'No'})")

# Get a sample row to see actual data (limit to 1 row)
try:
    sample = db.execute("SELECT * FROM publications_vec_indobert LIMIT 1").fetchone()
    if sample:
        print("\nSample data (first row):")
        for i, col in enumerate(columns):
            col_name = col[1]
            value = sample[i]
            # For embedding vector, just show length or first few elements
            if col_name == "embedding" and value is not None:
                print(f"- {col_name}: Vector with {len(value)} elements")
                print(f"  First few elements: {value[:5]}...")
            else:
                print(f"- {col_name}: {value}")
    else:
        print("\nNo data in the table.")
except Exception as e:
    print(f"\nError fetching sample: {e}")

Columns in publications_vec_indobert table:
- publication_id (Type: , Primary Key: Yes)
- embedding (Type: , Primary Key: No)

Sample data (first row):
- publication_id: 1
- embedding: Vector with 3072 elements
  First few elements: b'cVr\xbf\xb7'...


In [9]:
# Find publications without embeddings in publications_vec_indobert
query = """
SELECT p.id, p.title, p.abstract 
FROM publications p
LEFT JOIN publications_vec_indobert v ON p.id = v.publication_id
WHERE v.publication_id IS NULL
ORDER BY p.id
"""

missing_embeddings = db.execute(query).fetchall()

# Print the count and details of publications without embeddings
print(f"Found {len(missing_embeddings)} publications without embeddings in publications_vec_indobert table")

# Print first 10 as samples
for i, (pub_id, title, abstract) in enumerate(missing_embeddings[:10]):
    print(f"\nPublication ID: {pub_id}")
    print(f"Title: {title}")
    print(f"Abstract: {abstract[:200]}..." if abstract and len(abstract) > 200 else f"Abstract: {abstract}")
    
    if i >= 9 and len(missing_embeddings) > 10:
        print(f"\n... and {len(missing_embeddings) - 10} more publications without embeddings")

Found 5 publications without embeddings in publications_vec_indobert table

Publication ID: 559
Title: Kartu ucapan "selamat ulang tahun", penutup roda mobil minuman ringan "jelly", leaflet mobil "BMW", poster festival musik "pentas musik'98", .......
Abstract: None

Publication ID: 727
Title: Audit sumber daya manusia pada departemen produksi PT. Mustika Bahana Jaya di Lumajang
Abstract: Penelitian mengenai Audit Sumber Daya Manusia Pada Departemen Produksi PT. Mustika Bahana Jaya di Lumajang bertujuan untuk mengetahui pelaksanaan fungsi SDM yang meliputi: perencanaan tenaga kerja, re...

Publication ID: 750
Title: Pembuatan perangkat lunak image morphing dengan menggunakan algoritma featured based image metamorphosis
Abstract: Seiring dengan perkembangan teknologi pengolahan citra digital dan animasi dewasa ini, diperlukan adanya sebuah perangkat lunak yang dapat membantu pembuatan animasi. Sehingga dalam membuat sebuah ani...

Publication ID: 1026
Title: Perancangan komik bertema ce

In [None]:
# db.execute("DROP TABLE IF EXISTS publications_vec_indobert")
# db.execute("""
# CREATE VIRTUAL TABLE publications_vec_indobert USING vec0(
#     publication_id INTEGER PRIMARY KEY,
#     embedding FLOAT[768] distance_metric=cosine
# )
# """)
# db.commit()

In [11]:
from sentence_transformers import SentenceTransformer
from deep_translator import GoogleTranslator

model = SentenceTransformer('rahmanfadhil/indobert-finetuned-indonli')

print(model)
print(model.get_sentence_embedding_dimension())

  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformer(
  (0): Transformer({'max_seq_length': 75, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
768


In [7]:
# Query to get all tables in the database
tables = db.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()

# Print the table names
print("Tables in the database:")
for table in tables:
    print(f"- {table[0]}")

# Also get the virtual tables
virtual_tables = db.execute("SELECT name FROM sqlite_master WHERE type='table' AND sql LIKE '%VIRTUAL TABLE%'").fetchall()
if virtual_tables:
    print("\nVirtual tables:")
    for table in virtual_tables:
        print(f"- {table[0]}")

Tables in the database:
- users
- publications
- sqlite_sequence
- publication_user_mapping
- publications_vec_bge_m3_info
- publications_vec_bge_m3_chunks
- publications_vec_bge_m3_rowids
- publications_vec_bge_m3_vector_chunks00
- publications_vec_all_MiniLM_L6_v2_info
- publications_vec_all_MiniLM_L6_v2_chunks
- publications_vec_all_MiniLM_L6_v2_rowids
- publications_vec_all_MiniLM_L6_v2_vector_chunks00
- publications_vec_bge_m3
- publications_vec_all_MiniLM_L6_v2
- publications_vec_indobert
- publications_vec_indobert_info
- publications_vec_indobert_chunks
- publications_vec_indobert_rowids
- publications_vec_indobert_vector_chunks00

Virtual tables:
- publications_vec_bge_m3
- publications_vec_all_MiniLM_L6_v2
- publications_vec_indobert


In [8]:
# print(model.encode(["Hello world!"])[:10])
# print("haha")
# print(model.encode("Hello world!")[:10])

print(model.encode(["Hello world!"]).shape)
print("haha")
print(model.encode("Hello world!").shape)
# print(model.encode("Hello world!")[0])

(1, 768)
haha
(768,)


In [14]:
# Get publications data and generate embeddings
publications = db.execute("SELECT id, title, abstract FROM publications").fetchall()

# Create translator instance once outside the loop
translator = GoogleTranslator(source='auto', target='indonesian')

for pub_id, title, abstract in publications:
    # Print progress every 1% processed
    if (pub_id - 1) % max(1, len(publications) // 100) == 0:
        print(f"Processing publication {pub_id} ({((pub_id - 1) / len(publications) * 100):.1f}%)")
        
    # Check if embedding already exists
    existing = db.execute(
        "SELECT 1 FROM publications_vec_indobert WHERE publication_id = ?", 
        (pub_id,)
    ).fetchone()

    if existing:
        continue
    
    # Create merged string
    text = f"{title}: {abstract}"
    
    # Translate text to Indonesian
    try:
        translated_text = translator.translate(text)
        # print(f"Translated text for publication {pub_id}")
        
        print(translated_text[:1000])  # Print first 1000 characters of translated text for debugging
        
        # Generate embedding for the translated text
        embedding = model.encode(translated_text)
        
        print(f"Generated embedding for publication {pub_id} with shape {embedding.shape}")
        
        # Insert into vector table
        db.execute(
            "INSERT INTO publications_vec_indobert (publication_id, embedding) VALUES (?, ?)",
            (pub_id, embedding)
        )
        
        print(f"Inserted embedding for publication {pub_id}")
        
        db.commit()
        
    except Exception as e:
        print(f"Translation error for publication {pub_id}: {e}")
        print(f"Skipping embedding generation for publication {pub_id}")
        # Skip embedding generation if translation fails
        continue

db.commit()

# vacuum
db.execute("VACUUM")
print(f"Inserted embeddings into publications_vec_indobert")

Processing publication 1 (0.0%)
Processing publication 455 (1.0%)
Processing publication 909 (2.0%)
Processing publication 1363 (3.0%)
Processing publication 1817 (4.0%)
Processing publication 2271 (5.0%)
Processing publication 2725 (6.0%)
Processing publication 3179 (7.0%)
Processing publication 3633 (8.0%)
Processing publication 4087 (9.0%)
Processing publication 4541 (10.0%)
Processing publication 4995 (11.0%)
Processing publication 5449 (12.0%)
Processing publication 5903 (13.0%)
Processing publication 6357 (14.0%)
Processing publication 6811 (15.0%)
Processing publication 7265 (16.0%)
Processing publication 7719 (17.0%)
Processing publication 8173 (18.0%)
Processing publication 8627 (19.0%)
Processing publication 9081 (20.0%)
Processing publication 9535 (21.0%)
Processing publication 9989 (22.0%)
Processing publication 10443 (22.9%)
Processing publication 10897 (23.9%)
Processing publication 11351 (24.9%)
Processing publication 11805 (25.9%)
Processing publication 12259 (26.9%)
Pr