In [1]:
import mysql.connector
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import sqlite3

# Import our custom modules
from thesis_search_engine_sqlite import SearchEngine
# from search_utils import get_embedding, print_formatted_results

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
db_path = '../../data/cleaned_with_bge_m3.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [8]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
# Create a search engine manager instance
search_manager = SearchEngine(model=model, cursor=cursor)

# Load the search engine (first time will load from database)
try:
    search_engine = search_manager.load(use_title=True, use_abstract=False)
    print("\nSearch engine ready for queries!")
except Exception as e:
    print(f"Error initializing search engine: {e}")

Creating new search engine instance...
Using title embeddings (iniLM-L6-v2)
Error loading search engine: no such table: dewey_papers

Search engine ready for queries!


In [None]:
# Cell to reload the index with different parameters if needed

# Example usage (commented out to prevent accidental execution)
'''
# Reload with different parameters
search_engine = search_manager.reload(use_title=False, use_abstract=True, limit=1000)

# Check that it worked
if search_engine:
    print(f"Successfully reloaded search engine")
    # Run a test search
    results = search_engine.search("machine learning", top_k=2)
    print_formatted_results(results)
'''

'\n# Reload with different parameters\nsearch_engine = reload_search_engine(use_title=False, use_abstract=True, limit=1000)\n\n# Check that it worked\nif search_engine:\n    print(f"Successfully reloaded search engine")\n    # Run a test search\n    results = search_engine.search("machine learning", top_k=2)\n    print_formatted_results(results)\n'

In [6]:
# Quick search cell - run this for instant searches using the cached engine

# Examples - uncomment and run any of these searches

# Basic search
results = search_manager.quick_search("diabetes detection with machine learning", top_k=10)

# # Search without abstracts (more compact output)
# results = search_manager.quick_search("natural language processing", top_k=5, show_abstract=False)

'''
# Search by author and topic
filtered_results = search_manager.search_by_people_and_topic(
    "artificial intelligence", 
    author_name="Johnson",  # Replace with a name in your database
    top_k=3
)
'''

Searching for: 'diabetes detection with machine learning'

Result 1: Perbandingan algoritma Naive-Bayes, K-NN dan Decision Tree dalam pengklasifikasian data penyakit diabetes
  Similarity: 0.5666
  Distance: 0.7650
  Authors: NICOLAS OWEN
  Contributors:
    Advisor 1: Alexander Setiawan
    Advisor 2: Henry Novianus Palit, S.Kom., M.Kom., Ph.D.
    Examination Committee 1: Agustinus Noertjahyana
    Examination Committee 2: Rolly Intan
  Abstract: Diabetes adalah kondisi kronis yang memiliki dampak serius pada kesehatan. Kondisi ini dapat menyebabkan kerusakan pada organ tubuh seperti mata, ginjal, saraf, serta memengaruhi jantung dan pembuluh darah. Akibatnya, risiko terkena stroke, serangan jantung, gangguan penglihatan, amputasi, dan ga...

Result 2: Perancangan buku interaktif pencegahan diabetes melitus pada anak-anak
  Similarity: 0.4966
  Distance: 1.0136
  Authors: JESSICA W WILIANTO
  Contributors:
    Advisor 1: Andrian Dektisa Hagijanto
    Advisor 2: Jacky Cahyadi
    Exam

'\n# Search by author and topic\nfiltered_results = search_manager.search_by_people_and_topic(\n    "artificial intelligence", \n    author_name="Johnson",  # Replace with a name in your database\n    top_k=3\n)\n'

In [None]:
# Demonstration of using the SearchEngine class

# Get the cached search engine or load if not already loaded
try:
    # Use the cached search engine
    search_engine = search_manager.load(use_title=True, use_abstract=False)
    
    if search_engine is not None:
        # Example queries
        example_queries = [
            "natural language processing in healthcare",
            # "machine learning for computer vision",
            # "data mining techniques",
            # "information retrieval systems",
            # "neural networks for image classification"
        ]
        
        # Run a sample search
        print("\nSample search results:")
        for query in example_queries[:1]:  # Just show results for first query
            print(f"\nQuery: '{query}'")
            # Use the manager for searching
            search_manager.quick_search(query, top_k=3)
        
        print("\nTo search for papers similar to your query:")
        print("results = search_manager.quick_search('your query here', top_k=5)")
    
except Exception as e:
    print(f"Error using the search engine: {e}")
    print("Make sure you have papers with embeddings in your database.")


Using title embeddings (iniLM-L6-v2)
Loaded 41597 papers with embeddings
Loaded 41597 papers with embeddings
Found authors for 41597 out of 41597 papers
Found authors for 41597 out of 41597 papers
Found contributors for 41597 out of 41597 papers
Built FAISS index with 41597 vectors of dimension 384

Sample search results:

Query: 'neural networks for image classification'

Result 1: Improving backpropagation training time and its generalization using pruning
  Similarity: 0.4673
  Authors: DANIEL BUDIONO
  Contributors:
    Advisor 1: LILIANA
    Examination Committee 1: Gregorius Satiabudhi
  Abstract: Beberapa tahun terakhir, banyak algoritma jaringan syaraf tiruan uang dikembangkan untuk klasifikasi pola. Salah satu algoritma yang populer adalah backpropagation. Akan tetapi, menentukan besarnya suatu jaringan backpropagation adalah suatu masalah yang sangat sulit. Jumlah hidden unit yang terlalu banyak akan menyebabkan jaringan terlalu menghafal data training dan kurang generalisasi