In [35]:
import mysql.connector
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [36]:
database = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
)
cursor = database.cursor()
cursor.execute("USE nlp_thesis_similarity")

In [37]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [38]:
import numpy as np
import faiss
import json
from typing import List, Dict, Any, Optional, Tuple, Union

def get_embedding(text, model):
    # Handle None values or empty strings
    if text is None or text == "":
        return None
    # Convert text to embedding using the provided model
    return model.encode(text).tolist()

class ThesisSimilaritySearch:
    def __init__(self, model, use_title=False, use_abstract=True):
        """
        Initialize the similarity search engine
        
        Args:
            model: The SentenceTransformer model to use for encoding queries
            use_title: Whether to use title embeddings for similarity search
            use_abstract: Whether to use abstract embeddings for similarity search
        """
        self.model = model
        self.use_title = use_title
        self.use_abstract = use_abstract
        
        # Model name is used in column names
        self.model_name = model._modules['0'].auto_model.config.name_or_path
        
        # These will be populated when load_index is called
        self.index = None
        self.paper_ids = []
        self.paper_metadata = {}
    
    def load_index(self, cursor, limit=None):
        """
        Load embeddings from database and build FAISS index
        
        Args:
            cursor: MySQL database cursor
            limit: Optional limit on number of papers to load
        """
        # Determine which embedding column to use
        if self.use_abstract and not self.use_title:
            embedding_column = f"abstract_embeddings_{self.model_name[22:]}"  # Remove 'sentence-transformers/' prefix
            print(f"Using abstract embeddings ({embedding_column[22:]})")
        elif self.use_title and not self.use_abstract:
            embedding_column = f"title_embeddings_{self.model_name[22:]}"  # Remove 'sentence-transformers/' prefix
            print(f"Using title embeddings ({embedding_column[22:]})")
        else:
            # Default to abstract if both or neither are specified
            embedding_column = f"abstract_embeddings_{self.model_name[22:]}"  # Remove 'sentence-transformers/' prefix
            print(f"Using abstract embeddings ({embedding_column[22:]})")
        
        # Build query with limit if provided
        query = f"""
        SELECT id, title, abstract, `{embedding_column}`
        FROM dewey_papers 
        WHERE `{embedding_column}` IS NOT NULL
        """
        if limit:
            query += f" LIMIT {limit}"
            
        # Execute query
        cursor.execute(query)
        papers = cursor.fetchall()
        
        if not papers:
            raise ValueError(f"No papers found with {embedding_column} not null")
            
        print(f"Loaded {len(papers)} papers with embeddings")
        
        # Extract data for index building
        self.paper_ids = []
        self.paper_metadata = {}
        embeddings = []
        
        for paper in papers:
            paper_id = paper[0]
            title = paper[1]
            abstract = paper[2]
            embedding_json = paper[3]
            
            # Skip papers with null embeddings
            if embedding_json is None:
                continue
                
            # Parse JSON embedding if it's a string
            if isinstance(embedding_json, str):
                try:
                    embedding = json.loads(embedding_json)
                except json.JSONDecodeError:
                    print(f"Error decoding embedding for paper {paper_id}")
                    continue
            else:
                embedding = embedding_json
                
            # Add to our collections
            self.paper_ids.append(paper_id)
            self.paper_metadata[paper_id] = {
                'id': paper_id,
                'title': title,
                'abstract': abstract,
                'authors': [],     # Initialize empty authors list to be filled later
                'contributors': []  # Initialize empty contributors list to be filled later
            }
            embeddings.append(embedding)
        
        # Fetch authors and contributors for all papers
        self._fetch_authors(cursor)
        self._fetch_contributors(cursor)
        
        # Convert embeddings to numpy array
        embeddings_array = np.array(embeddings).astype('float32')
        
        # Create and build the FAISS index
        dimension = embeddings_array.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(embeddings_array)
        
        print(f"Built FAISS index with {self.index.ntotal} vectors of dimension {dimension}")
        return self.index
    
    def _fetch_authors(self, cursor):
        """
        Fetch authors for all papers in paper_metadata
        
        Args:
            cursor: MySQL database cursor
        """
        if not self.paper_ids:
            return
            
        # Format paper IDs for SQL IN clause
        paper_ids_str = ", ".join([f"'{pid}'" for pid in self.paper_ids])
        
        # Query to get authors for all papers in one go
        query = f"""
        SELECT pc.paper_id, c.name 
        FROM paper_creators pc
        JOIN creators c ON pc.creator_id = c.id
        WHERE pc.paper_id IN ({paper_ids_str})
        ORDER BY pc.paper_id, c.name
        """
        
        cursor.execute(query)
        author_results = cursor.fetchall()
        
        # Group authors by paper_id
        for paper_id, author_name in author_results:
            if paper_id in self.paper_metadata:
                self.paper_metadata[paper_id]['authors'].append(author_name)
        
        # Count papers with authors
        papers_with_authors = sum(1 for pid in self.paper_ids if self.paper_metadata[pid]['authors'])
        print(f"Found authors for {papers_with_authors} out of {len(self.paper_ids)} papers")
        
    def _fetch_contributors(self, cursor):
        """
        Fetch contributors for all papers in paper_metadata
        
        Args:
            cursor: MySQL database cursor
        """
        if not self.paper_ids:
            return
            
        # Format paper IDs for SQL IN clause
        paper_ids_str = ", ".join([f"'{pid}'" for pid in self.paper_ids])
        
        # Query to get contributors for all papers in one go
        query = f"""
        SELECT pc.paper_id, c.name, pc.role
        FROM paper_contributors pc
        JOIN contributors c ON pc.contributor_id = c.id
        WHERE pc.paper_id IN ({paper_ids_str})
        ORDER BY pc.paper_id, pc.role, c.name
        """
        
        cursor.execute(query)
        contributor_results = cursor.fetchall()
        
        # Group contributors by paper_id
        for paper_id, contributor_name, role in contributor_results:
            if paper_id in self.paper_metadata:
                self.paper_metadata[paper_id]['contributors'].append({
                    'name': contributor_name,
                    'role': role
                })
        
        # Count papers with contributors
        papers_with_contributors = sum(1 for pid in self.paper_ids if self.paper_metadata[pid]['contributors'])
        print(f"Found contributors for {papers_with_contributors} out of {len(self.paper_ids)} papers")
    
    def search(self, query_text: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for similar papers using the provided query text
        
        Args:
            query_text: The text query to search for
            top_k: Number of results to return
            
        Returns:
            List of dictionaries containing search results with metadata and similarity scores
        """
        if self.index is None:
            raise ValueError("Index not built. Call load_index first.")
            
        # Convert query to embedding
        query_embedding = np.array(get_embedding(query_text, self.model)).reshape(1, -1).astype('float32')
        
        # Search the index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results
        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.paper_ids):
                paper_id = self.paper_ids[idx]
                metadata = self.paper_metadata[paper_id]
                
                # Calculate similarity score (convert distance to similarity)
                similarity = 1 / (1 + distances[0][i])
                
                results.append({
                    'id': paper_id,
                    'title': metadata['title'],
                    'abstract': metadata['abstract'],
                    'authors': metadata['authors'],
                    'contributors': metadata['contributors'],
                    'similarity_score': similarity,
                    'distance': float(distances[0][i])
                })
        
        return results

In [39]:
# Global variable to store the loaded search engine instance
search_engine_global = None

# Import the print_formatted_results function
from print_formatted_results import print_formatted_results

def load_search_engine(model, use_title=True, use_abstract=False, limit=None, force_reload=False):
    """
    Load or return the cached search engine instance
    
    Args:
        model: The SentenceTransformer model to use
        use_title: Whether to use title embeddings
        use_abstract: Whether to use abstract embeddings
        limit: Optional limit on number of papers to load
        force_reload: Whether to force reload the index even if already loaded
        
    Returns:
        ThesisSimilaritySearch instance with loaded index
    """
    global search_engine_global
    
    # Check if we already have a loaded search engine and we're not forcing a reload
    if search_engine_global is not None and not force_reload:
        print("Using cached search engine instance...")
        return search_engine_global
    
    # Create a new search engine instance
    print("Creating new search engine instance...")
    search_engine = ThesisSimilaritySearch(model=model, use_title=use_title, use_abstract=use_abstract)
    
    # Load the index
    try:
        search_engine.load_index(cursor, limit=limit)
        # Cache the loaded search engine
        search_engine_global = search_engine
        return search_engine
    except Exception as e:
        print(f"Error loading search engine: {e}")
        return None

# Example usage:
# Load the search engine (first time will load from database)
try:
    search_engine = load_search_engine(model=model, use_title=True, use_abstract=False)
    print("\nSearch engine ready for queries!")
except Exception as e:
    print(f"Error initializing search engine: {e}")

Creating new search engine instance...
Using title embeddings (iniLM-L6-v2)
Loaded 41597 papers with embeddings
Loaded 41597 papers with embeddings
Found authors for 41597 out of 41597 papers
Found authors for 41597 out of 41597 papers
Found contributors for 41597 out of 41597 papers
Built FAISS index with 41597 vectors of dimension 384

Search engine ready for queries!
Found contributors for 41597 out of 41597 papers
Built FAISS index with 41597 vectors of dimension 384

Search engine ready for queries!


In [42]:
# Cell to reload the index with different parameters if needed

def reload_search_engine(use_title=True, use_abstract=False, limit=None):
    """
    Force reload the search engine with different parameters
    
    Args:
        use_title: Whether to use title embeddings
        use_abstract: Whether to use abstract embeddings
        limit: Optional limit on number of papers to load
    """
    global search_engine_global
    print("Forcing reload of search engine...")
    
    # Set the global variable to None to ensure we reload
    search_engine_global = None
    
    # Call load_search_engine with force_reload=True
    return load_search_engine(
        model=model, 
        use_title=use_title, 
        use_abstract=use_abstract, 
        limit=limit, 
        force_reload=True
    )

# Example usage (commented out to prevent accidental execution)
'''
# Reload with different parameters
search_engine = reload_search_engine(use_title=False, use_abstract=True, limit=1000)

# Check that it worked
if search_engine:
    print(f"Successfully reloaded search engine")
    # Run a test search
    results = search_engine.search("machine learning", top_k=2)
    print_formatted_results(results)
'''

'\n# Reload with different parameters\nsearch_engine = reload_search_engine(use_title=False, use_abstract=True, limit=1000)\n\n# Check that it worked\nif search_engine:\n    print(f"Successfully reloaded search engine")\n    # Run a test search\n    results = search_engine.search("machine learning", top_k=2)\n    print_formatted_results(results)\n'

In [43]:
# Utility function for displaying search results with customizable options

def print_formatted_results(results, show_authors=True, show_contributors=True, show_abstract=True, show_metrics=True):
    """
    Print formatted search results with customizable display options
    
    Args:
        results: List of search results from ThesisSimilaritySearch.search()
        show_authors: Whether to display author information
        show_contributors: Whether to display contributor information
        show_abstract: Whether to display the abstract
        show_metrics: Whether to display similarity score and distance
    """
    if not results:
        print("No results found.")
        return
        
    for i, result in enumerate(results):
        print(f"\nResult {i+1}: {result['title']}")
        
        # Display metrics if requested
        if show_metrics:
            print(f"  Similarity: {result['similarity_score']:.4f}")
            print(f"  Distance: {result['distance']:.4f}")
        
        # Display authors if requested
        if show_authors and 'authors' in result:
            if result['authors']:
                authors_str = ", ".join(result['authors'])
                print(f"  Authors: {authors_str}")
            else:
                print(f"  Authors: No author information available")
        
        # Display contributors if requested
        if show_contributors and 'contributors' in result:
            if result['contributors']:
                # Group contributors by role
                contributors_by_role = {}
                for contributor in result['contributors']:
                    role = contributor['role']
                    if role not in contributors_by_role:
                        contributors_by_role[role] = []
                    contributors_by_role[role].append(contributor['name'])
                
                # Print contributors by role
                print(f"  Contributors:")
                for role, names in sorted(contributors_by_role.items()):
                    contributors_str = ", ".join(names)
                    print(f"    {role}: {contributors_str}")
            else:
                print(f"  Contributors: No contributor information available")
        
        # Display abstract if requested
        if show_abstract and 'abstract' in result:
            if result['abstract']:
                # Truncate very long abstracts for display
                abstract = result['abstract']
                if len(abstract) > 300:
                    abstract = abstract[:297] + '...'
                print(f"  Abstract: {abstract}")
            else:
                print(f"  Abstract: No abstract available")

In [57]:
# Quick search cell - run this for instant searches using the cached engine

def quick_search(query, top_k=5, show_abstract=True):
    """
    Perform a quick search using the cached search engine
    
    Args:
        query: The search query
        top_k: Number of results to return
        show_abstract: Whether to display abstracts
    """
    global search_engine_global
    
    # Check if we have a cached search engine
    if search_engine_global is None:
        print("No cached search engine found. Loading index first...")
        load_search_engine(model=model)
        if search_engine_global is None:
            print("Failed to load search engine.")
            return
    
    # Perform the search
    print(f"Searching for: '{query}'")
    results = search_engine_global.search(query, top_k=top_k)
    
    # Display results
    print_formatted_results(results, show_abstract=show_abstract)
    
    return results

# Examples - uncomment and run any of these searches

# Basic search
results = quick_search("diabetes detection with machine learning", top_k=10)

# # Search without abstracts (more compact output)
# results = quick_search("natural language processing", top_k=5, show_abstract=False)

'''
# Search by author and topic
filtered_results = search_by_people_and_topic(
    search_engine_global,
    "artificial intelligence", 
    author_name="Johnson",  # Replace with a name in your database
    top_k=3
)
print_formatted_results(filtered_results)
'''

Searching for: 'diabetes detection with machine learning'

Result 1: Perbandingan algoritma Naive-Bayes, K-NN dan Decision Tree dalam pengklasifikasian data penyakit diabetes
  Similarity: 0.5666
  Distance: 0.7650
  Authors: NICOLAS OWEN
  Contributors:
    Advisor 1: Alexander Setiawan
    Advisor 2: Henry Novianus Palit, S.Kom., M.Kom., Ph.D.
    Examination Committee 1: Agustinus Noertjahyana
    Examination Committee 2: Rolly Intan
  Abstract: Diabetes adalah kondisi kronis yang memiliki dampak serius pada kesehatan. Kondisi ini dapat menyebabkan kerusakan pada organ tubuh seperti mata, ginjal, saraf, serta memengaruhi jantung dan pembuluh darah. Akibatnya, risiko terkena stroke, serangan jantung, gangguan penglihatan, amputasi, dan ga...

Result 2: Perancangan buku interaktif pencegahan diabetes melitus pada anak-anak
  Similarity: 0.4966
  Distance: 1.0136
  Authors: JESSICA W WILIANTO
  Contributors:
    Advisor 1: Andrian Dektisa Hagijanto
    Advisor 2: Jacky Cahyadi
    Exam

'\n# Search by author and topic\nfiltered_results = search_by_people_and_topic(\n    search_engine_global,\n    "artificial intelligence", \n    author_name="Johnson",  # Replace with a name in your database\n    top_k=3\n)\nprint_formatted_results(filtered_results)\n'

In [None]:
# Demonstration of using the ThesisSimilaritySearch class with caching

# Get the cached search engine or load if not already loaded
try:
    # Use the cached search engine
    search_engine = load_search_engine(model=model, use_title=True, use_abstract=False)
    
    if search_engine is not None:
        # Example queries
        example_queries = [
            "natural language processing in healthcare",
            # "machine learning for computer vision",
            # "data mining techniques",
            # "information retrieval systems",
            # "neural networks for image classification"
        ]
        
        # Run a sample search
        print("\nSample search results:")
        for query in example_queries[:2]:  # Just show results for first two queries
            print(f"\nQuery: '{query}'")
            results = search_engine.search(query, top_k=3)
            
            for i, result in enumerate(results):
                print(f"\nResult {i+1}: {result['title']}")
                print(f"  Similarity: {result['similarity_score']:.4f}")
                
                # Display authors
                if result['authors']:
                    authors_str = ", ".join(result['authors'])
                    print(f"  Authors: {authors_str}")
                else:
                    print(f"  Authors: No author information available")
                
                # Display contributors
                if result['contributors']:
                    # Group contributors by role
                    contributors_by_role = {}
                    for contributor in result['contributors']:
                        role = contributor['role']
                        if role not in contributors_by_role:
                            contributors_by_role[role] = []
                        contributors_by_role[role].append(contributor['name'])
                    
                    # Print contributors by role
                    print(f"  Contributors:")
                    for role, names in contributors_by_role.items():
                        contributors_str = ", ".join(names)
                        print(f"    {role}: {contributors_str}")
                else:
                    print(f"  Contributors: No contributor information available")
                    
                print(f"  Abstract: {result['abstract']}")
                print(f"  Distance: {result['distance']}")
        
        print("\nTo search for papers similar to your query:")
        print("results = search_engine.search('your query here', top_k=5)")
    
except Exception as e:
    print(f"Error using the search engine: {e}")
    print("Make sure you have papers with embeddings in your database.")


Using title embeddings (iniLM-L6-v2)
Loaded 41597 papers with embeddings
Loaded 41597 papers with embeddings
Found authors for 41597 out of 41597 papers
Found authors for 41597 out of 41597 papers
Found contributors for 41597 out of 41597 papers
Built FAISS index with 41597 vectors of dimension 384

Sample search results:

Query: 'neural networks for image classification'

Result 1: Improving backpropagation training time and its generalization using pruning
  Similarity: 0.4673
  Authors: DANIEL BUDIONO
  Contributors:
    Advisor 1: LILIANA
    Examination Committee 1: Gregorius Satiabudhi
  Abstract: Beberapa tahun terakhir, banyak algoritma jaringan syaraf tiruan uang dikembangkan untuk klasifikasi pola. Salah satu algoritma yang populer adalah backpropagation. Akan tetapi, menentukan besarnya suatu jaringan backpropagation adalah suatu masalah yang sangat sulit. Jumlah hidden unit yang terlalu banyak akan menyebabkan jaringan terlalu menghafal data training dan kurang generalisasi