In [None]:
#| default_exp index

from search.core import *
from collections import Counter, defaultdict
import re
from typing import Set, DefaultDict
from fastcore.test import *

In [None]:
#| export
class InvertedIndex(Index):
    "Basic inverted index implementation mapping terms to document IDs"
    def __init__(self):
        # Posting sets: term -> ids of the documents whose content contains that term
        self.index: DefaultDict[str, Set[str]] = defaultdict(set)
        # Document store: doc_id -> Document (content is re-tokenized on remove and search)
        self.documents: dict[str, Document] = {}

    def _tokenize(self, text: str) -> list[str]:
        "Basic tokenization (lowercased runs of word characters) - we'll improve this later with proper tokenization"
        return [w.lower() for w in re.findall(r'\w+', text)]

    def add(self, doc: Document):
        "Add a document to the index, replacing any document already stored under `doc.id`"
        # Re-adding an existing id must first drop the postings of the old content;
        # otherwise terms that only occurred in the old text keep pointing at this id.
        self.remove(doc.id)
        self.documents[doc.id] = doc
        for term in self._tokenize(doc.content):
            self.index[term].add(doc.id)

    def remove(self, doc_id: str):
        "Remove a document from the index; ids that are not stored are ignored"
        if doc_id not in self.documents: return
        doc = self.documents[doc_id]
        # Iterate unique terms and use `.get` so a lookup never creates a posting set
        for term in set(self._tokenize(doc.content)):
            postings = self.index.get(term)
            if postings is None: continue
            postings.discard(doc_id)
            # Drop terms no remaining document contains so the index does not
            # accumulate empty posting sets
            if not postings: del self.index[term]
        del self.documents[doc_id]

    def search(self, query: Query) -> list[SearchResult]:
        """Return the documents containing every query term, best score first.

        The score is simple term frequency: the number of query-term occurrences
        in the document divided by the document's token count (a query term
        repeated in the query is counted once per repetition). Returns an empty
        list when the query has no tokens or no document contains all terms.
        """
        terms = self._tokenize(query.text)
        if not terms: return []

        # Use `.get` rather than indexing: indexing the defaultdict would insert an
        # empty set for every unseen query term and grow the index on reads.
        postings = [self.index.get(term) for term in terms]
        # A missing or empty posting set means no document can contain every term
        if not all(postings): return []
        matching_ids = set.intersection(*postings)

        results = []
        for doc_id in matching_ids:
            doc = self.documents[doc_id]
            doc_terms = self._tokenize(doc.content)
            # Count every token once instead of rescanning the document per query term
            counts = Counter(doc_terms)
            score = sum(counts[term] for term in terms) / len(doc_terms)
            results.append(SearchResult(doc, score))

        return sorted(results, key=lambda x: x.score, reverse=True)

    def clear(self):
        "Clear all documents from the index"
        self.index.clear()
        self.documents.clear()

In [None]:
# Fresh, empty index shared by the test cells that follow
idx = InvertedIndex()

In [None]:
# Index two documents that both mention "fox", then check the query returns each of them
doc1 = Document("1", "The quick brown fox jumps over the lazy dog")
idx.add(doc1)
doc2 = Document("2", "The lazy fox sleeps")
idx.add(doc2)

results = idx.search(Query("fox"))
test_eq(len(results), 2)
test_eq({hit.document.id for hit in results}, {"1", "2"})

In [None]:
# Once document "1" is removed, "fox" should match document "2" only
idx.remove("1")
results = idx.search(Query("fox"))
test_eq([hit.document.id for hit in results], ["2"])