# Dataset & Preprocessing

This notebook uses the same dataset and preprocessing as the boolean retrieval notebook (both share one directory). The preprocessed output is stored in `../data/documents.jsonl`.

# Import Library

In [1]:
import json
import math
import numpy as np
from collections import defaultdict, Counter
from typing import List, Dict, Tuple
import sys


In [None]:
# Put the sibling ../preprocessing directory on the import path so the
# project-local `preprocessor` module can be imported below.
sys.path.append('../preprocessing')
from preprocessor import DocumentPreprocessor # type: ignore

# Load Documents

In [3]:
def load_documents(filepath: str) -> List[Dict[str, str]]:
    """Read a JSON Lines file and return its records in file order.

    Args:
        filepath: Path to a UTF-8 encoded file holding one JSON object per line.

    Returns:
        A list with one decoded object per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    documents = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            payload = line.strip()
            # Blank or whitespace-only lines (e.g. a trailing empty line) carry
            # no record; json.loads("") would raise, so skip them.
            if not payload:
                continue
            documents.append(json.loads(payload))
    return documents

In [50]:
# Load the preprocessed corpus once; `documents` is reused by later cells.
documents = load_documents('../data/documents.jsonl')
print(f"Loaded {len(documents)} preprocessed documents")
print("\ndocuments:")
# Echo every record as "<id>: <contents>" for a quick visual check.
for doc in documents:
    print(f"{doc['id']}: {doc['contents']}")

Loaded 15 preprocessed documents

documents:
d1: cat chase small mous garden
d2: friendli dog play fetch river
d3: bm25 rank function wide use search engin
d4: boolean retriev use logic oper like
d5: tfidf weight term frequenc rariti
d6: neural retriev use dens embed semant search
d7: dog cat slept couch
d8: librari host workshop inform retriev
d9: student implement bm25 compar tfidf
d10: chef roast chicken rosemari garlic
d11: black cat cross old stone bridg night
d12: dog loyal companion long hike
d13: dataset contain fifteen short sentenc test
d14: rerank model reorder bm25 candid use transform
d15: dog snif cat ignor mous


# Build Vocabulary and Calculate term frequencies

In [6]:
def build_vocabulary(documents: List[Dict[str, str]]) -> Tuple[List[str], Dict[str, Dict[str, int]]]:
    """
    Collect the corpus vocabulary and per-document raw term counts.

    Each document's 'contents' string is split on whitespace; the tokens are
    counted per document (keyed by the document's 'id') and pooled into one
    alphabetically sorted vocabulary.

    Returns: (vocabulary, term_freq_per_doc)
    """
    seen_terms = set()
    counts_by_doc = {}

    for record in documents:
        doc_key = record['id']
        tokens = record['contents'].split()

        # Raw occurrences of every token inside this document
        counts_by_doc[doc_key] = Counter(tokens)

        # Pool this document's tokens into the corpus-wide term set
        seen_terms |= set(tokens)

    ordered_terms = list(seen_terms)
    ordered_terms.sort()
    return ordered_terms, counts_by_doc

In [54]:
# Build the sorted vocabulary and the per-document raw term counts.
vocabulary, term_freq_per_doc = build_vocabulary(documents)

print(f"Vocabulary size: {len(vocabulary)}")
print(f"\nterms: {vocabulary}")
print(f"\nTerm frequencies for all documents:")

# Iterate the mapping itself rather than a hard-coded d1..d15 range, so the
# cell keeps working when the corpus size or the id scheme changes. Dicts
# preserve insertion order, so documents are listed in corpus order.
for doc_id, term_freq in term_freq_per_doc.items():
    print(f"\n{doc_id}:")
    print(f"  {dict(term_freq)}")

Vocabulary size: 68

terms: ['black', 'bm25', 'boolean', 'bridg', 'candid', 'cat', 'chase', 'chef', 'chicken', 'companion', 'compar', 'contain', 'couch', 'cross', 'dataset', 'dens', 'dog', 'embed', 'engin', 'fetch', 'fifteen', 'frequenc', 'friendli', 'function', 'garden', 'garlic', 'hike', 'host', 'ignor', 'implement', 'inform', 'librari', 'like', 'logic', 'long', 'loyal', 'model', 'mous', 'neural', 'night', 'old', 'oper', 'play', 'rank', 'rariti', 'reorder', 'rerank', 'retriev', 'river', 'roast', 'rosemari', 'search', 'semant', 'sentenc', 'short', 'slept', 'small', 'snif', 'stone', 'student', 'term', 'test', 'tfidf', 'transform', 'use', 'weight', 'wide', 'workshop']

Term frequencies for all documents:

d1:
  {'cat': 1, 'chase': 1, 'small': 1, 'mous': 1, 'garden': 1}

d2:
  {'friendli': 1, 'dog': 1, 'play': 1, 'fetch': 1, 'river': 1}

d3:
  {'bm25': 1, 'rank': 1, 'function': 1, 'wide': 1, 'use': 1, 'search': 1, 'engin': 1}

d4:
  {'boolean': 1, 'retriev': 1, 'use': 1, 'logic': 1, 'ope

# Calculate Document Frequency (DF) and IDF

In [11]:
def calculate_df_idf(vocabulary: List[str], documents: List[Dict[str, str]]) -> Tuple[Dict[str, int], Dict[str, float]]:
    """
    Compute document frequency (DF) and inverse document frequency (IDF).

    DF counts the documents whose 'contents' contain a term at least once.
    IDF is log10(N / DF) with N the number of documents, or 0 when DF is 0.

    Returns: (df, idf). df covers every term seen in the documents plus every
    vocabulary term (a vocabulary term absent from all documents has DF 0).
    """
    total_docs = len(documents)
    doc_counts = defaultdict(int)

    # A set collapses repeats so each document contributes at most 1 per term
    for record in documents:
        for token in set(record['contents'].split()):
            doc_counts[token] += 1

    idf = {}
    for term in vocabulary:
        # Indexing the defaultdict registers unseen vocabulary terms with DF 0
        occurrences = doc_counts[term]
        if occurrences > 0:
            idf[term] = math.log10(total_docs / occurrences)
        else:
            idf[term] = 0

    return dict(doc_counts), idf

In [12]:
# Corpus-wide document frequencies and log10 IDF values, reused by later cells.
df, idf = calculate_df_idf(vocabulary, documents)

In [None]:
# Print a fixed-width Term / DF / IDF table.
# NOTE(review): the heading says "sample terms", but the loop below prints
# every term in the vocabulary.
print("Document Frequency (DF) and IDF for sample terms:\n")
print(f"{'Term':<15} {'DF':<5} {'IDF':<10}")
print("-" * 30)
for term in vocabulary:
    print(f"{term:<15} {df[term]:<5} {idf[term]:<10.4f}")

Document Frequency (DF) and IDF for sample terms:

Term            DF    IDF       
------------------------------
black           1     1.1761    
bm25            3     0.6990    
boolean         1     1.1761    
bridg           1     1.1761    
candid          1     1.1761    
cat             4     0.5740    
chase           1     1.1761    
chef            1     1.1761    
chicken         1     1.1761    
companion       1     1.1761    
compar          1     1.1761    
contain         1     1.1761    
couch           1     1.1761    
cross           1     1.1761    
dataset         1     1.1761    
dens            1     1.1761    
dog             4     0.5740    
embed           1     1.1761    
engin           1     1.1761    
fetch           1     1.1761    
fifteen         1     1.1761    
frequenc        1     1.1761    
friendli        1     1.1761    
function        1     1.1761    
garden          1     1.1761    
garlic          1     1.1761    
hike            1     1.176

# Calculate TF-IDF weights for all documents

In [15]:
def calculate_tf_idf(vocabulary: List[str], 
                     term_freq_per_doc: Dict[str, Dict[str, int]], 
                     idf: Dict[str, float],
                     documents: List[Dict[str, str]]) -> Dict[str, Dict[str, float]]:
    """
    Build the TF-IDF weight of every vocabulary term for every document.

    The raw count tf is damped to 1 + log10(tf) when tf > 0 and to 0
    otherwise; the damped value is then multiplied by the term's IDF.

    Returns: {doc_id: {term: weight}} with one entry per vocabulary term.
    """
    def _damped_tf(raw_count):
        # Sub-linear scaling; terms that do not occur contribute nothing
        return (1 + math.log10(raw_count)) if raw_count > 0 else 0

    weights_by_doc = {}
    for record in documents:
        doc_key = record['id']
        weights_by_doc[doc_key] = {
            term: _damped_tf(term_freq_per_doc[doc_key].get(term, 0)) * idf[term]
            for term in vocabulary
        }

    return weights_by_doc

In [16]:
# TF-IDF weight of every vocabulary term in every document: {doc_id: {term: weight}}.
tfidf_weights = calculate_tf_idf(vocabulary, term_freq_per_doc, idf, documents)

In [27]:
print("TF-IDF Weights Table:\n")
# Take the columns from the computed weights instead of a hand-written
# d1..d15 list, so the table follows the corpus if it grows or is renamed.
docs = list(tfidf_weights)

# Header row: a 15-character term column followed by one 12-character column
# per document.
print(f"{'Term':<15}", end="")
for doc_id in docs:
    print(f"{doc_id:<12}", end="")
print()
# Rule width is derived from the column layout so it always spans the header.
print("-" * (15 + 12 * len(docs)))

# One row per vocabulary term, weights rounded to three decimals.
for term in vocabulary:
    print(f"{term:<15}", end="")
    for doc_id in docs:
        weight = tfidf_weights[doc_id][term]
        print(f"{weight:<12.3f}", end="")
    print()

TF-IDF Weights Table:

Term           d1          d2          d3          d4          d5          d6          d7          d8          d9          d10         d11         d12         d13         d14         d15         
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
black          0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       1.176       0.000       0.000       0.000       0.000       
bm25           0.000       0.000       0.699       0.000       0.000       0.000       0.000       0.000       0.699       0.000       0.000       0.000       0.000       0.699       0.000       
boolean        0.000       0.000       0.000       1.176       0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       0.000       
br

# Create Document Vectors

In [28]:
def create_document_vectors(vocabulary: List[str], 
                           tfidf_weights: Dict[str, Dict[str, float]],
                           documents: List[Dict[str, str]]) -> Dict[str, np.ndarray]:
    """
    Turn the per-document TF-IDF weight mappings into NumPy vectors.

    Component i of a document's vector is that document's weight for
    vocabulary[i], so every vector has len(vocabulary) entries.

    Returns: {doc_id: vector}
    """
    def _as_vector(doc_key):
        # Follow vocabulary order so all vectors share the same axis
        return np.array([tfidf_weights[doc_key][term] for term in vocabulary])

    return {record['id']: _as_vector(record['id']) for record in documents}

In [29]:
# One TF-IDF vector per document, indexed in vocabulary order: {doc_id: np.ndarray}.
doc_vectors = create_document_vectors(vocabulary, tfidf_weights, documents)

In [None]:
# Inspect the vectors of the first three documents.
print("Document Vectors (first 3 documents):\n")
print("=" * 100)

for doc_id in ['d1', 'd2', 'd3']:
    vector = doc_vectors[doc_id]
    
    # Show basic info
    print(f"\n{doc_id}:")
    print(f"  Shape: {vector.shape}") # -> 68 dimensions
    print(f"  Non-zero elements: {np.count_nonzero(vector)}")
    
    # Show non-zero dimensions only (printing all 68 would be too much)
    non_zero_indices = np.nonzero(vector)[0]
    if len(non_zero_indices) > 0:
        print(f"  Non-zero weights:")
        for idx in non_zero_indices:
            # Vector positions line up with the vocabulary list
            term = vocabulary[idx]
            weight = vector[idx]
            print(f"    {term:20s} -> {weight:.4f}")
    else:
        print(f"  All zeros")
    
    print("-" * 100)

Document Vectors (first 3 documents):


d1:
  Shape: (68,)
  Non-zero elements: 5
  Non-zero weights:
    cat                  -> 0.5740
    chase                -> 1.1761
    garden               -> 1.1761
    mous                 -> 0.8751
    small                -> 1.1761
----------------------------------------------------------------------------------------------------

d2:
  Shape: (68,)
  Non-zero elements: 5
  Non-zero weights:
    dog                  -> 0.5740
    fetch                -> 1.1761
    friendli             -> 1.1761
    play                 -> 1.1761
    river                -> 1.1761
----------------------------------------------------------------------------------------------------

d3:
  Shape: (68,)
  Non-zero elements: 7
  Non-zero weights:
    bm25                 -> 0.6990
    engin                -> 1.1761
    function             -> 1.1761
    rank                 -> 1.1761
    search               -> 0.8751
    use                  -> 0.5740
    wide  

# Preprocessor used again for query processing

In [38]:
# Single DocumentPreprocessor instance, passed to process_query and
# search_and_rank in the cells below.
preprocessor = DocumentPreprocessor()

def process_query(query: str, 
                 vocabulary: List[str], 
                 idf: Dict[str, float],
                 preprocessor: DocumentPreprocessor) -> Tuple[np.ndarray, List[str]]:
    """
    Convert a free-text query into a TF-IDF vector over the vocabulary.

    The query is tokenised with preprocessor.preprocess_text, the original
    and preprocessed forms are printed, and every vocabulary term is weighted
    as (1 + log10(tf)) * idf, with 0 for terms absent from the query. Terms
    missing from idf are treated as having an IDF of 0.

    Returns:
        Tuple[np.ndarray, List[str]]: (query_vector, query_terms), where
        query_vector has one component per vocabulary term.
    """
    query_terms = preprocessor.preprocess_text(query)
    print(f"Original query: {query}")
    print(f"Preprocessed query: {' '.join(query_terms)}")

    # Raw occurrences of each preprocessed query token
    occurrences = Counter(query_terms)

    def _term_weight(term):
        raw_count = occurrences.get(term, 0)
        damped = (1 + math.log10(raw_count)) if raw_count > 0 else 0
        return damped * idf.get(term, 0)

    # Components follow vocabulary order, matching the document vectors' axis
    query_vector = np.array([_term_weight(term) for term in vocabulary])

    return query_vector, query_terms

In [39]:
# Smoke-test query vectorisation with a single query and report the vector's
# size and how many of its components are non-zero.
test_query = "library workshop"
query_vector, query_terms = process_query(test_query, vocabulary, idf, preprocessor)
print(f"\nQuery vector shape: {query_vector.shape}")
print(f"Non-zero elements in query vector: {np.count_nonzero(query_vector)}")

Original query: library workshop
Preprocessed query: librari workshop

Query vector shape: (68,)
Non-zero elements in query vector: 2


# Cosine Similarity

In [40]:
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Cosine of the angle between two vectors.

    cos_sim = (vec1 · vec2) / (||vec1|| * ||vec2||)

    Returns 0.0 when either vector has zero length, since the ratio is
    undefined in that case.
    """
    numerator = np.dot(vec1, vec2)
    lengths = (np.linalg.norm(vec1), np.linalg.norm(vec2))

    # Guard against division by zero for all-zero vectors
    if any(length == 0 for length in lengths):
        return 0.0

    return numerator / (lengths[0] * lengths[1])

In [47]:
def search_and_rank(query: str, 
                   vocabulary: List[str],
                   idf: Dict[str, float],
                   doc_vectors: Dict[str, np.ndarray],
                   preprocessor: DocumentPreprocessor,
                   documents: List[Dict[str, str]],
                   top_k: int = 5) -> List[Tuple[str, float]]:
    """
    Rank documents against a query by cosine similarity.

    The query is vectorised with process_query and compared with every entry
    of doc_vectors. Documents with a similarity above zero are returned as
    (doc_id, score) pairs, best first, limited to top_k entries. An empty
    list is returned when preprocessing leaves no query terms. The documents
    argument is accepted but not read by this function.
    """
    print(f"Searching for: {query}")

    query_vector, query_terms = process_query(query, vocabulary, idf, preprocessor)

    # Nothing to match on once preprocessing has removed every token
    if len(query_terms) == 0:
        print("⚠️  No valid terms after preprocessing")
        return []

    scored = [
        (doc_id, cosine_similarity(query_vector, doc_vector))
        for doc_id, doc_vector in doc_vectors.items()
    ]

    # Highest score first; the stable sort keeps corpus order among ties
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # Keep positive scores only, then cut to the requested length
    matching = [pair for pair in ranked if pair[1] > 0]
    return matching[:top_k]

# Test Search

In [49]:
# Run a fixed set of sample queries and print a ranked result table for each.
test_queries = [
    "library workshop",
    "dog cat",
    "search retrieval",
    "bm25 ranking"
]

for query in test_queries:
    results = search_and_rank(query, vocabulary, idf, doc_vectors, preprocessor, documents, top_k=5)
    
    print(f"\nTop {len(results)} results:")
    if results:
        print(f"{'Rank':<6} {'Doc ID':<8} {'Score':<12} {'Content'}")
        print("-" * 80)
        for rank, (doc_id, score) in enumerate(results, 1):
            # Look up the preprocessed text of the matched document by id
            doc_content = next(d['contents'] for d in documents if d['id'] == doc_id)
            # Truncate to 60 characters plus "..." only when the text is longer
            content_preview = doc_content[:60] + "..." if len(doc_content) > 60 else doc_content
            print(f"{rank:<6} {doc_id:<8} {score:<12.4f} {content_preview}")
    else:
        print("No matching documents found")
    print()

Searching for: library workshop
Original query: library workshop
Preprocessed query: librari workshop

Top 1 results:
Rank   Doc ID   Score        Content
--------------------------------------------------------------------------------
1      d8       0.6778       librari host workshop inform retriev

Searching for: dog cat
Original query: dog cat
Preprocessed query: dog cat

Top 5 results:
Rank   Doc ID   Score        Content
--------------------------------------------------------------------------------
1      d7       0.4386       dog cat slept couch
2      d15      0.3965       dog snif cat ignor mous
3      d1       0.1772       cat chase small mous garden
4      d2       0.1676       friendli dog play fetch river
5      d12      0.1676       dog loyal companion long hike

Searching for: search retrieval
Original query: search retrieval
Preprocessed query: search retriev

Top 4 results:
Rank   Doc ID   Score        Content
---------------------------------------------------------

# Interactive query !!!

In [43]:
def interactive_search():
    """Run a prompt loop that searches the corpus for each query typed in.

    Reads queries with input() until 'quit', 'exit' or 'q' is entered
    (case-insensitive). Blank input prints a reminder and prompts again.
    Each query is ranked with search_and_rank (top 10) against the
    notebook-level vocabulary, idf, doc_vectors, preprocessor and documents,
    and the matches are printed as a table.

    The loop also ends cleanly when input() raises EOFError (no stdin
    available) or KeyboardInterrupt (e.g. the kernel is interrupted while
    waiting for input), instead of surfacing a traceback.

    Returns:
        None
    """
    print("\n" + "=" * 80)
    print("🔎 TF-IDF Cosine Similarity Search")
    print("=" * 80)
    print("Commands:")
    print("  - Enter a query to search")
    # The help text lists every accepted stop command, including 'q'
    print("  - Type 'quit', 'exit' or 'q' to stop")
    print("=" * 80 + "\n")

    while True:
        try:
            query = input("Enter query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input can arrive; leave the loop like an explicit quit
            print("\n👋 Goodbye!")
            break

        if query.lower() in ('quit', 'exit', 'q'):
            print("👋 Goodbye!")
            break

        if not query:
            print("⚠️  Please enter a query\n")
            continue

        results = search_and_rank(query, vocabulary, idf, doc_vectors, preprocessor, documents, top_k=10)

        print(f"\n📊 Found {len(results)} matching documents:")
        if results:
            print(f"{'Rank':<6} {'Doc ID':<8} {'Score':<12} {'Content'}")
            print("-" * 80)
            for rank, (doc_id, score) in enumerate(results, 1):
                # Look up the preprocessed text of the matched document by id
                doc_content = next(d['contents'] for d in documents if d['id'] == doc_id)
                print(f"{rank:<6} {doc_id:<8} {score:<12.4f} {doc_content}")
        else:
            print("No matching documents found")
        print("\n" + "-" * 80 + "\n")

In [44]:
# Start the prompt loop; it keeps asking for input until a quit command is entered.
interactive_search()


🔎 TF-IDF Cosine Similarity Search
Commands:
  - Enter a query to search
  - Type 'quit' or 'exit' to stop

🔍 Searching for: dog
Original query: dog
Preprocessed query: dog

📊 Found 4 matching documents:
Rank   Doc ID   Score        Content
--------------------------------------------------------------------------------
1      d7       0.3102       dog cat slept couch
2      d15      0.2804       dog snif cat ignor mous
3      d2       0.2371       friendli dog play fetch river
4      d12      0.2371       dog loyal companion long hike

--------------------------------------------------------------------------------

⚠️  Please enter a query

🔍 Searching for: Escape
Original query: Escape
Preprocessed query: escap

📊 Found 0 matching documents:
No matching documents found

--------------------------------------------------------------------------------

⚠️  Please enter a query

⚠️  Please enter a query

⚠️  Please enter a query

⚠️  Please enter a query

⚠️  Please enter a query

⚠️  