# Indonesian Document Retrieval System
**Objective:** Build an explainable retrieval system for Indonesian documents, using Enhanced Confix Stripping (ECS) for stemming and a Vector Space Model (VSM) with TF-IDF weighting and cosine similarity.  
This notebook demonstrates each preprocessing step, shows tables and intermediate outputs, and ranks documents for a sample query.

## Libraries used
- `os`, `glob` : file handling
- `PyPDF2`, `python-docx` : read `.pdf` and `.docx`
- `re` : tokenization and rule-based stemming
- `pandas`, `numpy`: tables and math
- `math` : log in IDF
- `sklearn.metrics.pairwise` (optional, not imported below) : cross-checking the hand-written cosine similarity

In [1]:
import os, glob, re, math
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
# pdf/docx readers
import PyPDF2
import docx

In [None]:
# Root folder passed to load_documents(), relative to the notebook's working directory.
DATA_DIR = "dataset"  # update if needed

def read_txt(path):
    """Return the full contents of the text file at *path*.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    instead of raising an error.
    """
    with open(path, encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return contents

def read_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Page texts are joined with newlines. A page whose ``extract_text()``
    result is falsy (for example ``None``) contributes an empty string.
    """
    with open(path, 'rb') as stream:
        pages = PyPDF2.PdfReader(stream).pages
        page_texts = [page.extract_text() or "" for page in pages]
    return "\n".join(page_texts)

def read_docx(path):
    """Return the text of the .docx file at *path*, one paragraph per line."""
    paragraphs = docx.Document(path).paragraphs
    lines = [paragraph.text for paragraph in paragraphs]
    return "\n".join(lines)

def load_documents(folder, readers=None):
    """Recursively read every supported document under *folder*.

    Args:
        folder: Root directory to walk.
        readers: Optional mapping of lowercase file extension (without the
            dot) to a ``reader(path) -> str`` callable. When omitted, the
            notebook's txt/pdf/docx readers are used.

    Returns:
        A dict mapping each file's path relative to *folder* to its text.
        Files whose reader raises are reported on stdout and skipped.
    """
    if readers is None:
        readers = {'txt': read_txt, 'pdf': read_pdf, 'docx': read_docx}
    docs = {}
    for root, dirs, files in os.walk(folder):
        # Sort in place so sub-folders and files are visited in a
        # reproducible order regardless of the filesystem.
        dirs.sort()
        for fname in sorted(files):
            # splitext only reports a real extension, so an extension-less
            # file literally named "txt" or "pdf" is not mistaken for one.
            ext = os.path.splitext(fname)[1][1:].lower()
            reader = readers.get(ext)
            if reader is None:
                continue
            path = os.path.join(root, fname)
            try:
                docs[os.path.relpath(path, folder)] = reader(path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the load.
                print(f"Failed reading {path}: {e}")
    return docs

# Load every supported file under DATA_DIR and list up to ten of the names.
documents = load_documents(DATA_DIR)
print(f"Loaded {len(documents)} documents. Sample filenames:")
sample_filenames = list(documents)[:10]
for position, filename in enumerate(sample_filenames, start=1):
    print(position, filename)
    

Loaded 0 documents. Sample filenames:


In [7]:
import os
import glob

# Diagnostics for an empty load: show where DATA_DIR points and what it holds.
# Every check goes through DATA_DIR so the output stays valid if it is changed.
print("cwd:", os.getcwd())
print("DATA_DIR (abs):", os.path.abspath(DATA_DIR))
print("DATA_DIR exists?:", os.path.exists(DATA_DIR))
# Listing a missing folder would raise, which defeats the purpose of this cell.
root_entries = os.listdir(DATA_DIR) if os.path.isdir(DATA_DIR) else []
print("List root dataset folder:", root_entries)
print("Glob txt in DATA_DIR:", glob.glob(os.path.join(DATA_DIR, "*.txt"))[:10])
print("Glob all files recursively:", glob.glob(os.path.join(DATA_DIR, "**", "*.*"), recursive=True)[:20])

cwd: d:\Semester 5\Data Mining\tubes
DATA_DIR (abs): d:\Semester 5\Data Mining\tubes\dataset
DATA_DIR exists?: True
List root dataset folder: ['doc', 'pdf', 'texts']
Glob txt in DATA_DIR: []
Glob all files recursively: ['dataset\\doc\\1.docx', 'dataset\\doc\\241-SO-T6.docx', 'dataset\\doc\\2_Tugas_2-IFB354-1520193.docx', 'dataset\\doc\\2_Tugas_2-IFB354-1520193[1].docx', 'dataset\\doc\\AAA.docx', 'dataset\\doc\\Alamat akhir ram.docx', 'dataset\\doc\\Analisis Data Indeks SPBE Menggunakan OLAP untuk Evaluasi Kinerja Digitalisasi di Kabupaten.docx', 'dataset\\doc\\Api.docx', 'dataset\\doc\\BAB 1 PENDAHULUAN.docx', 'dataset\\doc\\basis data 2.docx', 'dataset\\doc\\basisdata_chap7.docx', 'dataset\\doc\\basisdata_chap8.docx', 'dataset\\doc\\CHAPTER 4_BASIS DATA.docx', 'dataset\\doc\\ciporeat.docx', 'dataset\\doc\\Cover Laporan Prak OOP 2025 (2).docx', 'dataset\\doc\\Doc1.docx', 'dataset\\doc\\Doc2.docx', 'dataset\\doc\\Dokumentasi Kuis Konfigurasi Jaringan VLAN.docx', 'dataset\\doc\\Dokumenta

In [None]:
# Use the first loaded document as the running example for the demos below.
if not documents:
    # next(iter(...)) on an empty dict would surface as a bare StopIteration.
    raise RuntimeError(
        "No documents were loaded; check DATA_DIR and the supported file extensions."
    )
sample_name, sample_text = next(iter(documents.items()))
print("Original (first 300 chars):\n", sample_text[:300])
# Sliced again after lower() because lowercasing can lengthen some characters.
print("\nLowercased:\n", sample_text[:300].lower()[:300])

In [None]:
def tokenize(text):
    """Case-fold *text* and split it into alphanumeric tokens.

    After lowercasing, every character other than a-z, 0-9 or whitespace is
    replaced by a space, and the result is split on whitespace. Accented or
    non-Latin characters therefore act as separators.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    # str.split() with no argument never yields empty strings.
    return cleaned.split()

# Preview the tokenizer on the sample document.
sample_tokens_all = tokenize(sample_text)
tokens_example = sample_tokens_all[:50]
print("First 50 tokens:", tokens_example)

In [None]:
# Condensed sample stopword list; extend as needed. Each word is listed once.
INDONESIAN_STOPWORDS = {
    'yang', 'dan', 'di', 'ke', 'dari',
    'ini', 'itu', 'pada', 'adalah', 'sebagai',
    'dengan', 'atau', 'oleh', 'untuk', 'karena',
    'saat', 'kami', 'saya', 'kamu', 'adanya',
    'telah', 'akan', 'atas', 'bawah', 'dpp',
}
# Stopword-removal demo on the first 100 tokens of the sample document.
tokens = tokenize(sample_text)[:100]
filtered = [tok for tok in tokens if tok not in INDONESIAN_STOPWORDS]
preview = 30  # how many tokens of each list are printed
print("Before (sample):", tokens[:preview])
print("After filtered (sample):", filtered[:preview])

## Enhanced Confix Stripping (ECS) — principle
ECS is a rule-based stemmer: it removes common Indonesian suffixes and prefixes (confixes) by applying a sequence of morphological rules (inflectional suffix stripping, derivational suffix stripping, then prefix handling). For clarity we present a simplified, transparent implementation; note that a production system should verify each candidate root against a lexicon.

In [None]:
# Enclitics / particles that can trail an Indonesian word.
_PARTICLE_SUFFIX = re.compile(r'(lah|kah|pun|nya|ku|mu)$')


def remove_particle_suffixes(word):
    """Strip a trailing particle or enclitic (-lah, -kah, -pun, -nya, -ku, -mu).

    No lexicon is consulted, so a root that merely ends in one of these
    letter sequences is shortened as well.
    """
    return _PARTICLE_SUFFIX.sub('', word)

# Derivational suffixes handled by the simplified stemmer.
_DERIVATIONAL_SUFFIX = re.compile(r'(i|kan|an)$')


def remove_derivational_suffixes(word):
    """Strip a trailing derivational suffix (-i, -kan or -an) from *word*.

    No lexicon is consulted, so any word ending in one of these letter
    sequences is shortened.
    """
    return _DERIVATIONAL_SUFFIX.sub('', word)

def remove_prefixes(word):
    """Strip a leading prefix from *word* using simplified ECS-like rules.

    Nasal prefixes followed by a vowel are rewritten first
    (``meny-`` -> ``s``, ``mem-`` -> ``p``, ``men-`` / ``meng-`` -> dropped).
    Then one plain prefix (ber-, per-, pel-, di-, ke-, se-, te-, be-, pe-)
    is removed from the result. No lexicon is consulted, so the output is a
    heuristic stem and may be over- or under-stripped.
    """
    w = word
    # meny- + vowel -> s + vowel (e.g. menyapu -> sapu)
    w = re.sub(r'^meny([aiueo].*)$', r's\1', w)
    w = re.sub(r'^men([aiueo].*)$', r'\1', w)
    w = re.sub(r'^mem([aiueo].*)$', r'p\1', w)
    w = re.sub(r'^meng([aiueo].*)$', r'\1', w)
    # Longest alternatives first: regex alternation accepts the first branch
    # that matches, so "be"/"pe" listed ahead of "ber"/"per"/"pel" would make
    # those branches unreachable and leave a stray letter (bermain -> rmain).
    w = re.sub(r'^(ber|per|pel|di|ke|se|te|be|pe)', '', w)
    return w

def ecs_stem(word):
    """Return a heuristic stem for *word*.

    Applies, in order: particle-suffix removal, derivational-suffix removal,
    prefix removal, and finally removal of a leading ``ku``/``kau``. If
    everything is stripped away, the unmodified input is returned instead.
    """
    stem = remove_prefixes(
        remove_derivational_suffixes(remove_particle_suffixes(word))
    )
    # Final cleanup: drop a leading ku-/kau- left after prefix removal.
    stem = re.sub(r'^(?:ku|kau)', '', stem)
    # Never hand back an empty stem; fall back to the original word.
    return stem or word

# Examples: print each sample word next to the stem the pipeline produces.
examples = [
    "mengambil", "makanan", "bermain", "permainan", "membaca",
    "penyanyi", "menyanyi", "memakan", "pemakan",
]
for word in examples:
    stem = ecs_stem(word)
    print(f"{word} -> {stem}")

In [None]:
def preprocess(text):
    """Run the full preprocessing pipeline on *text*.

    Returns:
        A dict holding every intermediate stage: ``'tokens'`` (tokenizer
        output), ``'filtered'`` (tokens that are not stopwords) and
        ``'stems'`` (the stem of each filtered token, in order).
    """
    stages = {'tokens': tokenize(text)}
    stages['filtered'] = [
        tok for tok in stages['tokens'] if tok not in INDONESIAN_STOPWORDS
    ]
    stages['stems'] = [ecs_stem(tok) for tok in stages['filtered']]
    return stages

# Apply the pipeline to every loaded document, keyed by document name.
doc_preproc = {
    doc_name: preprocess(raw_text) for doc_name, raw_text in documents.items()
}

# Show each stage of the pipeline for the sample document.
sample_stages = doc_preproc[sample_name]
for heading, stage in (
    ("Tokens (first 30):", 'tokens'),
    ("Filtered (first 30):", 'filtered'),
    ("Stems (first 30):", 'stems'),
):
    print(heading, sample_stages[stage][:30])

In [None]:
# Vocabulary: every distinct stem across all documents, in sorted order.
unique_stems = set()
for stages in doc_preproc.values():
    unique_stems.update(stages['stems'])
vocab = sorted(unique_stems)
print(f"Vocabulary size: {len(vocab)}")
print("Sample vocab terms:", vocab[:40])

In [None]:
# TF raw counts: one row per document (sorted by name), one column per term.
# The counts are written into a NumPy array first and wrapped in a DataFrame
# once, because assigning individual cells through tf.loc[...] is very slow
# for a documents x vocabulary table.
doc_names = sorted(documents.keys())
term_column = {term: col for col, term in enumerate(vocab)}
tf_counts = np.zeros((len(doc_names), len(vocab)), dtype=np.int64)
for row, doc in enumerate(doc_names):
    for term, cnt in Counter(doc_preproc[doc]['stems']).items():
        tf_counts[row, term_column[term]] = cnt
tf = pd.DataFrame(tf_counts, index=doc_names, columns=vocab)

# Show table for first 10 terms and first 8 documents
display(tf.iloc[:8, :10])

IDF formula used: IDF(term) = log(N / df(term))  where N = number of documents, df = document frequency (docs containing term).
We use natural log for readability.

In [None]:
# N is the number of documents; df counts, per term, the documents with tf > 0.
N = tf.shape[0]
contains_term = tf > 0
df = contains_term.sum(axis=0)


def _idf_value(doc_freq):
    """Return log(N / doc_freq), or 0 when the term occurs in no document."""
    if doc_freq > 0:
        return math.log(N / doc_freq)
    return 0


idf = df.apply(_idf_value)
idf_df = pd.DataFrame({'df': df, 'idf': idf})
display(idf_df.sort_values('df').head(10))

In [None]:
# TF-IDF: scale each term column of the float TF table by that term's IDF.
# mul(..., axis='columns') matches the idf entries to the columns by label.
tfidf = tf.astype(float).mul(idf, axis='columns')
# Show
display(tfidf.iloc[:8, :10])

In [None]:
# Sample query in Indonesian, passed through the same pipeline as the documents.
query = "cara memasak ayam goreng"
q_p = preprocess(query)
for heading, stage in (
    ("Query tokens:", 'tokens'),
    ("Query filtered:", 'filtered'),
    ("Query stems:", 'stems'),
):
    print(heading, q_p[stage])

In [None]:
def vectorize_stems(stems, vocab, idf_series):
    """Build a TF-IDF vector for *stems*, ordered like *vocab*.

    Args:
        stems: Iterable of stems (e.g. from a preprocessed query).
        vocab: Ordered sequence of terms; entry ``i`` of the result
            corresponds to ``vocab[i]``.
        idf_series: Object with ``.get(term, default)`` giving the IDF
            weight of a term; terms it does not know are weighted 0.0.

    Returns:
        A 1-D float NumPy array of ``count(term) * idf(term)`` values.
        Stems that are not in *vocab* are ignored.
    """
    counts = Counter(stems)
    weights = [counts[term] * idf_series.get(term, 0.0) for term in vocab]
    return np.array(weights, dtype=float)

# Document vectors matrix (docs x terms), taken from the TF-IDF table.
doc_matrix = tfidf.values
# Query vector built over the same vocabulary with the same IDF weights.
query_vec = vectorize_stems(q_p['stems'], vocab, idf)

# Cosine similarity
def cosine_sim(v1, v2):
    """Return the cosine similarity of vectors *v1* and *v2* as a float.

    Returns 0.0 when either vector has zero norm, which avoids a division
    by zero.
    """
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(v1, v2) / (norm1 * norm2))

# Score every document row against the query vector.
scores = {
    doc_name: cosine_sim(query_vec, doc_vec)
    for doc_name, doc_vec in zip(tfidf.index, doc_matrix)
}

# Show values, best match first.
score_df = pd.DataFrame.from_dict(scores, orient='index', columns=['score'])
score_df = score_df.sort_values('score', ascending=False)
display(score_df.head(20))

In [None]:
# Keep only documents with a positive score, best match first.
has_match = score_df['score'] > 0
top_results = score_df[has_match].sort_values('score', ascending=False)
print("Top ranked documents for query:", query)
display(top_results.head(10))

## Conclusion & Observations
- The pipeline shows step-by-step preprocessing: case folding, tokenization, stopword removal, ECS stemming.
- TF-IDF with cosine similarity provides a ranked list of documents that share stemmed terms with the query (the matching is lexical, not semantic).
- Limitations: Our ECS implementation is simplified and heuristic-based; production usage benefits from a lexical root-check or a more complete morphological analyzer.
- Next steps: add query expansion, use word embeddings, and add evaluation (precision/recall) with labelled queries.