In [2]:
import re
import math
from collections import Counter

In [3]:
# Preprocess / normalize Khmer text

def normalize_khmer_spaces(text):
    """Replace invisible spaces and normalize multiple spaces"""
    text = text.replace('\u200b', ' ')  # zero-width space
    text = text.replace('\xa0', ' ')    # non-breaking space
    text = text.replace('\u200c', ' ')  # zero-width non-joiner
    text = text.replace('\u200d', ' ')  # zero-width joiner
    text = re.sub(r'\s+', ' ', text)    # replace multiple spaces with single space
    return text.strip()

def load_documents(path):
    with open(path, "r", encoding="utf-8") as f:
        docs = [line.strip() for line in f if line.strip()]
    # Normalize spaces
    docs = [normalize_khmer_spaces(doc) for doc in docs]
    return docs

docs_with_sw = load_documents("dataset_with_stopwords.txt")
docs_without_sw = load_documents("dataset_remove_stopwords.txt")



In [4]:
# Tokenize documents (split by space)
tokenized_docs_with_sw = [doc.split(" ") for doc in docs_with_sw]
tokenized_docs_without_sw = [doc.split(" ") for doc in docs_without_sw]

In [5]:
def compute_tf(doc_tokens):
    """Compute term frequency for a single document"""
    tf = {}
    word_count = Counter(doc_tokens)
    total_words = len(doc_tokens)
    for word, count in word_count.items():
        tf[word] = count / total_words
    return tf

tf_docs_with_sw = [compute_tf(doc) for doc in tokenized_docs_with_sw]
tf_docs_without_sw = [compute_tf(doc) for doc in tokenized_docs_without_sw]

In [6]:
#  Optimized IDF computation
def compute_idf_fast(tokenized_docs):
    """Compute IDF using Counter (single pass, much faster)"""
    N = len(tokenized_docs)
    df_counter = Counter()

    # Count in how many documents each word appears
    for doc in tokenized_docs:
        unique_words = set(doc)  # only count each word once per doc
        df_counter.update(unique_words)

    # Compute IDF
    idf = {word: math.log(N / (df_counter[word] + 1)) for word in df_counter}
    return idf

idf_with_sw = compute_idf_fast(tokenized_docs_with_sw)
idf_without_sw = compute_idf_fast(tokenized_docs_without_sw)

In [7]:
def compute_tfidf(tf_doc, idf):
    """Compute TF-IDF for a single document"""
    return {word: tf_value * idf.get(word, 0) for word, tf_value in tf_doc.items()}

tfidf_docs_with_sw = [compute_tfidf(tf, idf_with_sw) for tf in tf_docs_with_sw]
tfidf_docs_without_sw = [compute_tfidf(tf, idf_without_sw) for tf in tf_docs_without_sw]

In [8]:
#  Memory-efficient: Get top N words per document
def top_tfidf_words(tfidf_doc, top_n=10):
    """Return top N words and their TF-IDF scores"""
    return sorted(tfidf_doc.items(), key=lambda x: x[1], reverse=True)[:top_n]

# Example: top 10 words for first document
print("Top words with stopwords (first doc):")
for w, s in top_tfidf_words(tfidf_docs_with_sw[0]):
    print(f"{w}: {s:.4f}")

print("\nTop words without stopwords (first doc):")
for w, s in top_tfidf_words(tfidf_docs_without_sw[0]):
    print(f"{w}: {s:.4f}")

Top words with stopwords (first doc):
ឥណ្ឌា: 0.3013
របៀបវារៈ: 0.1445
ជ្រុងជ្រោយ: 0.1444
ការធ្វើដំណើរ: 0.1366
កិច្ចពិភាក្សា: 0.1366
ការអញ្ជើញ: 0.1357
សារៈសំខាន់: 0.1269
សេចក្តីថ្លែងការណ៍: 0.1197
ទាំងមូល: 0.1172
ទស្សនកិច្ច: 0.1146

Top words without stopwords (first doc):
ឥណ្ឌា: 0.5214
របៀបវារៈ: 0.2501
ជ្រុងជ្រោយ: 0.2499
ការធ្វើដំណើរ: 0.2364
កិច្ចពិភាក្សា: 0.2364
ការអញ្ជើញ: 0.2349
សារៈសំខាន់: 0.2196
សេចក្តីថ្លែងការណ៍: 0.2072
ទាំងមូល: 0.2029
ទស្សនកិច្ច: 0.1983


The TF-IDF scores increase for meaningful content words after removing stopwords.

In [11]:
from collections import defaultdict

# Compute average TF-IDF for each word across corpus
def average_tfidf(tfidf_docs):
    word_sum = defaultdict(float)
    word_count = defaultdict(int)
    
    for doc in tfidf_docs:
        for word, score in doc.items():
            word_sum[word] += score
            word_count[word] += 1
    
    avg_tfidf = {w: word_sum[w] / word_count[w] for w in word_sum}
    return dict(sorted(avg_tfidf.items(), key=lambda x: x[1], reverse=True))

avg_with_sw = average_tfidf(tfidf_docs_with_sw)
avg_without_sw = average_tfidf(tfidf_docs_without_sw)

# Display top 20 words across corpus
print("Top 20 words WITH stopwords (corpus-level):")
for w, s in list(avg_with_sw.items())[:20]:
    print(f"{w}: {s:.4f}")

print("\nTop 20 words WITHOUT stopwords (corpus-level):")
for w, s in list(avg_without_sw.items())[:20]:
    print(f"{w}: {s:.4f}")


Top 20 words WITH stopwords (corpus-level):
ឯកច្ឆន្ច: 6.6530
អសុពលភាព: 6.6530
ហ្សូអ៊ី: 5.5442
គតេ: 4.4353
ឣនុធម្មចារី: 3.6961
វេទព្វជាតក: 3.6961
កិម្បក្កជាតក: 3.6961
សុទ្ធ់: 3.6961
សីហធម្មជាតក: 3.6961
កន្ថរ៉ា: 3.6961
ហ្វហ: 3.6961
វានរជាតក: 3.6961
បច្ចុប្បន្នៈ: 3.6961
ចូច: 3.6961
ពុទ្ធារី: 3.6961
វីតា: 3.6961
ឧទាយិវគ្គ: 3.6961
អត្ថទ: 3.6961
ព្រះតន្តិ: 3.6961
ព្រះតន្តី: 3.6961

Top 20 words WITHOUT stopwords (corpus-level):
អសុពលភាព: 11.0883
ប្រូតូកាទុ: 11.0883
ត្រូវៈ: 11.0883
ឯកច្ឆន្ច: 8.3162
អ្នកសង្កេតឃើញ: 5.5442
ទៅកាន់ខ្សែភាពយន្ត: 5.5442
សាលួត: 5.5442
ឃុយ: 5.5442
អឿង: 5.5442
ដូលីន: 5.5442
បទិគះ: 5.5442
ភេញ: 5.5442
យមកៈ: 5.5442
រូបរាងៈ: 5.5442
រ៉ាដូ: 5.5442
បឹងឆ្មារ: 5.5442
ហ្សូអ៊ី: 5.5442
ភេក្នុង: 5.5442
ប៉ូកាសុី: 5.5442
ព្រះពុទ្ធញ្ញាណ: 5.5442


##### integrate Khmer stop-word dataset

In [12]:
import csv

In [13]:
STOPWORD_GROUPS = {
    "Conjunctions",
    "Pronouns",
    "Determiners & Quantifiers",
    "Prepositions / Relational Words",
    "Auxiliary Verbs / Aspect Markers",
    "Particles & Discourse Markers",
    "Question & Negation Words",
    "Function Nouns",
    "Numbers & Time Expressions",
    "Politeness & Honorifics"
}


In [14]:
def load_stopwords_from_annotated_csv(csv_path):
    stopwords = set()

    with open(csv_path, encoding="utf-8-sig") as f:
        reader = csv.DictReader(f, delimiter=",")

        print("Detected columns:", reader.fieldnames)

        for row in reader:
            term = row["term"].strip()
            group = row["linguistic_group"].strip().lower()

            # Remove everything EXCEPT content words
            if "content word" not in group:
                stopwords.add(term)

    return stopwords


KHMER_STOPWORDS = load_stopwords_from_annotated_csv("FIle_Stopwords.csv")
print("Loaded Khmer stopwords:", len(KHMER_STOPWORDS))


Detected columns: ['term', 'linguistic_group']
Loaded Khmer stopwords: 615


In [15]:
KHMER_STOPWORDS

{'កន្លែងណាមួយ',
 'កម្រិត',
 'កាន់តែ',
 'កាន់តែច្រើន',
 'ការជ្រៀតជ្រែក',
 'ការបញ្ចប់',
 'ការបើកចំហ',
 'ការផ្លាស់ប្តូរ',
 'ការរៀបចំ',
 'ការសម្រេច',
 'កាល',
 'កាលណា',
 'កាលនោះ',
 'កាលបើ',
 'កាលពី',
 'កាលពីមុន',
 'កើត',
 'កំពុង',
 'ក៏',
 'ក៏ត្រូវ',
 'ក្នុង',
 'ក្បែរ',
 'ក្បែរនេះ',
 'ក្រោម',
 'ក្រោយ',
 'ខ្លួន',
 'គាត់',
 'គួរតែ',
 'គួរសម',
 'គួរឲ្យកត់សម្គាល់',
 'គេ',
 'គេប៉ុណ្ណោះ',
 'គ្មានកន្លែង',
 'គ្មាននរណា',
 'គ្រប់',
 'គ្រប់គ្នា',
 'គ្រាន់តែ',
 'ឃើញតែ',
 'ចង់',
 'ចាប់តាំងពី',
 'ចុះ',
 'ចូល',
 'ចូលទៅ',
 'ចូលរួម',
 'ចេញ',
 'ចេញពី',
 'ចំណែក',
 'ចំណែកឯ',
 'ចំនួន',
 'ចំនួនច្រើន',
 'ចំនួនបន្ថែម',
 'ចំពោះ',
 'ចំពោះបញ្ហា',
 'ច្បាស់ណាស់',
 'ច្បាស់លាស់',
 'ច្រើន',
 'ច្រើនជាងគេ',
 'ច្រើនណាស់',
 'ឆ្លាស់គ្នា',
 'ឆ្លាស់ទៅ',
 'ឆ្លើយ',
 'ឆ្លើយឆ្លង',
 'ឆ្លើយតប',
 'ឆ្លៀតកន្លែង',
 'ឆ្លៀតការបញ្ចប់',
 'ឆ្លៀតគ្នា',
 'ឆ្លៀតឃើញ',
 'ឆ្លៀតចាប់ផ្តើម',
 'ឆ្លៀតចូលរួម',
 'ឆ្លៀតចំណេញ',
 'ឆ្លៀតចំណែក',
 'ឆ្លៀតជាមួយនឹង',
 'ឆ្លៀតដាក់',
 'ឆ្លៀតតែងតាំង',
 'ឆ្លៀតតំរង់',
 'ឆ្លៀតត្រូវ',
 'ឆ្លៀតទៅ',
 'ឆ្លៀតធ្វើ',
 'ឆ្លៀតបង្កើត

In [16]:
# Remove stopwords from tokenized docs

def remove_stopwords(tokenized_docs, stopwords):
    """Return tokenized documents with stopwords removed"""
    return [[w for w in doc if w not in stopwords] for doc in tokenized_docs]

tokenized_docs_refined = remove_stopwords(tokenized_docs_with_sw, KHMER_STOPWORDS)



In [17]:
# Recompute TF, IDF, TF-IDF for refined docs

tf_docs_refined = [compute_tf(doc) for doc in tokenized_docs_refined]
idf_refined = compute_idf_fast(tokenized_docs_refined)
tfidf_docs_refined = [compute_tfidf(tf, idf_refined) for tf in tf_docs_refined]

avg_refined = average_tfidf(tfidf_docs_refined)

In [18]:
# Display comparison

def print_top_words(title, avg_tfidf, top_n=20):
    print(f"\n{title}")
    for w, s in list(avg_tfidf.items())[:top_n]:
        print(f"{w}: {s:.4f}")

print_top_words("Top 20 words WITH original stopwords", avg_with_sw)
print_top_words("Top 20 words WITHOUT initial stopwords", avg_without_sw)
print_top_words("Top 20 words AFTER applying Khmer stopword list", avg_refined)


Top 20 words WITH original stopwords
ឯកច្ឆន្ច: 6.6530
អសុពលភាព: 6.6530
ហ្សូអ៊ី: 5.5442
គតេ: 4.4353
ឣនុធម្មចារី: 3.6961
វេទព្វជាតក: 3.6961
កិម្បក្កជាតក: 3.6961
សុទ្ធ់: 3.6961
សីហធម្មជាតក: 3.6961
កន្ថរ៉ា: 3.6961
ហ្វហ: 3.6961
វានរជាតក: 3.6961
បច្ចុប្បន្នៈ: 3.6961
ចូច: 3.6961
ពុទ្ធារី: 3.6961
វីតា: 3.6961
ឧទាយិវគ្គ: 3.6961
អត្ថទ: 3.6961
ព្រះតន្តិ: 3.6961
ព្រះតន្តី: 3.6961

Top 20 words WITHOUT initial stopwords
អសុពលភាព: 11.0883
ប្រូតូកាទុ: 11.0883
ត្រូវៈ: 11.0883
ឯកច្ឆន្ច: 8.3162
អ្នកសង្កេតឃើញ: 5.5442
ទៅកាន់ខ្សែភាពយន្ត: 5.5442
សាលួត: 5.5442
ឃុយ: 5.5442
អឿង: 5.5442
ដូលីន: 5.5442
បទិគះ: 5.5442
ភេញ: 5.5442
យមកៈ: 5.5442
រូបរាងៈ: 5.5442
រ៉ាដូ: 5.5442
បឹងឆ្មារ: 5.5442
ហ្សូអ៊ី: 5.5442
ភេក្នុង: 5.5442
ប៉ូកាសុី: 5.5442
ព្រះពុទ្ធញ្ញាណ: 5.5442

Top 20 words AFTER applying Khmer stopword list
អសុពលភាព: 11.0883
ប្រូតូកាទុ: 11.0883
ឯកច្ឆន្ច: 8.3162
សាលួត: 5.5442
ឃុយ: 5.5442
អឿង: 5.5442
ដូលីន: 5.5442
បទិគះ: 5.5442
យមកៈ: 5.5442
រូបរាងៈ: 5.5442
រ៉ាដូ: 5.5442
បឹងឆ្មារ: 5.5442
ហ្សូអ៊ី: 5.5442
ប៉ូកាសុី: 5.5

The TF-IDF computation highlights the most statistically significant words in the corpus. As expected, the top TF-IDF words are content words, proper nouns, or domain-specific terms rather than grammatical stopwords (ex: អសុពលភាព, ប្រូតូកាទុ), because true stopwords appear in almost all documents, giving them very low IDF values.

In [19]:
# Step 5: Suggest candidate stopwords (optional)
# Words still high in TF-IDF but not content words

top_candidates = [w for w, s in list(avg_refined.items())[:50]]  # top 50
print("\nCandidate stopwords to consider adding (manual review):")
print(top_candidates)



Candidate stopwords to consider adding (manual review):
['អសុពលភាព', 'ប្រូតូកាទុ', 'ឯកច្ឆន្ច', 'សាលួត', 'ឃុយ', 'អឿង', 'ដូលីន', 'បទិគះ', 'យមកៈ', 'រូបរាងៈ', 'រ៉ាដូ', 'បឹងឆ្មារ', 'ហ្សូអ៊ី', 'ប៉ូកាសុី', 'ធម្មខក្ខន្ធ', 'គីស៊ុន', 'កកសស', 'បរិប័ន្ន', 'គតេ', 'ព្រះព្រហ្មាធិរាជ', 'ព្រងិល', 'ឣនុបាទិយានោ', 'ឣនុធម្មចារី', 'វេទព្វជាតក', 'កិម្បក្កជាតក', 'សុទ្ធ់', 'សីហធម្មជាតក', 'បច្ចេកវិ', 'កន្ថរ៉ា', 'ហ្វហ', 'ហ្គ្រីឡូ', 'សទ្ទកម្ម', 'វានរជាតក', 'ប្រវត្តិការងារ', 'បច្ចុប្បន្នៈ', 'អ្នកសង្កេតឃើញ', 'ដើជ្រៃ', 'ចូច', 'ពុទ្ធារី', 'វីតា', 'ឧទាយិវគ្គ', 'អត្ថទ', 'ព្រះតន្តិ', 'ព្រះតន្តី', 'ទៅកាន់ខ្សែភាពយន្ត', 'ដារ៉', 'ម៉ាញ់', 'ឡឿក', 'បាពហុ', 'សន្ធាយ']


In [None]:
# from collections import Counter

# def suggest_candidate_stopwords(tfidf_docs, top_n=50, min_doc_ratio=0.01):
#     """
#     Suggest candidate stopwords based on:
#     - Top N words by average TF-IDF
#     - Appear in at least `min_doc_ratio` fraction of documents
#     """
#     N_docs = len(tfidf_docs)
    
#     # Compute average TF-IDF
#     avg_tfidf = average_tfidf(tfidf_docs)
    
#     # Take top N words
#     top_words = list(avg_tfidf.keys())[:top_n]
    
#     # Count in how many documents each word appears
#     word_doc_count = Counter()
#     for doc in tfidf_docs:
#         for word in doc:
#             if word in top_words:
#                 word_doc_count[word] += 1
    
#     # Filter by document frequency ratio
#     candidates = [word for word, count in word_doc_count.items() if count / N_docs >= min_doc_ratio]
#     return candidates

# # Example
# candidate_stopwords_with_sw = suggest_candidate_stopwords(tfidf_docs_with_sw, top_n=50, min_doc_ratio=0.01)
# candidate_stopwords_without_sw = suggest_candidate_stopwords(tfidf_docs_without_sw, top_n=50, min_doc_ratio=0.01)

# print("Candidate stopwords (from WITH original stopwords):")
# print(candidate_stopwords_with_sw)

# print("\nCandidate stopwords (from WITHOUT initial stopwords):")
# print(candidate_stopwords_without_sw)


Candidate stopwords (from WITH original stopwords):
[]

Candidate stopwords (from WITHOUT initial stopwords):
[]


###### Simple IR Model Using TF-IDF and Top-K Comparison

In [21]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two sparse TF-IDF dicts"""
    common_words = set(vec1.keys()) & set(vec2.keys())
    num = sum(vec1[w] * vec2[w] for w in common_words)
    denom = (np.sqrt(sum(v**2 for v in vec1.values())) * 
             np.sqrt(sum(v**2 for v in vec2.values())))
    return num / denom if denom != 0 else 0

def rank_documents(query_vec, tfidf_docs, top_k=5):
    """Return top K document indices ranked by similarity"""
    scores = [(i, cosine_similarity(query_vec, doc)) for i, doc in enumerate(tfidf_docs)]
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

# Example: use first document as query
query_doc = tfidf_docs_with_sw[0]

top_k_with_sw = rank_documents(query_doc, tfidf_docs_with_sw, top_k=5)
top_k_without_sw = rank_documents(query_doc, tfidf_docs_without_sw, top_k=5)

print("Top 5 similar documents WITH stopwords:")
print(top_k_with_sw)

print("\nTop 5 similar documents WITHOUT stopwords:")
print(top_k_without_sw)

# Compare overlap
indices_with_sw = set(idx for idx, _ in top_k_with_sw)
indices_without_sw = set(idx for idx, _ in top_k_without_sw)
overlap = indices_with_sw & indices_without_sw
print(f"\nOverlap in Top-5 documents: {len(overlap)} / 5")


Top 5 similar documents WITH stopwords:
[(0, 1.0000000000000002), (20044, 0.3732776475978078), (39001, 0.3665936743440593), (83277, 0.35764972611312484), (62285, 0.3469664539784011)]

Top 5 similar documents WITHOUT stopwords:
[(0, 0.9036564461309989), (20044, 0.3591125468901498), (116781, 0.3560611253905191), (62285, 0.354057607202223), (83277, 0.3520607628912101)]

Overlap in Top-5 documents: 4 / 5


We tested the impact of stopword removal on a TF-IDF based IR model. The Top-5 document retrieval using cosine similarity showed 4/5 overlap between with and without stopwords, indicating that removing stopwords slightly changed the ranking. 

This confirms that the Khmer stopword removal helps reduce noise without significantly impacting the retrieval of relevant documents.