In [None]:
import re
import pandas as pd
import unicodedata
from collections import Counter

In [None]:

# ------------------------------
# 🔠 Unicode Normalization
# ------------------------------
def normalize_unicode(text):
    # Normalize to composed form (preserves diacritics like ṃ, ā, ñ, ṭ, etc.)
    return unicodedata.normalize('NFC', text)

# ------------------------------
# 1️⃣ WORD-BASED TOKENIZER (Preserving diacritics)
# ------------------------------
def tokenize_words(text):
    # Normalize first
    text = normalize_unicode(text)
    # Match full Pali diacritic range + Latin letters + hyphen/apostrophe
    pattern = r"[A-Za-zĀāĪīŪūṀṁṄṅÑñṬṭḌḍṆṇḶḷḺḻŚśṢṣḤḥ’']+"
    tokens = re.findall(pattern, text)
    tokens = [t.lower() for t in tokens if t.strip()]
    return tokens

# ------------------------------
# 2️⃣ SENTENCE-BASED TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    text = normalize_unicode(text)
    sentences = re.split(r'[.!?–]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

# ------------------------------
# 3️⃣ DOCUMENT-BASED TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    return texts

# ------------------------------
# 4️⃣ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    return Counter(tokens)

# ------------------------------
# 5️⃣ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    return sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)

# ------------------------------
# 6️⃣ WORD–DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    all_words = sorted(set(word for doc in docs_tokens for word in doc))
    matrix_data = []

    for word in all_words:
        row = [tokens.count(word) for tokens in docs_tokens]
        matrix_data.append(row)

    df = pd.DataFrame(matrix_data, index=all_words,
                      columns=[f'Doc{i+1}' for i in range(len(docs_tokens))])
    return df


In [7]:

# ------------------------------
# 📄 Sample Roman Pali Docs
# ------------------------------
doc1 = """Evaṃ me sutaṃ – ekaṃ samayaṃ bhagavā rājagahe viharati jīvakassa komārabhaccassa ambavane."""
doc2 = """Rājā māgadho ajātasattu vedehiputto tadahuposathe pannarase komudiyā cātumāsiniyā puṇṇāya puṇṇamāya rattiyā."""

# Tokenize
docs = tokenize_documents([doc1, doc2])
docs_tokens = [tokenize_words(doc) for doc in docs]
sentences = tokenize_sentences(doc1)
freqs = [word_frequency(tokens) for tokens in docs_tokens]
sorted_freqs = [sort_by_frequency(f) for f in freqs]
matrix = build_word_document_matrix(docs_tokens)

print("🔹 Word Tokens (Doc1):", docs_tokens[0][:20], "...")
print("\n🔹 Sentence Tokens (Doc1):", sentences[:2])
print("\n🔹 Sorted Frequency (Doc1):", sorted_freqs[0][:10])
print("\n🔹 Word–Document Matrix:\n", matrix.head(10))

🔹 Word Tokens (Doc1): ['eva', 'me', 'suta', 'eka', 'samaya', 'bhagavā', 'rājagahe', 'viharati', 'jīvakassa', 'komārabhaccassa', 'ambavane'] ...

🔹 Sentence Tokens (Doc1): ['Evaṃ me sutaṃ – ekaṃ samayaṃ bhagavā rājagahe viharati jīvakassa komārabhaccassa ambavane']

🔹 Sorted Frequency (Doc1): [('eva', 1), ('me', 1), ('suta', 1), ('eka', 1), ('samaya', 1), ('bhagavā', 1), ('rājagahe', 1), ('viharati', 1), ('jīvakassa', 1), ('komārabhaccassa', 1)]

🔹 Word–Document Matrix:
                  Doc1  Doc2
ajātasattu          0     1
ambavane            1     0
bhagavā             1     0
cātumāsiniyā        0     1
eka                 1     0
eva                 1     0
jīvakassa           1     0
komudiyā            0     1
komārabhaccassa     1     0
me                  1     0


In [9]:
import re
import pandas as pd
import unicodedata
from collections import Counter

# ------------------------------
# 🔠 Unicode Normalization Fix
# ------------------------------
def normalize_unicode(text):
    text = unicodedata.normalize('NFKC', text)
    # Replace combining dot with proper 'ṃ' character
    text = re.sub(r"m[\u0307\u0323\u0310]", "ṃ", text)   # handle different dot forms
    text = re.sub(r"n[\u0307\u0323\u0310]", "ṇ", text)   # just in case for similar issue
    return text

# ------------------------------
# 1️⃣ WORD TOKENIZER (Roman Pali-safe)
# ------------------------------
def tokenize_words(text):
    text = normalize_unicode(text)
    pattern = r"[A-Za-zĀāĪīŪūṀṁṄṅÑñṬṭḌḍṆṇḶḷḺḻŚśṢṣḤḥṃ’']+"
    tokens = re.findall(pattern, text)
    tokens = [t.lower() for t in tokens if t.strip()]
    return tokens

# ------------------------------
# 2️⃣ SENTENCE TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    text = normalize_unicode(text)
    sentences = re.split(r'[.!?–]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

# ------------------------------
# 3️⃣ DOCUMENT TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    return texts

# ------------------------------
# 4️⃣ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    return Counter(tokens)

# ------------------------------
# 5️⃣ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    return sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)

# ------------------------------
# 6️⃣ WORD–DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    all_words = sorted(set(word for doc in docs_tokens for word in doc))
    matrix_data = []

    for word in all_words:
        row = [tokens.count(word) for tokens in docs_tokens]
        matrix_data.append(row)

    df = pd.DataFrame(matrix_data, index=all_words,
                      columns=[f'Doc{i+1}' for i in range(len(docs_tokens))])
    return df


# ------------------------------
# 📄 Sample Roman Pali Docs
# ------------------------------
doc1 = """Evaṃ me sutaṃ – ekaṃ samayaṃ bhagavā rājagahe viharati jīvakassa komārabhaccassa ambavane."""
doc2 = """Rājā māgadho ajātasattu vedehiputto tadahuposathe pannarase komudiyā cātumāsiniyā puṇṇāya puṇṇamāya rattiyā."""

# Tokenize
docs = tokenize_documents([doc1, doc2])
docs_tokens = [tokenize_words(doc) for doc in docs]
sentences = tokenize_sentences(doc1)
freqs = [word_frequency(tokens) for tokens in docs_tokens]
sorted_freqs = [sort_by_frequency(f) for f in freqs]
matrix = build_word_document_matrix(docs_tokens)

print("🔹 Word Tokens (Doc1):", docs_tokens[0])
print("\n🔹 Sentence Tokens (Doc1):", sentences)
print("\n🔹 Sorted Frequency (Doc1):", sorted_freqs[0])
print("\n🔹 Word–Document Matrix:\n", matrix)


🔹 Word Tokens (Doc1): ['evaṃ', 'me', 'sutaṃ', 'ekaṃ', 'samayaṃ', 'bhagavā', 'rājagahe', 'viharati', 'jīvakassa', 'komārabhaccassa', 'ambavane']

🔹 Sentence Tokens (Doc1): ['Evaṃ me sutaṃ', 'ekaṃ samayaṃ bhagavā rājagahe viharati jīvakassa komārabhaccassa ambavane']

🔹 Sorted Frequency (Doc1): [('evaṃ', 1), ('me', 1), ('sutaṃ', 1), ('ekaṃ', 1), ('samayaṃ', 1), ('bhagavā', 1), ('rājagahe', 1), ('viharati', 1), ('jīvakassa', 1), ('komārabhaccassa', 1), ('ambavane', 1)]

🔹 Word–Document Matrix:
                  Doc1  Doc2
ajātasattu          0     1
ambavane            1     0
bhagavā             1     0
cātumāsiniyā        0     1
ekaṃ                1     0
evaṃ                1     0
jīvakassa           1     0
komudiyā            0     1
komārabhaccassa     1     0
me                  1     0
māgadho             0     1
pannarase           0     1
puṇṇamāya           0     1
puṇṇāya             0     1
rattiyā             0     1
rājagahe            1     0
rājā                0     1