In [1]:
import re
import pandas as pd
from collections import Counter

In [9]:
# ------------------------------
# 1️⃣ WORD-BASED TOKENIZER (Improved for Devanagari/Pali)
# ------------------------------
def tokenize_words(text):
    """Extract Devanagari word tokens from ``text``.

    A token is a maximal run of characters taken from two code point ranges:

    * U+0900-U+0963 -- Devanagari letters together with their combining
      marks (candrabindu, anusvara, visarga, nukta, avagraha, dependent
      vowel signs, virama, ...);
    * U+0966-U+096F -- Devanagari digits.

    Every other character separates tokens and is discarded. That includes
    whitespace, dashes, Latin text, and the danda / double danda
    (U+0964, U+0965), which lie between the two ranges.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Tokens in order of appearance; empty if nothing matches.
    """
    # Code points are spelled as escapes so the ranges stay unambiguous:
    # combining marks typed literally inside a character class are easy to
    # duplicate or misplace without noticing. The second range keeps
    # numerals such as U+0967 U+0968 as tokens instead of dropping them.
    return re.findall(r'[\u0900-\u0963\u0966-\u096F]+', text)

# ------------------------------
# 2️⃣ SENTENCE-BASED TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    """Break ``text`` into sentences at terminal punctuation.

    A run of one or more of the danda, the double danda, '.', '!', '?' or
    the ellipsis character is treated as a single sentence boundary. Each
    resulting piece is stripped of surrounding whitespace, and pieces that
    end up empty are left out.

    Args:
        text: The string to split.

    Returns:
        list[str]: Non-empty, whitespace-trimmed sentences in original order.
    """
    collected = []
    for fragment in re.split(r'[।॥.!?…]+', text):
        trimmed = fragment.strip()
        if trimmed:
            collected.append(trimmed)
    return collected

# ------------------------------
# 3️⃣ DOCUMENT-BASED TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    """Document-level tokenization: each input text is one document.

    Nothing is split or copied; the same ``texts`` object that was passed
    in is handed back unchanged.

    Args:
        texts: The collection of document strings.

    Returns:
        The ``texts`` argument itself.
    """
    documents = texts
    return documents

# ------------------------------
# 4️⃣ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    """Tally how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        collections.Counter: Mapping from token to its occurrence count.
    """
    tally = Counter()
    tally.update(tokens)
    return tally

# ------------------------------
# 5️⃣ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    """Rank the entries of a frequency mapping from most to least frequent.

    Args:
        freq_dict: Mapping from token to count.

    Returns:
        list[tuple]: ``(token, count)`` pairs ordered by descending count.
        Pairs with equal counts keep the order they have in ``freq_dict``.
    """
    ranked = list(freq_dict.items())
    # list.sort is stable, also with reverse=True, so ties stay in input order.
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

# ------------------------------
# 6️⃣ WORD–DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    """Build a word-by-document count matrix.

    Args:
        docs_tokens: Sequence of token collections, one per document.

    Returns:
        pandas.DataFrame: One row per distinct token (rows in sorted token
        order) and one column per document, labelled 'Doc1', 'Doc2', ... in
        input order. Each cell holds how often that token occurs in that
        document (0 when absent).
    """
    # Count every document exactly once. Calling list.count() for each
    # (word, document) pair would rescan the full token list once per
    # vocabulary entry.
    doc_counts = [Counter(tokens) for tokens in docs_tokens]

    all_words = sorted({word for counts in doc_counts for word in counts})

    # Looking up a missing key in a Counter yields 0 without inserting it.
    matrix_data = [[counts[word] for counts in doc_counts] for word in all_words]

    column_labels = [f'Doc{i+1}' for i in range(len(doc_counts))]
    return pd.DataFrame(matrix_data, index=all_words, columns=column_labels)


In [10]:
# ------------------------------
# Sample raw Pali texts (unclean)
# ------------------------------
doc1 = "एवं मे सुतं – एकं समयं भगवा राजगहे विहरति। तेन समयेन..."
doc2 = "भिक्खुसङ्घेन सद्धिं अड्ढतेळसेहि भिक्खुसतेहि॥ एवं मे सुतं।"

# Document level: each passage is one document
docs = tokenize_documents([doc1, doc2])

# Word level: one token list per document
docs_tokens = list(map(tokenize_words, docs))

# Sentence level: first document only
sentences = tokenize_sentences(doc1)

# Per-document counts, their ranked form, and the combined count matrix
freqs = list(map(word_frequency, docs_tokens))
sorted_freqs = list(map(sort_by_frequency, freqs))
matrix = build_word_document_matrix(docs_tokens)

# Report the results for the first document, then the matrix for both
print("🔹 Word Tokens (Doc1):", docs_tokens[0])
print("\n🔹 Sentence Tokens (Doc1):", sentences)
print("\n🔹 Sorted Frequency (Doc1):", sorted_freqs[0])
print("\n🔹 Word–Document Matrix:\n", matrix)


🔹 Word Tokens (Doc1): ['एवं', 'मे', 'सुतं', 'एकं', 'समयं', 'भगवा', 'राजगहे', 'विहरति', 'तेन', 'समयेन']

🔹 Sentence Tokens (Doc1): ['एवं मे सुतं – एकं समयं भगवा राजगहे विहरति', 'तेन समयेन']

🔹 Sorted Frequency (Doc1): [('एवं', 1), ('मे', 1), ('सुतं', 1), ('एकं', 1), ('समयं', 1), ('भगवा', 1), ('राजगहे', 1), ('विहरति', 1), ('तेन', 1), ('समयेन', 1)]

🔹 Word–Document Matrix:
               Doc1  Doc2
अड्ढतेळसेहि      0     1
एकं              1     0
एवं              1     1
तेन              1     0
भगवा             1     0
भिक्खुसङ्घेन     0     1
भिक्खुसतेहि      0     1
मे               1     1
राजगहे           1     0
विहरति           1     0
सद्धिं           0     1
समयं             1     0
समयेन            1     0
सुतं             1     1
