In [1]:
import re
import pandas as pd
from collections import Counter

In [9]:
# ------------------------------
# 1. WORD-BASED TOKENIZER (Improved for Devanagari/Pali)
# ------------------------------
def tokenize_words(text):
    """Split text into word tokens made of Devanagari characters.

    A token is a maximal run of code points taken from either
    U+0900-U+0963 (signs, independent vowels, consonants, dependent vowel
    signs, virama, nukta, avagraha, ...) or the Devanagari digits
    U+0966-U+096F. Everything else -- whitespace, Latin text, ASCII
    punctuation, dashes, and the danda / double danda (U+0964, U+0965) --
    acts as a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance; an empty list when
        nothing in ``text`` matches.
    """
    # The class is spelled with \u escapes instead of literal glyphs so the
    # pattern stays correct even if the file is re-encoded or mis-decoded.
    # U+0964/U+0965 are deliberately skipped so sentence marks never become
    # part of a word. U+0966-U+096F adds the digits this tokenizer is
    # documented to accept; the earlier class stopped at U+0963 and silently
    # dropped them.
    return re.findall(r'[\u0900-\u0963\u0966-\u096F]+', text)

# ------------------------------
# 2. SENTENCE-BASED TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    # Split by danda '‡•§', double danda '‡••', or standard punctuation
    sentences = re.split(r'[‡•§‡••.!?‚Ä¶]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

# ------------------------------
# 3. DOCUMENT-BASED TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    """Return the given collection of documents unchanged.

    Document-level tokenization is a pass-through here: every element of
    ``texts`` already counts as one document, so the same object that was
    passed in is handed straight back.

    Args:
        texts: A collection of document strings.

    Returns:
        The ``texts`` argument itself.
    """
    documents = texts
    return documents

# ------------------------------
# 4. WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: An iterable of hashable tokens (typically word strings).

    Returns:
        A ``collections.Counter`` mapping each distinct token to its count.
    """
    return Counter(tokens)

# ------------------------------
# 5. SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    """Order (word, count) pairs from most to least frequent.

    Args:
        freq_dict: A mapping from word to count.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        The sort is stable, so words with equal counts keep the order in
        which the mapping yields them.
    """
    def count_of(entry):
        return entry[1]

    ranked = list(freq_dict.items())
    ranked.sort(key=count_of, reverse=True)
    return ranked

# ------------------------------
# 6. WORD-DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    """Build a word-by-document count matrix.

    Args:
        docs_tokens: A sequence of token lists, one list per document.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct word (sorted) and
        one column per document, named ``Doc1``, ``Doc2``, ... in input
        order. Each cell holds the number of times the word occurs in that
        document, 0 when it does not occur.
    """
    # Tally every document exactly once. Calling list.count() for each
    # (word, document) pair rescans the whole token list per vocabulary word.
    doc_counts = [Counter(tokens) for tokens in docs_tokens]

    # The keys of the per-document counters together form the vocabulary.
    all_words = sorted(set().union(*doc_counts))

    # Counter returns 0 for a missing key and does not insert it.
    matrix_data = [[counts[word] for counts in doc_counts] for word in all_words]

    columns = [f'Doc{i+1}' for i in range(len(doc_counts))]
    return pd.DataFrame(matrix_data, index=all_words, columns=columns)


In [10]:
# ------------------------------
# Sample raw Pali texts (unclean)
# ------------------------------
# doc1: contains a dash between clauses, one danda sentence break, and
# trailing ASCII dots, so it exercises both the word and sentence tokenizers.
doc1 = "‡§è‡§µ‡§Ç ‡§Æ‡•á ‡§∏‡•Å‡§§‡§Ç ‚Äì ‡§è‡§ï‡§Ç ‡§∏‡§Æ‡§Ø‡§Ç ‡§≠‡§ó‡§µ‡§æ ‡§∞‡§æ‡§ú‡§ó‡§π‡•á ‡§µ‡§ø‡§π‡§∞‡§§‡§ø‡•§ ‡§§‡•á‡§® ‡§∏‡§Æ‡§Ø‡•á‡§®..."
# doc2: contains a double danda mid-text and ends with a danda; its closing
# phrase repeats the opening words of doc1, giving the matrix shared rows.
doc2 = "‡§≠‡§ø‡§ï‡•ç‡§ñ‡•Å‡§∏‡§ô‡•ç‡§ò‡•á‡§® ‡§∏‡§¶‡•ç‡§ß‡§ø‡§Ç ‡§Ö‡§°‡•ç‡§¢‡§§‡•á‡§≥‡§∏‡•á‡§π‡§ø ‡§≠‡§ø‡§ï‡•ç‡§ñ‡•Å‡§∏‡§§‡•á‡§π‡§ø‡•• ‡§è‡§µ‡§Ç ‡§Æ‡•á ‡§∏‡•Å‡§§‡§Ç‡•§"

# Each sample string is already one document, so this step is a pass-through.
docs = tokenize_documents([doc1, doc2])

# One list of word tokens per document, in document order.
docs_tokens = list(map(tokenize_words, docs))

# Sentence splitting is demonstrated on the first document only.
sentences = tokenize_sentences(doc1)

# Per-document counts, their frequency-sorted views, and the combined matrix.
freqs = list(map(word_frequency, docs_tokens))
sorted_freqs = list(map(sort_by_frequency, freqs))
matrix = build_word_document_matrix(docs_tokens)

# Report the results for the first document, then the matrix for all documents.
# The leading "\n" in the later labels puts a blank line between sections.
print("üîπ Word Tokens (Doc1):", docs_tokens[0])
print("\nüîπ Sentence Tokens (Doc1):", sentences)
print("\nüîπ Sorted Frequency (Doc1):", sorted_freqs[0])
# The trailing "\n" in the label starts the DataFrame's text rendering on its own line.
print("\nüîπ Word‚ÄìDocument Matrix:\n", matrix)


üîπ Word Tokens (Doc1): ['‡§è‡§µ‡§Ç', '‡§Æ‡•á', '‡§∏‡•Å‡§§‡§Ç', '‡§è‡§ï‡§Ç', '‡§∏‡§Æ‡§Ø‡§Ç', '‡§≠‡§ó‡§µ‡§æ', '‡§∞‡§æ‡§ú‡§ó‡§π‡•á', '‡§µ‡§ø‡§π‡§∞‡§§‡§ø', '‡§§‡•á‡§®', '‡§∏‡§Æ‡§Ø‡•á‡§®']

üîπ Sentence Tokens (Doc1): ['‡§è‡§µ‡§Ç ‡§Æ‡•á ‡§∏‡•Å‡§§‡§Ç ‚Äì ‡§è‡§ï‡§Ç ‡§∏‡§Æ‡§Ø‡§Ç ‡§≠‡§ó‡§µ‡§æ ‡§∞‡§æ‡§ú‡§ó‡§π‡•á ‡§µ‡§ø‡§π‡§∞‡§§‡§ø', '‡§§‡•á‡§® ‡§∏‡§Æ‡§Ø‡•á‡§®']

üîπ Sorted Frequency (Doc1): [('‡§è‡§µ‡§Ç', 1), ('‡§Æ‡•á', 1), ('‡§∏‡•Å‡§§‡§Ç', 1), ('‡§è‡§ï‡§Ç', 1), ('‡§∏‡§Æ‡§Ø‡§Ç', 1), ('‡§≠‡§ó‡§µ‡§æ', 1), ('‡§∞‡§æ‡§ú‡§ó‡§π‡•á', 1), ('‡§µ‡§ø‡§π‡§∞‡§§‡§ø', 1), ('‡§§‡•á‡§®', 1), ('‡§∏‡§Æ‡§Ø‡•á‡§®', 1)]

üîπ Word‚ÄìDocument Matrix:
               Doc1  Doc2
‡§Ö‡§°‡•ç‡§¢‡§§‡•á‡§≥‡§∏‡•á‡§π‡§ø      0     1
‡§è‡§ï‡§Ç              1     0
‡§è‡§µ‡§Ç              1     1
‡§§‡•á‡§®              1     0
‡§≠‡§ó‡§µ‡§æ             1     0
‡§≠‡§ø‡§ï‡•ç‡§ñ‡•Å‡§∏‡§ô‡•ç‡§ò‡•á‡§®     0     1
‡§≠‡§ø‡§ï‡•ç‡§ñ‡•Å‡§∏‡§§‡•á‡§π‡§ø      0     1
‡§Æ‡•á               1     1
‡§∞‡§æ‡§ú‡§ó‡§π‡•á           1     0
‡§µ‡§ø‡§π‡