In [10]:
import re
import pandas as pd
import unicodedata
from collections import Counter


In [11]:

# ------------------------------
# 🔠 Unicode Normalization Fix
# ------------------------------
def normalize_unicode(text):
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r"m[\u0307\u0323\u0310]", "ṃ", text)
    text = re.sub(r"n[\u0307\u0323\u0310]", "ṇ", text)
    return text

# ------------------------------
# 1️⃣ WORD TOKENIZER (Roman Pali-safe)
# ------------------------------
def tokenize_words(text):
    text = normalize_unicode(text)
    pattern = r"[A-Za-zĀāĪīŪūṀṁṄṅÑñṬṭḌḍṆṇḶḷḺḻŚśṢṣḤḥṃ’']+"
    tokens = re.findall(pattern, text)
    tokens = [t.lower() for t in tokens if t.strip()]
    return tokens

# ------------------------------
# 2️⃣ SENTENCE TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    text = normalize_unicode(text)
    sentences = re.split(r'[.!?–]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

# ------------------------------
# 3️⃣ DOCUMENT TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    return texts

# ------------------------------
# 4️⃣ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    return Counter(tokens)

# ------------------------------
# 5️⃣ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    return sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)

# ------------------------------
# 6️⃣ WORD–DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    all_words = sorted(set(word for doc in docs_tokens for word in doc))
    matrix_data = []
    for word in all_words:
        row = [tokens.count(word) for tokens in docs_tokens]
        matrix_data.append(row)
    df = pd.DataFrame(matrix_data, index=all_words,
                      columns=[f'Doc{i+1}' for i in range(len(docs_tokens))])
    return df

# ------------------------------
# 7️⃣ LOAD DOCUMENTS (Ask User)
# ------------------------------
def load_documents_interactively():
    docs = []
    num_files = int(input("📂 How many text documents do you want to load? ➤ "))

    for i in range(num_files):
        file_path = input(f"👉 Enter path for document {i+1} (e.g., doc{i+1}.txt): ").strip()
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                docs.append(content)
        except FileNotFoundError:
            print(f"⚠️ File not found: {file_path}")
    return docs


### error code

In [4]:

# ------------------------------
# ⚙️ MAIN EXECUTION
# ------------------------------
if __name__ == "__main__":
    print("🔸 Pali Roman Script Text Processor 🔸\n")

    # Load documents from user input
    docs = load_documents_interactively()

    if not docs:
        print("❌ No valid documents loaded. Exiting...")
    else:
        # Tokenize and analyze
        docs_tokens = [tokenize_words(doc) for doc in docs]
        sentences = tokenize_sentences(docs[0])
        freqs = [word_frequency(tokens) for tokens in docs_tokens]
        sorted_freqs = [sort_by_frequency(f) for f in freqs]
        matrix = build_word_document_matrix(docs_tokens)

        # Display results
        print("\n✅ Processing Completed!\n")
        print("🔹 Word Tokens (Doc1):", docs_tokens[0][:20], "...")
        print("\n🔹 Sentence Tokens (Doc1):", sentences[:3], "...")
        print("\n🔹 Sorted Frequency (Doc1):", sorted_freqs[0][:10])
        print("\n🔹 Word–Document Matrix:\n", matrix)


🔸 Pali Roman Script Text Processor 🔸



ValueError: invalid literal for int() with base 10: ''

## code after error

In [12]:

# ------------------------------
# 🔠 Unicode Normalization Fix
# ------------------------------
def normalize_unicode(text):
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r"m[\u0307\u0323\u0310]", "ṃ", text)
    text = re.sub(r"n[\u0307\u0323\u0310]", "ṇ", text)
    return text

# ------------------------------
# 1️⃣ WORD TOKENIZER (Roman Pali-safe)
# ------------------------------
def tokenize_words(text):
    text = normalize_unicode(text)
    pattern = r"[A-Za-zĀāĪīŪūṀṁṄṅÑñṬṭḌḍṆṇḶḷḺḻŚśṢṣḤḥṃ’']+"
    tokens = re.findall(pattern, text)
    tokens = [t.lower() for t in tokens if t.strip()]
    return tokens

# ------------------------------
# 2️⃣ SENTENCE TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    text = normalize_unicode(text)
    sentences = re.split(r'[.!?–]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

# ------------------------------
# 3️⃣ DOCUMENT TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    return texts

# ------------------------------
# 4️⃣ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    return Counter(tokens)

# ------------------------------
# 5️⃣ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    return sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)

# ------------------------------
# 6️⃣ WORD–DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    all_words = sorted(set(word for doc in docs_tokens for word in doc))
    matrix_data = []
    for word in all_words:
        row = [tokens.count(word) for tokens in docs_tokens]
        matrix_data.append(row)
    df = pd.DataFrame(matrix_data, index=all_words,
                      columns=[f'Doc{i+1}' for i in range(len(docs_tokens))])
    return df

# ------------------------------
# 7️⃣ LOAD DOCUMENTS (Ask User)
# ------------------------------
def load_documents_interactively():
    docs = []
    while True:
        num_files_input = input("📂 How many text documents do you want to load? ➤ ").strip()
        if not num_files_input.isdigit():
            print("⚠️ Please enter a valid number (e.g., 1, 2, 3).")
            continue
        num_files = int(num_files_input)
        break

    for i in range(num_files):
        file_path = input(f"👉 Enter path for document {i+1} (e.g., doc{i+1}.txt): ").strip()
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                docs.append(content)
        except FileNotFoundError:
            print(f"⚠️ File not found: {file_path}")
    return docs


In [13]:

# ------------------------------
# ⚙️ MAIN EXECUTION
# ------------------------------
if __name__ == "__main__":
    print("🔸 Pali Roman Script Text Processor 🔸\n")

    # Load documents from user input
    docs = load_documents_interactively()

    if not docs:
        print("❌ No valid documents loaded. Exiting...")
    else:
        # Tokenize and analyze
        docs_tokens = [tokenize_words(doc) for doc in docs]
        sentences = tokenize_sentences(docs[0])
        freqs = [word_frequency(tokens) for tokens in docs_tokens]
        sorted_freqs = [sort_by_frequency(f) for f in freqs]
        matrix = build_word_document_matrix(docs_tokens)

        # Display results
        print("\n✅ Processing Completed!\n")
        print("🔹 Word Tokens (Doc1):", docs_tokens[0][:20], "...")
        print("\n🔹 Sentence Tokens (Doc1):", sentences[:3], "...")
        print("\n🔹 Sorted Frequency (Doc1):", sorted_freqs[0][:10])
        print("\n🔹 Word–Document Matrix:\n", matrix)


🔸 Pali Roman Script Text Processor 🔸


✅ Processing Completed!

🔹 Word Tokens (Doc1): ['sāmaññaphalasuttaṃ', 'rājāmaccakathā', 'evaṃ', 'me', 'sutaṃ', 'ekaṃ', 'samayaṃ', 'bhagavā', 'rājagahe', 'viharati', 'jīvakassa', 'komārabhaccassa', 'ambavane', 'mahatā', 'bhikkhusaṅghena', 'saddhiṃ', 'aḍḍhateḷasehi', 'bhikkhusatehi', 'tena', 'kho'] ...

🔹 Sentence Tokens (Doc1): ['2', 'Sāmaññaphalasuttaṃ\n\nRājāmaccakathā\n\n150', 'Evaṃ me sutaṃ'] ...

🔹 Sorted Frequency (Doc1): [('vā', 174), ('hoti', 127), ('kho', 120), ('mahārāja', 108), ('so', 81), ('ca', 73), ('bhante', 73), ('evaṃ', 69), ('cittaṃ', 68), ('pajānāti', 58)]

🔹 Word–Document Matrix:
                Doc1  Doc2
abalā             1     0
abbhantarānaṃ     4     0
abbhayena         2     0
abbhokāsaṃ        1     0
abbhokāso         1     0
...             ...   ...
ṭhitatto          1     0
ṭhite            17     0
ṭhito             3     0
’                 0    94
’’                0    48

[2231 rows x 2 columns]
