In [1]:
import re
import pandas as pd
import unicodedata
from collections import Counter




# ------------------------------
# 🔠 Unicode Normalization Fix
# ------------------------------
def normalize_unicode(text):
    """Return *text* in NFKC form with nasal + combining-mark pairs folded.

    After NFKC normalization, an ``m`` directly followed by U+0307, U+0323
    or U+0310 is replaced by ``ṃ``, and an ``n`` followed by one of the same
    marks is replaced by ``ṇ``.

    NOTE(review): NFKC presumably already composes m/n + U+0307 and
    m/n + U+0323 into single code points, in which case only the U+0310
    sequences can still reach the substitutions below -- confirm that this
    is the intended mapping.
    """
    composed = unicodedata.normalize('NFKC', text)
    # Same two substitutions, applied in the same order: "m" first, then "n".
    for base_letter, folded in (("m", "ṃ"), ("n", "ṇ")):
        composed = re.sub(base_letter + r"[\u0307\u0323\u0310]", folded, composed)
    return composed

# ------------------------------
# 1️⃣ WORD TOKENIZER (Roman Pali-safe)
# ------------------------------
def tokenize_words(text):
    """Split Roman-script Pali text into lowercase word tokens.

    The text is first passed through ``normalize_unicode``. A token is a
    maximal run of ASCII letters, the listed diacritic letters and
    apostrophes (both ``’`` and ``'``); every other character acts as a
    separator.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercased tokens in order of appearance. Runs that
        consist only of apostrophes are dropped.
    """
    text = normalize_unicode(text)
    # Uppercase "Ṃ" is listed next to "ṃ": without it a capitalised word
    # such as "SAṂGHA" was cut into "sa" and "gha".
    pattern = r"[A-Za-zĀāĪīŪūṀṁṂṃṄṅÑñṬṭḌḍṆṇḶḷḺḻŚśṢṣḤḥ’']+"
    tokens = re.findall(pattern, text)
    # The pattern never matches whitespace, so a plain strip() filtered
    # nothing. Stripping the apostrophes instead discards matches that are
    # pure punctuation (e.g. a stray quote mark) and contain no letter.
    return [t.lower() for t in tokens if t.strip("’'")]

# ------------------------------
# 2️⃣ SENTENCE TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    """Split *text* into sentences.

    The text is normalized with ``normalize_unicode`` and then cut at every
    run of ``.``, ``!``, ``?`` or ``–`` characters. Each piece is stripped
    of surrounding whitespace and empty pieces are discarded.

    Returns:
        A list of non-empty sentence strings in their original order.
    """
    normalized = normalize_unicode(text)
    stripped_pieces = (piece.strip() for piece in re.split(r'[.!?–]+', normalized))
    return [piece for piece in stripped_pieces if piece]

# ------------------------------
# 3️⃣ DOCUMENT TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    """Return the document collection unchanged.

    Each element of *texts* is already treated as one document, so this is
    a pass-through: the very same object is handed back, not a copy.
    """
    documents = texts
    return documents

# ------------------------------
# 4️⃣ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    """Count how many times each token occurs.

    Returns:
        A ``Counter`` mapping every distinct token to its occurrence count.
    """
    counts = Counter()
    counts.update(tokens)
    return counts

# ------------------------------
# 5️⃣ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    """Return the ``(word, count)`` pairs of *freq_dict*, highest count first.

    The sort is stable, so words with equal counts keep the order in which
    they appear in *freq_dict*.
    """
    pairs = list(freq_dict.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

# ------------------------------
# 6️⃣ WORD–DOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    """Build a word-by-document count matrix.

    Args:
        docs_tokens: One token list per document.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct word (sorted
        ascending) and one column per document, named ``Doc1``, ``Doc2``,
        ... in input order. Each cell holds how often the row's word occurs
        in the column's document.
    """
    # Count every document once up front. Calling list.count() for each
    # (word, document) pair rescans the whole token list every time.
    doc_counts = [Counter(tokens) for tokens in docs_tokens]
    all_words = sorted(set().union(*doc_counts))
    # A Counter returns 0 for a word that is absent from that document.
    matrix_data = [[counts[word] for counts in doc_counts] for word in all_words]
    columns = [f'Doc{number}' for number in range(1, len(doc_counts) + 1)]
    return pd.DataFrame(matrix_data, index=all_words, columns=columns)

# ------------------------------
# 7️⃣ LOAD DOCUMENTS (Ask User)
# ------------------------------
def load_documents_interactively():
    """Prompt for a number of text files and read each one as UTF-8.

    The user is asked repeatedly until a non-negative whole number is
    entered, and is then prompted for that many file paths. A path that
    cannot be read produces a warning and is skipped, so the returned list
    may be shorter than the number requested.

    Returns:
        A list with the full text of every file that was read successfully,
        in the order the paths were entered.
    """
    while True:
        num_files_input = input("📂 How many text documents do you want to load? ➤ ").strip()
        # isdecimal() only accepts characters that int() can parse.
        # isdigit() also accepts e.g. superscript digits, for which int()
        # raises ValueError.
        if num_files_input.isdecimal():
            num_files = int(num_files_input)
            break
        print("⚠️ Please enter a valid number (e.g., 1, 2, 3).")

    docs = []
    for i in range(num_files):
        file_path = input(f"👉 Enter path for document {i+1} (e.g., doc{i+1}.txt): ").strip()
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                docs.append(f.read())
        except FileNotFoundError:
            print(f"⚠️ File not found: {file_path}")
        except (OSError, UnicodeDecodeError) as err:
            # A directory path, missing permissions or a non-UTF-8 file
            # should skip this one document, not abort the whole session.
            print(f"⚠️ Could not read {file_path}: {err}")
    return docs


# ------------------------------
# 🧾 EXPORT RESULTS
# ------------------------------
def export_results(doc_number, tokens, sentences, sorted_freq):
    """Write one document's results to three CSV files in the working directory.

    Files written (UTF-8, no index column):
        ``doc<N>_freq.csv``       columns ``Word``, ``Frequency``
        ``doc<N>_tokens.csv``     column ``Token``
        ``doc<N>_sentences.csv``  column ``Sentence``

    Args:
        doc_number: Number used in the file names and the status message.
        tokens: Word tokens of the document.
        sentences: Sentences of the document.
        sorted_freq: ``(word, frequency)`` pairs.
    """
    # All three tables are built before anything is written to disk.
    tables_by_file = {
        f'doc{doc_number}_freq.csv': pd.DataFrame(sorted_freq, columns=['Word', 'Frequency']),
        f'doc{doc_number}_tokens.csv': pd.DataFrame(tokens, columns=['Token']),
        f'doc{doc_number}_sentences.csv': pd.DataFrame(sentences, columns=['Sentence']),
    }
    for csv_name, table in tables_by_file.items():
        table.to_csv(csv_name, index=False, encoding='utf-8')
    print(f"📂 Exported results for Document {doc_number} ✅")

# Earlier version of the main routine, disabled by wrapping it in a bare
# triple-quoted string so that none of it executes. Because the string is the
# last expression of this notebook cell, its value is echoed as the cell
# output. The live version is in the next cell and additionally reports the
# sentence count.
'''
# ------------------------------
# ⚙️ MAIN EXECUTION (Updated)
# ------------------------------
if __name__ == "__main__":
    print("🔸 Pali Roman Script Text Processor 🔸\n")

    docs = load_documents_interactively()

    if not docs:
        print("❌ No valid documents loaded. Exiting...")
    else:
        docs_tokens = [tokenize_words(doc) for doc in docs]
        docs_sentences = [tokenize_sentences(doc) for doc in docs]
        freqs = [word_frequency(tokens) for tokens in docs_tokens]
        sorted_freqs = [sort_by_frequency(f) for f in freqs]
        matrix = build_word_document_matrix(docs_tokens)

        # Process each document
        for i, (tokens, sentences) in enumerate(zip(docs_tokens, docs_sentences)):
            total_tokens = len(tokens)
            unique_tokens = len(set(tokens))
            top10 = sorted_freqs[i][:10]

            print(f"\n📘 Document {i+1} Summary:")
            print(f"   🔹 Total Tokens: {total_tokens}")
            print(f"   🔹 Unique Tokens: {unique_tokens}")

            print(f"   🔹 Top 10 Frequent Words: {top10}")
            print(f"   🔹 Example Sentences: {sentences[:3]}\n")

            # Export outputs
            export_results(i + 1, tokens, sentences, sorted_freqs[i])

        # Show preview of matrix
        print("\n🔹 Word–Document Matrix (Top 10):\n", matrix.head(10))
'''

'\n# ------------------------------\n# ⚙️ MAIN EXECUTION (Updated)\n# ------------------------------\nif __name__ == "__main__":\n    print("🔸 Pali Roman Script Text Processor 🔸\n")\n\n    docs = load_documents_interactively()\n\n    if not docs:\n        print("❌ No valid documents loaded. Exiting...")\n    else:\n        docs_tokens = [tokenize_words(doc) for doc in docs]\n        docs_sentences = [tokenize_sentences(doc) for doc in docs]\n        freqs = [word_frequency(tokens) for tokens in docs_tokens]\n        sorted_freqs = [sort_by_frequency(f) for f in freqs]\n        matrix = build_word_document_matrix(docs_tokens)\n\n        # Process each document\n        for i, (tokens, sentences) in enumerate(zip(docs_tokens, docs_sentences)):\n            total_tokens = len(tokens)\n            unique_tokens = len(set(tokens))\n            top10 = sorted_freqs[i][:10]\n\n            print(f"\n📘 Document {i+1} Summary:")\n            print(f"   🔹 Total Tokens: {total_tokens}")\n         

In [None]:
# ------------------------------
# ⚙️ MAIN EXECUTION (Updated)
# ------------------------------
if __name__ == "__main__":
    # Script entry point: load documents interactively, analyse each one,
    # print a per-document summary, export CSV files and preview the matrix.
    print("🔸 Pali Roman Script Text Processor 🔸\n")

    docs = load_documents_interactively()

    if docs:
        # Every list below is index-aligned with `docs`.
        docs_tokens = [tokenize_words(doc) for doc in docs]
        docs_sentences = [tokenize_sentences(doc) for doc in docs]
        freqs = [word_frequency(tokens) for tokens in docs_tokens]
        sorted_freqs = [sort_by_frequency(f) for f in freqs]
        matrix = build_word_document_matrix(docs_tokens)

        # Summarise and export one document at a time.
        for i, (tokens, sentences, ranked) in enumerate(
            zip(docs_tokens, docs_sentences, sorted_freqs)
        ):
            top10 = ranked[:10]
            total_sentences = len(sentences)
            unique_tokens = len(set(tokens))
            total_tokens = len(tokens)

            print(f"\n📘 Document {i+1} Summary:")
            print(f"   🔹 Total Tokens: {total_tokens}")
            print(f"   🔹 Unique Tokens: {unique_tokens}")
            print(f"   🔹 Total Sentences: {total_sentences}")
            print(f"   🔹 Top 10 Frequent Words: {top10}")
            print(f"   🔹 Example Sentences: {sentences[:3]}\n")

            # Document numbers in file names are 1-based.
            export_results(i + 1, tokens, sentences, ranked)

        # Preview: first ten rows of the word-document matrix.
        print("\n🔹 Word–Document Matrix (Top 10):\n", matrix.head(10))
    else:
        print("❌ No valid documents loaded. Exiting...")

🔸 Pali Roman Script Text Processor 🔸



📂 How many text documents do you want to load? ➤  


⚠️ Please enter a valid number (e.g., 1, 2, 3).


📂 How many text documents do you want to load? ➤  


⚠️ Please enter a valid number (e.g., 1, 2, 3).


📂 How many text documents do you want to load? ➤  


⚠️ Please enter a valid number (e.g., 1, 2, 3).


📂 How many text documents do you want to load? ➤  


⚠️ Please enter a valid number (e.g., 1, 2, 3).


📂 How many text documents do you want to load? ➤  no


⚠️ Please enter a valid number (e.g., 1, 2, 3).
