In [1]:
import re
import pandas as pd
import unicodedata
from collections import Counter




# ------------------------------
# üî† Unicode Normalization Fix
# ------------------------------
def normalize_unicode(text):
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r"m[\u0307\u0323\u0310]", "·πÉ", text)
    text = re.sub(r"n[\u0307\u0323\u0310]", "·πá", text)
    return text

# ------------------------------
# 1Ô∏è‚É£ WORD TOKENIZER (Roman Pali-safe)
# ------------------------------
def tokenize_words(text):
    text = normalize_unicode(text)
    pattern = r"[A-Za-zƒÄƒÅƒ™ƒ´≈™≈´·πÄ·πÅ·πÑ·πÖ√ë√±·π¨·π≠·∏å·∏ç·πÜ·πá·∏∂·∏∑·∏∫·∏ª≈ö≈õ·π¢·π£·∏§·∏•·πÉ‚Äô']+"
    tokens = re.findall(pattern, text)
    tokens = [t.lower() for t in tokens if t.strip()]
    return tokens

# ------------------------------
# 2Ô∏è‚É£ SENTENCE TOKENIZER
# ------------------------------
def tokenize_sentences(text):
    text = normalize_unicode(text)
    sentences = re.split(r'[.!?‚Äì]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

# ------------------------------
# 3Ô∏è‚É£ DOCUMENT TOKENIZER
# ------------------------------
def tokenize_documents(texts):
    return texts

# ------------------------------
# 4Ô∏è‚É£ WORD FREQUENCY COUNTER
# ------------------------------
def word_frequency(tokens):
    return Counter(tokens)

# ------------------------------
# 5Ô∏è‚É£ SORT BY FREQUENCY
# ------------------------------
def sort_by_frequency(freq_dict):
    return sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)

# ------------------------------
# 6Ô∏è‚É£ WORD‚ÄìDOCUMENT MATRIX
# ------------------------------
def build_word_document_matrix(docs_tokens):
    all_words = sorted(set(word for doc in docs_tokens for word in doc))
    matrix_data = []
    for word in all_words:
        row = [tokens.count(word) for tokens in docs_tokens]
        matrix_data.append(row)
    df = pd.DataFrame(matrix_data, index=all_words,
                      columns=[f'Doc{i+1}' for i in range(len(docs_tokens))])
    return df

# ------------------------------
# 7Ô∏è‚É£ LOAD DOCUMENTS (Ask User)
# ------------------------------
def load_documents_interactively():
    docs = []
    while True:
        num_files_input = input("üìÇ How many text documents do you want to load? ‚û§ ").strip()
        if not num_files_input.isdigit():
            print("‚ö†Ô∏è Please enter a valid number (e.g., 1, 2, 3).")
            continue
        num_files = int(num_files_input)
        break

    for i in range(num_files):
        file_path = input(f"üëâ Enter path for document {i+1} (e.g., doc{i+1}.txt): ").strip()
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                docs.append(content)
        except FileNotFoundError:
            print(f"‚ö†Ô∏è File not found: {file_path}")
    return docs


# ------------------------------
# üßæ EXPORT RESULTS
# ------------------------------
def export_results(doc_number, tokens, sentences, sorted_freq):
    df_freq = pd.DataFrame(sorted_freq, columns=['Word', 'Frequency'])
    df_tokens = pd.DataFrame(tokens, columns=['Token'])
    df_sentences = pd.DataFrame(sentences, columns=['Sentence'])

    # Export each as CSV
    df_freq.to_csv(f'doc{doc_number}_freq.csv', index=False, encoding='utf-8')
    df_tokens.to_csv(f'doc{doc_number}_tokens.csv', index=False, encoding='utf-8')
    df_sentences.to_csv(f'doc{doc_number}_sentences.csv', index=False, encoding='utf-8')
    print(f"üìÇ Exported results for Document {doc_number} ‚úÖ")

'''
# ------------------------------
# ‚öôÔ∏è MAIN EXECUTION (Updated)
# ------------------------------
if __name__ == "__main__":
    print("üî∏ Pali Roman Script Text Processor üî∏\n")

    docs = load_documents_interactively()

    if not docs:
        print("‚ùå No valid documents loaded. Exiting...")
    else:
        docs_tokens = [tokenize_words(doc) for doc in docs]
        docs_sentences = [tokenize_sentences(doc) for doc in docs]
        freqs = [word_frequency(tokens) for tokens in docs_tokens]
        sorted_freqs = [sort_by_frequency(f) for f in freqs]
        matrix = build_word_document_matrix(docs_tokens)

        # Process each document
        for i, (tokens, sentences) in enumerate(zip(docs_tokens, docs_sentences)):
            total_tokens = len(tokens)
            unique_tokens = len(set(tokens))
            top10 = sorted_freqs[i][:10]

            print(f"\nüìò Document {i+1} Summary:")
            print(f"   üîπ Total Tokens: {total_tokens}")
            print(f"   üîπ Unique Tokens: {unique_tokens}")

            print(f"   üîπ Top 10 Frequent Words: {top10}")
            print(f"   üîπ Example Sentences: {sentences[:3]}\n")

            # Export outputs
            export_results(i + 1, tokens, sentences, sorted_freqs[i])

        # Show preview of matrix
        print("\nüîπ Word‚ÄìDocument Matrix (Top 10):\n", matrix.head(10))
'''

'\n# ------------------------------\n# ‚öôÔ∏è MAIN EXECUTION (Updated)\n# ------------------------------\nif __name__ == "__main__":\n    print("üî∏ Pali Roman Script Text Processor üî∏\n")\n\n    docs = load_documents_interactively()\n\n    if not docs:\n        print("‚ùå No valid documents loaded. Exiting...")\n    else:\n        docs_tokens = [tokenize_words(doc) for doc in docs]\n        docs_sentences = [tokenize_sentences(doc) for doc in docs]\n        freqs = [word_frequency(tokens) for tokens in docs_tokens]\n        sorted_freqs = [sort_by_frequency(f) for f in freqs]\n        matrix = build_word_document_matrix(docs_tokens)\n\n        # Process each document\n        for i, (tokens, sentences) in enumerate(zip(docs_tokens, docs_sentences)):\n            total_tokens = len(tokens)\n            unique_tokens = len(set(tokens))\n            top10 = sorted_freqs[i][:10]\n\n            print(f"\nüìò Document {i+1} Summary:")\n            print(f"   üîπ Total Tokens: {total_to

In [None]:
# ------------------------------
# ‚öôÔ∏è MAIN EXECUTION (Updated)
# ------------------------------
if __name__ == "__main__":
    print("üî∏ Pali Roman Script Text Processor üî∏\n")

    docs = load_documents_interactively()

    if not docs:
        print("‚ùå No valid documents loaded. Exiting...")
    else:
        docs_tokens = [tokenize_words(doc) for doc in docs]
        docs_sentences = [tokenize_sentences(doc) for doc in docs]
        freqs = [word_frequency(tokens) for tokens in docs_tokens]
        sorted_freqs = [sort_by_frequency(f) for f in freqs]
        matrix = build_word_document_matrix(docs_tokens)

        # Process each document
        for i, (tokens, sentences) in enumerate(zip(docs_tokens, docs_sentences)):
                total_tokens = len(tokens)
                unique_tokens = len(set(tokens))
                total_sentences = len(sentences)   # ‚úÖ Count of sentences
                top10 = sorted_freqs[i][:10]

                print(f"\nüìò Document {i+1} Summary:")
                print(f"   üîπ Total Tokens: {total_tokens}")
                print(f"   üîπ Unique Tokens: {unique_tokens}")      # Unique word tokens
                print(f"   üîπ Total Sentences: {total_sentences}") # Sentence tokens count
                print(f"   üîπ Top 10 Frequent Words: {top10}")
                print(f"   üîπ Example Sentences: {sentences[:3]}\n")

                # Export outputs
                export_results(i + 1, tokens, sentences, sorted_freqs[i])

        # Show preview of matrix
        print("\nüîπ Word‚ÄìDocument Matrix (Top 10):\n", matrix.head(10))

üî∏ Pali Roman Script Text Processor üî∏



üìÇ How many text documents do you want to load? ‚û§  


‚ö†Ô∏è Please enter a valid number (e.g., 1, 2, 3).


üìÇ How many text documents do you want to load? ‚û§  


‚ö†Ô∏è Please enter a valid number (e.g., 1, 2, 3).


üìÇ How many text documents do you want to load? ‚û§  


‚ö†Ô∏è Please enter a valid number (e.g., 1, 2, 3).


üìÇ How many text documents do you want to load? ‚û§  


‚ö†Ô∏è Please enter a valid number (e.g., 1, 2, 3).


üìÇ How many text documents do you want to load? ‚û§  no


‚ö†Ô∏è Please enter a valid number (e.g., 1, 2, 3).
