In [1]:
from docx import Document
import os

def split_docx_to_txt(input_file, output_dir, words_per_file=3000):
    """Split the text of a DOCX file into numbered text files of a fixed word count.

    The text of every paragraph in ``doc.paragraphs`` is broken into
    whitespace-separated words and all words are concatenated in document
    order, so paragraph boundaries are NOT kept in the output. The word list
    is then cut into consecutive slices of ``words_per_file`` words; only the
    last slice may be shorter. Each slice is written, words joined by single
    spaces, to ``part_1.txt``, ``part_2.txt``, ... inside ``output_dir``
    (UTF-8), and one "Saved: ..." line is printed per file.

    Only text reachable through ``doc.paragraphs`` is exported. A document
    without any words produces no files (the directory is still created).
    Existing ``part_N.txt`` files with the same number are overwritten.

    Args:
        input_file: Path (or file-like object) handed to ``Document``.
        output_dir: Directory for the output files; created if missing.
        words_per_file: Number of words per output file; must be >= 1.

    Raises:
        ValueError: If ``words_per_file`` is smaller than 1.
    """
    # Validate before loading the document or touching the filesystem.
    # Without this check, 0 fails inside range() with an unrelated message
    # (after the directory was already created) and a negative value silently
    # writes nothing at all.
    if words_per_file < 1:
        raise ValueError(
            f"words_per_file must be a positive integer, got {words_per_file!r}"
        )

    # Load the DOCX file
    doc = Document(input_file)

    # Collect every word in document order. str.split() without arguments
    # ignores leading/trailing whitespace and returns [] for blank paragraphs,
    # so this equals stripping, joining with spaces and splitting again.
    words = []
    for para in doc.paragraphs:
        words.extend(para.text.split())

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # One output file per slice of words_per_file words, numbered from 1.
    slice_starts = range(0, len(words), words_per_file)
    for file_count, start in enumerate(slice_starts, start=1):
        chunk_words = words[start:start + words_per_file]

        output_path = os.path.join(output_dir, f"part_{file_count}.txt")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(" ".join(chunk_words))

        print(f"Saved: {output_path} ({len(chunk_words)} words)")


# Example usage
if __name__ == "__main__":
    # Source document (replace with your own Pashto DOCX) and the folder
    # that will receive the generated txt parts.
    input_docx, output_folder = "Books/G10-Ps-Tapfseer_extracted.docx", "output_txt"

    split_docx_to_txt(
        input_file=input_docx,
        output_dir=output_folder,
        words_per_file=3000,
    )


Saved: output_txt\part_1.txt (3000 words)
Saved: output_txt\part_2.txt (3000 words)
Saved: output_txt\part_3.txt (3000 words)
Saved: output_txt\part_4.txt (3000 words)
Saved: output_txt\part_5.txt (3000 words)
Saved: output_txt\part_6.txt (3000 words)
Saved: output_txt\part_7.txt (3000 words)
Saved: output_txt\part_8.txt (3000 words)
Saved: output_txt\part_9.txt (3000 words)
Saved: output_txt\part_10.txt (3000 words)
Saved: output_txt\part_11.txt (3000 words)
Saved: output_txt\part_12.txt (3000 words)
Saved: output_txt\part_13.txt (3000 words)
Saved: output_txt\part_14.txt (3000 words)
Saved: output_txt\part_15.txt (3000 words)
Saved: output_txt\part_16.txt (3000 words)
Saved: output_txt\part_17.txt (3000 words)
Saved: output_txt\part_18.txt (1434 words)


In [1]:
from docx import Document
import os

def _paragraph_units(paragraphs, words_per_file):
    """Yield ``(text, word_count)`` units that never exceed ``words_per_file`` words."""
    for para in paragraphs:
        para_words = para.split()
        if len(para_words) <= words_per_file:
            # Fits in one file: pass the paragraph text through untouched.
            yield para, len(para_words)
            continue
        # A single paragraph longer than the limit cannot fit in any file, so
        # cut it on word boundaries into pieces of at most words_per_file words.
        for start in range(0, len(para_words), words_per_file):
            piece = para_words[start:start + words_per_file]
            yield " ".join(piece), len(piece)


def _write_part(output_dir, file_count, chunk_paragraphs, word_count):
    """Write one ``part_N.txt`` file and print the "Saved: ..." line for it."""
    output_path = os.path.join(output_dir, f"part_{file_count}.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(chunk_paragraphs))  # preserve paragraph breaks
    print(f"Saved: {output_path} ({word_count} words)")


def split_docx_to_txt(input_file, output_dir, words_per_file=3000):
    """Split a DOCX file into numbered text files without breaking paragraphs.

    Blank paragraphs are dropped and the remaining ones are stripped. Whole
    paragraphs are packed, in document order, into files of at most
    ``words_per_file`` words each; inside a file, paragraphs are separated by
    one empty line. Only a paragraph that is by itself longer than
    ``words_per_file`` is cut (on word boundaries), so that no file exceeds
    the limit. Files are written as ``part_1.txt``, ``part_2.txt``, ... inside
    ``output_dir`` (UTF-8) and one "Saved: ..." line is printed per file.

    Only text reachable through ``doc.paragraphs`` is exported. A document
    without any text produces no files (the directory is still created).
    Existing ``part_N.txt`` files with the same number are overwritten.

    Args:
        input_file: Path (or file-like object) handed to ``Document``.
        output_dir: Directory for the output files; created if missing.
        words_per_file: Maximum number of words per output file; must be >= 1.

    Raises:
        ValueError: If ``words_per_file`` is smaller than 1.
    """
    # A non-positive limit makes no sense (0 used to degrade into one file per
    # paragraph); reject it before loading anything or creating directories.
    if words_per_file < 1:
        raise ValueError(
            f"words_per_file must be a positive integer, got {words_per_file!r}"
        )

    # Load the DOCX file
    doc = Document(input_file)

    # Non-blank paragraphs, stripped. Blank ones are dropped here; the empty
    # line between paragraphs is re-inserted when a file is written.
    stripped = (para.text.strip() for para in doc.paragraphs)
    paragraphs = [text for text in stripped if text]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    file_count = 1
    word_count = 0
    chunk_paragraphs = []

    for text, n_words in _paragraph_units(paragraphs, words_per_file):
        # If adding this unit would exceed the limit, save the current chunk
        # first and start a new one.
        if chunk_paragraphs and word_count + n_words > words_per_file:
            _write_part(output_dir, file_count, chunk_paragraphs, word_count)
            file_count += 1
            chunk_paragraphs = []
            word_count = 0

        chunk_paragraphs.append(text)
        word_count += n_words

    # Save any remaining paragraphs
    if chunk_paragraphs:
        _write_part(output_dir, file_count, chunk_paragraphs, word_count)


# Example usage
if __name__ == "__main__":
    # Source document (replace with your own Pashto DOCX) and the folder
    # that will receive the generated txt parts.
    input_docx, output_folder = (
        "Books/G10-Ps-Tapfseer_extracted.docx",
        "txtfile/output_txt/Tapfseer",
    )

    split_docx_to_txt(
        input_file=input_docx,
        output_dir=output_folder,
        words_per_file=3000,
    )


Saved: txtfile/output_txt/Tapfseer\part_1.txt (2826 words)
Saved: txtfile/output_txt/Tapfseer\part_2.txt (2713 words)
Saved: txtfile/output_txt/Tapfseer\part_3.txt (2719 words)
Saved: txtfile/output_txt/Tapfseer\part_4.txt (2934 words)
Saved: txtfile/output_txt/Tapfseer\part_5.txt (2782 words)
Saved: txtfile/output_txt/Tapfseer\part_6.txt (2684 words)
Saved: txtfile/output_txt/Tapfseer\part_7.txt (2825 words)
Saved: txtfile/output_txt/Tapfseer\part_8.txt (2676 words)
Saved: txtfile/output_txt/Tapfseer\part_9.txt (2936 words)
Saved: txtfile/output_txt/Tapfseer\part_10.txt (2790 words)
Saved: txtfile/output_txt/Tapfseer\part_11.txt (2881 words)
Saved: txtfile/output_txt/Tapfseer\part_12.txt (1676 words)
