In [9]:
from docx import Document
import json
import re
import os


def process_vietnamese_regulation_docx(file_path, output_folder):
    """Process Vietnamese regulatory DOCX files and return JSON file path only"""
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Base filename without extension
    base_name = os.path.basename(file_path).split('.')[0]

    # Load the document
    doc = Document(file_path)

    # Extract document metadata and structure
    document_data = {
        "title": "",
        "document_number": "",
        "date": "",
        "issuer": "",
        "sections": [],
        "articles": [],
        "tables": []
    }

    # Extract text with formatting hints
    current_article = None
    current_section = None

    # Process title and metadata from early paragraphs
    for i, para in enumerate(doc.paragraphs[:10]):  # Check first few paragraphs for metadata
        text = para.text.strip()
        if not text:
            continue

        # Try to identify document number
        if "Số:" in text and document_data["document_number"] == "":
            document_data["document_number"] = text

        # Try to identify title - usually in bold and/or all caps
        elif text.isupper() and "QUYẾT ĐỊNH" in text and document_data["title"] == "":
            document_data["title"] = text

        # Try to identify issuer - usually all caps with "TRƯỞNG"
        elif "TRƯỞNG" in text and text.isupper() and document_data["issuer"] == "":
            document_data["issuer"] = text

    # Process all paragraphs for content
    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        if not text:
            continue

        # Try to identify articles (Điều)
        article_match = re.match(r'^Điều\s+(\d+)\.\s*(.*)', text)
        if article_match:
            article_num = article_match.group(1)
            article_title = article_match.group(2)
            current_article = {
                "number": article_num,
                "title": article_title,
                "content": []
            }
            document_data["articles"].append(current_article)

        # Try to identify sections with numbering patterns common in Vietnamese regulations
        section_match = re.match(r'^(\d+\.(?:\d+\.?)*)\s*(.*)', text)
        if section_match and not article_match:  # Avoid matching articles again
            section_num = section_match.group(1)
            section_text = section_match.group(2)
            current_section = {
                "number": section_num,
                "text": section_text,
                "level": len(section_num.split('.')) - 1  # Determine hierarchy level
            }
            document_data["sections"].append(current_section)

        # Add content to current article if we're in one
        if current_article and not article_match:  # Don't add the article header itself
            current_article["content"].append(text)

    # Process tables
    for i, table in enumerate(doc.tables):
        table_data = []
        for row in table.rows:
            row_data = [cell.text.strip() for cell in row.cells]
            table_data.append(row_data)

        # Save table with context
        document_data["tables"].append({
            "table_id": i + 1,
            "data": table_data
        })

    # Save as structured JSON
    json_path = os.path.join(output_folder, f"{base_name}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(document_data, f, ensure_ascii=False, indent=2)

    return json_path


# Usage
file_path = "D:\\LLM_From_Scratch\\data\\1.-QD-967-vv-ban-hanh-Quy-che-CTSV-truong-DHQT-Signed-4-da-gop.docx"
json_file = process_vietnamese_regulation_docx(file_path, "D:\\LLM_From_Scratch\\processed_data")

print(f"JSON file saved to: {json_file}")

JSON file saved to: D:\LLM_From_Scratch\processed_data\1.json


In [10]:
file_path="D:\\LLM_From_Scratch\\data\\3.-Phu-luc-1-30122022-Signed-2.docx"
json_file = process_vietnamese_regulation_docx(file_path, "D:\\LLM_From_Scratch\\processed_data")
print(f"JSON file saved to: {json_file}")

JSON file saved to: D:\LLM_From_Scratch\processed_data\3.json


In [11]:
file_path="D:\\LLM_From_Scratch\\data\\4.-Phu-luc-II-Tieu-chi-va-Khung-DRL-Signed-3.docx"
json_file = process_vietnamese_regulation_docx(file_path, "D:\\LLM_From_Scratch\\processed_data")
print(f"JSON file saved to: {json_file}")

JSON file saved to: D:\LLM_From_Scratch\processed_data\4.json
