In [1]:
# NOTE(review): hardcoded absolute path to one user's checkout; presumably this
# makes the project root the working directory so the `core.*` imports resolve
# -- confirm, and consider deriving it from a configurable project-root setting.
%cd /home/nhan-softzone/crawl_law_airflow

/home/nhan-softzone/crawl_law_airflow


In [23]:
import sys
import os
from pathlib import Path

# Import các module cần thiết
from core.models.legal_document import LegalDocument  # LegalDocument model
from core.processers.legal_processor import LawDocumentProcessor  # Processor để kiểm tra
from core.database import DatabaseManager

class LawDocumentProcessorTest(LawDocumentProcessor):
    """Notebook harness that runs the parsing stages of the processor on one document.

    ``process`` chains three helpers that are not defined in this class
    (``_parse_html_structure``, ``_normalize_structure`` and
    ``_extract_articles``) and returns the extracted articles.
    """

    def __init__(self, doc: LegalDocument):
        """Bind the harness to ``doc`` and open a ``DatabaseManager``.

        NOTE(review): the parent ``__init__`` is not called; presumably this is
        deliberate so the harness can be built from an already-loaded document
        -- confirm against ``LawDocumentProcessor``.
        """
        self.doc = doc
        self.db = DatabaseManager()  # uses DatabaseManager rather than SessionLocal

    def process(self, verbose: bool = False):
        """Run the parse -> normalize -> extract pipeline on ``self.doc``.

        Args:
            verbose: When True, additionally print the parsed structure, the
                normalized structure and every extracted article. Defaults to
                False, which matches the previous behavior where these debug
                prints were disabled.

        Returns:
            Whatever ``_extract_articles`` returns for the normalized structure.

        The ``DatabaseManager`` opened in ``__init__`` is closed when this
        method exits, including when one of the stages raises, so callers
        should not rely on ``self.db`` afterwards.
        """
        try:
            print(f"=== Đang xử lý document: {self.doc.id} ===")

            # Step 1: parse the HTML structure
            parsed_structure = self._parse_html_structure(self.doc.content_html)
            if verbose:
                print("\n--- Cấu trúc HTML đã parse:")
                self._print_structure(parsed_structure)

            # Step 2: normalize the structure
            normalized = self._normalize_structure(parsed_structure)
            if verbose:
                print("\n--- Cấu trúc đã chuẩn hóa:")
                self._print_structure(normalized)

            # Step 3: extract the articles
            articles = self._extract_articles(normalized)
            if verbose:
                print("\n--- Các Điều (Articles) đã trích xuất:")
                for article in articles:
                    print(f"Article ID: {article['article_id']}")
                    print(f"Article Number: {article['number']}")
                    print(f"Full Text: {article['full_text']}")
                    print(f"Metadata: {article['metadata']}")
                    print("-" * 30)

            return articles
        finally:
            # Always release the connection, even if a parsing stage failed.
            self.db.close()

    def _print_structure(self, structure, level=0):
        """Print a hierarchical structure as a nested Markdown bullet list.

        Args:
            structure: Iterable of nodes; each node is read via ``node['type']``,
                ``node['content']``, and the optional ``'number'`` and
                ``'children'`` keys.
            level: Current nesting depth; each level indents by two spaces.
        """
        indent = "  " * level  # two spaces per level of nesting
        for node in structure:
            line = f"{indent}- **{node['type']}**"  # node type rendered in bold
            if node.get('number'):
                line += f" {node['number']}"
            line += f": {node['content']}"
            print(line)

            children = node.get("children")
            if children:
                self._print_structure(children, level + 1)

In [24]:
def test_document_parser(document_number: str):
    """Load one legal document by its number and run the test processor on it.

    Fetches the first ``LegalDocument`` whose ``document_number`` equals the
    argument and whose ``content_html`` is not NULL, then runs
    ``LawDocumentProcessorTest.process`` on it while the session is still open.

    Args:
        document_number: Value matched against ``LegalDocument.document_number``.

    Returns:
        The value returned by ``process()``, or ``None`` (after printing a
        message) when no matching document exists.
    """
    with DatabaseManager() as db:
        document = (
            db.session.query(LegalDocument)
            .filter(
                LegalDocument.document_number == document_number,
                LegalDocument.content_html.isnot(None),
            )
            .first()
        )

        if document is None:
            print(f"No document found with document_number: {document_number}")
            return None

        # Hand the extracted articles back to the caller instead of discarding
        # them, so the result can be inspected without re-running the query.
        processor = LawDocumentProcessorTest(document)
        return processor.process()

# Example usage:
# test_document_parser("YOUR_DOCUMENT_NUMBER_HERE")

In [32]:
# Run the parser on one concrete document and keep the result in `output`.
document_number = "2898/QĐ-UBND"

with DatabaseManager() as db:
    document = (
        db.session.query(LegalDocument)
        .filter(
            LegalDocument.document_number == document_number,
            LegalDocument.content_html.isnot(None),
        )
        .first()
    )

    if document is None:
        # .first() yields None when nothing matches; report it the same way
        # test_document_parser does instead of failing later on `self.doc.id`.
        print(f"No document found with document_number: {document_number}")
        output = None
    else:
        processor = LawDocumentProcessorTest(document)
        output = processor.process()

=== Đang xử lý document: 27960 ===
