In [None]:
# Reference note: BIO-style tag names (B-/I- prefix pairs plus O), each with a
# Vietnamese gloss. Presumably the NER label set for this project -- TODO confirm.
# The bare string is the cell's only expression, so the notebook simply displays it.
# English glosses, in the same order as the entries below:
#   POSITION       - position / job title
#   REFERENCE      - legal basis / reference
#   ISSUING_AGENCY - issuing agency
#   DATE           - date of issue
#   DOC_TYPE       - document type
#   SIGNER         - signer
#   RECIPIENT      - recipient
#   DOC_NUMBER     - document number
#   SUMMARY        - summary / abstract
#   O              - outside any entity
'''
B-POSITION / I-POSITION          # Chức vụ
B-REFERENCE / I-REFERENCE        # Căn cứ
B-ISSUING_AGENCY / I-ISSUING_AGENCY  # Cơ quan ban hành
B-DATE / I-DATE                  # Ngày ban hành 
B-DOC_TYPE / I-DOC_TYPE          # Loại văn bản
B-SIGNER / I-SIGNER              # Người ký
B-RECIPIENT / I-RECIPIENT        # Nơi nhận
B-DOC_NUMBER / I-DOC_NUMBER      # Số hiệu
B-SUMMARY / I-SUMMARY            # Trích yếu
O                               # Ngoài các thực thể
'''

In [None]:
from transformers import AutoTokenizer

# Read the whole document into memory. errors="ignore" means bytes that are not
# valid UTF-8 are dropped silently instead of raising.
source_path = "data/clear/24d.txt"
with open(source_path, mode="r", encoding="utf-8", errors="ignore") as f:
    text = f.read()

# Load the tokenizer of the Vietnamese ELECTRA NER checkpoint (fast implementation).
model_name = "NlpHUST/ner-vietnamese-electra-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Encode the full text in one call and count the produced ids.
# NOTE(review): with default arguments the count likely includes any special
# tokens the tokenizer adds -- confirm if an exact sub-word count matters.
encoding = tokenizer(text)
input_ids = encoding["input_ids"]
token_count = len(input_ids)
print(f"Total tokens (subword) according to the tokenizer: {token_count}")

In [None]:
import fitz 
from pdfminer.high_level import extract_text as extract_text_pdfminer
from pdf2image import convert_from_path
import pytesseract

def extract_text_fitz(pdf_path):
    """Extract the text of a PDF using PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of every page joined with newlines, or ``""`` when the
        document cannot be opened or read. A string is always returned so
        callers can safely use ``.strip()``, ``len()`` or ``file.write()``.
    """
    import logging

    try:
        # The context manager closes the document handle even if a page fails.
        with fitz.open(pdf_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort extractor: report the problem and hand back an empty
        # string instead of the exception object, which is not a str.
        logging.getLogger(__name__).warning(
            "fitz extraction failed for %s: %s", pdf_path, e
        )
        return ""

def extract_text_pdfminer_safe(pdf_path):
    """Extract the text of a PDF with pdfminer without letting errors escape.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Whatever ``extract_text_pdfminer`` returns for the file, or ``""``
        when it raises. A string is always returned so callers can safely use
        ``.strip()``, ``len()`` or ``file.write()``.
    """
    import logging

    try:
        return extract_text_pdfminer(pdf_path)
    except Exception as e:
        # Best-effort extractor: report the problem and hand back an empty
        # string instead of the exception object, which is not a str.
        logging.getLogger(__name__).warning(
            "pdfminer extraction failed for %s: %s", pdf_path, e
        )
        return ""

def extract_text_ocr(pdf_path, lang='vie'):
    """OCR a PDF: render each page to an image and run Tesseract on it.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Language code passed to ``pytesseract.image_to_string``
            (defaults to ``'vie'``).

    Returns:
        str: The recognised text of every page joined with newlines, or ``""``
        when rendering or recognition raises. A string is always returned so
        callers can safely use ``.strip()``, ``len()`` or ``file.write()``.
    """
    import logging

    try:
        # Pages are rendered at 300 dpi before recognition.
        pages = convert_from_path(pdf_path, dpi=300)
        return "\n".join(
            pytesseract.image_to_string(page, lang=lang) for page in pages
        )
    except Exception as e:
        # Best-effort extractor: report the problem and hand back an empty
        # string instead of the exception object, which is not a str.
        logging.getLogger(__name__).warning(
            "OCR extraction failed for %s: %s", pdf_path, e
        )
        return ""

def robust_pdf_to_text(pdf_path):
    """Convert a PDF to text.

    Only the OCR route is active: the value produced by ``extract_text_ocr``
    (called with its default language) is returned unchanged.

    The text-layer extractors ``extract_text_fitz`` and
    ``extract_text_pdfminer_safe`` were previously tried first, each accepted
    only when it yielded at least 500 non-blank characters. That fallback
    chain is currently disabled.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        Whatever ``extract_text_ocr(pdf_path)`` returns.
    """
    return extract_text_ocr(pdf_path)

# Batch driver: convert data/raw/2.pdf ... data/raw/25.pdf to sibling .txt files.
if __name__ == '__main__':
    pdf_path = 'data/raw/'
    for i in range(2, 26):
        in_path = pdf_path + str(i) + '.pdf'
        out_path = pdf_path + str(i) + '.txt'
        text = robust_pdf_to_text(in_path)

        # Check the result BEFORE opening the output file: opening with "w"
        # truncates it, and writing a non-str value raises TypeError, which
        # would abort the whole batch and leave an empty file behind.
        if not isinstance(text, str):
            print(f'Failed: {i} ({text!r})')
            continue

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)

        # Report blank output separately so it is not mistaken for a success.
        if text.strip():
            print(f'Success: {i}')
        else:
            print(f'Empty: {i} (no text extracted)')

In [None]:
import json
from pathlib import Path

# Convert every cleaned .txt file into a one-record JSON file.
# NOTE(review): both paths are absolute and machine-specific; consider deriving
# them from a configurable project root.
input_dir = Path("/home/hiwe/project/KeyValExtrator/data/clear")  # directory containing the .txt files
output_dir = Path("/home/hiwe/project/KeyValExtrator/data/result")  # directory receiving the .json files
# parents=True so a missing intermediate directory does not abort the cell.
output_dir.mkdir(parents=True, exist_ok=True)

# sorted() makes the processing and log order deterministic; glob order is arbitrary.
for txt_file in sorted(input_dir.glob("*.txt")):
    text = txt_file.read_text(encoding="utf-8").strip()
    # Each output file holds a list with a single record: the text and its source filename.
    record = {"text": text, "meta": {"filename": txt_file.name}}

    out_file = output_dir / (txt_file.stem + ".json")
    out_file.write_text(
        json.dumps([record], ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"✅ Đã tạo {out_file}")