In [2]:
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import os

def extract_text_from_pdf(pdf_path, output_folder, dpi=400, lang='vie'):
    """
    Extract text from a PDF file and save it to a text file.

    Args:
        pdf_path (str): Path to the input PDF file.
        output_folder (str): Folder to save the output text file.
        dpi (int): Resolution for converting PDF to images. Default is 400.
        lang (str): Language for OCR. Default is 'vie' (Vietnamese).
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Derive base name from the PDF file path
    base_name = os.path.basename(pdf_path).split('.')[0]

    # Convert PDF to images
    images = convert_from_path(pdf_path, dpi=dpi)

    # Extract text from each image
    text_pages = []
    for image in images:
        text = pytesseract.image_to_string(image, lang=lang)
        text_pages.append(text)

    # Write extracted text to the output file
    txt_path = os.path.join(output_folder, f"{base_name}.txt")
    with open(txt_path, "w", encoding="utf-8") as text_file:
        for page in text_pages:
            text_file.write(page + "\n\n")

    return txt_path

# Example usage
pdf_path = "D:\\LLM_From_Scratch\\data\\6.-QD-191_2011_Quy-dinh-che-do-chinh-sach.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\6.txt


In [3]:
pdf_path = "D:\\LLM_From_Scratch\\data\\7.-TB.486-MGHP-04112014-thong-bao-P.CTSV_.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\7.txt


In [5]:
pdf_path = "D:\\LLM_From_Scratch\\data\\8.-ND-81.signed.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\8.txt


In [6]:
pdf_path = "D:\\LLM_From_Scratch\\data\\10.-Noi-Quy-SV-truong-DHQT-cap-nhat-08-2009.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\10.txt


In [7]:
pdf_path = "D:\\LLM_From_Scratch\\data\\11.QD-586-QD-DHQT-_-Quy-tac-ung-xu-nguoi-hoc.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\11.txt


In [8]:
pdf_path = "D:\\LLM_From_Scratch\\data\\19.-2016-Quy-che-CTSV-Bo-GD-DT.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\19.txt


In [9]:
pdf_path = "D:\\LLM_From_Scratch\\data\\20.-2019-Quy-che-CTSV-DHQG-HCM.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\20.txt


In [10]:
pdf_path = "D:\\LLM_From_Scratch\\data\\22.-2019-Quy-che-DGKQRL-DHQG-HCM.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\22.txt


In [11]:
pdf_path = "D:\\LLM_From_Scratch\\data\\23.-QD1133_22815_VNU_ban-hanh-Quy-dinh-khen-thuong-HSSV-1.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\23.txt


In [12]:
pdf_path = "D:\\LLM_From_Scratch\\data\\24.-2013-Quy-che-CTSV-noi-tru.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\24.txt


In [13]:
pdf_path = "D:\\LLM_From_Scratch\\data\\24.-2013-Quy-che-CTSV-noi-tru.pdf"
output_folder = "D:\\LLM_From_Scratch\\processed_data"
txt_file = extract_text_from_pdf(pdf_path, output_folder)

print(f"Text file saved to: {txt_file}")

Text file saved to: D:\LLM_From_Scratch\processed_data\24.txt
