In [1]:
import requests
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Text file listing the PDF URLs to process, and the root folder that
# receives every generated artifact.
url_file = "pdf_reports_urls.txt"
output_dir = "output"

# Ensure the output root and its "images" subfolder exist up front.
for _required_dir in (output_dir, os.path.join(output_dir, "images")):
    os.makedirs(_required_dir, exist_ok=True)

def fetch_pdf_from_url(url, timeout=60):
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the whole batch
            indefinitely.

    Returns:
        The response body as ``bytes``.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires (a subclass of ``Exception``, so existing
            ``except Exception`` callers still catch it).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch {url}, status code: {response.status_code}")
    return response.content

def extract_text_from_pdf_bytes(pdf_bytes, doc_name):
    """Run the magic_pdf pipeline on an in-memory PDF and return its markdown.

    The dataset is classified first; the OCR pipeline is used when
    ``classify()`` reports ``SupportedPdfParseMethod.OCR``, otherwise the
    text pipeline is used. Besides returning the text, the function asks the
    pipeline to emit the model/layout/span PDFs, the markdown file, the
    content list JSON and the middle JSON, all named after *doc_name*.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        doc_name: Base name used for every artifact produced for this PDF.

    Returns:
        The extracted markdown as a single string.
    """
    local_image_dir = os.path.join(output_dir, "images")
    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(output_dir)

    ds = PymuDocDataset(pdf_bytes)

    # Pick the OCR or plain-text pipeline based on the dataset's own verdict.
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Visual debugging artifacts.
    infer_result.draw_model(os.path.join(output_dir, f"{doc_name}_model.pdf"))
    pipe_result.draw_layout(os.path.join(output_dir, f"{doc_name}_layout.pdf"))
    pipe_result.draw_span(os.path.join(output_dir, f"{doc_name}_spans.pdf"))

    # Structured outputs; "images" is the image path prefix passed to the dumps.
    pipe_result.dump_md(md_writer, f"{doc_name}.md", "images")
    pipe_result.dump_content_list(md_writer, f"{doc_name}_content_list.json", "images")
    pipe_result.dump_middle_json(md_writer, f'{doc_name}_middle.json')

    # Get extracted markdown content (this is the actual extracted text).
    md_content = pipe_result.get_markdown("images")

    # A str must be returned as-is: "\n\n".join(<str>) would iterate over its
    # characters and insert a blank line between every one of them. Only a
    # real sequence of chunks is joined.
    if isinstance(md_content, str):
        return md_content
    return "\n\n".join(md_content)

def main():
    """Process every URL listed in ``url_file``.

    Blank lines in the list are ignored. Each remaining URL is downloaded,
    run through the extraction pipeline, and its text is written to
    ``document_<n>.txt`` under ``output_dir`` (numbering starts at 1). A
    failure on one URL is reported and does not stop the remaining ones.
    """
    with open(url_file, "r", encoding="utf-8") as url_list:
        stripped_lines = (raw.strip() for raw in url_list)
        urls = [entry for entry in stripped_lines if entry]

    for position, url in enumerate(urls, start=1):
        doc_name = f"document_{position}"
        txt_output_path = os.path.join(output_dir, f"{doc_name}.txt")

        try:
            print(f"Fetching PDF from {url}...")
            pdf_bytes = fetch_pdf_from_url(url)

            print(f"Extracting text from {doc_name} (in-memory)...")
            extracted_text = extract_text_from_pdf_bytes(pdf_bytes, doc_name)

            # Each document gets its own plain-text file.
            with open(txt_output_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(extracted_text)

            print(f"Saved extracted text to {txt_output_path}\n")
        except Exception as err:
            # Per-document boundary: report and move on to the next URL.
            print(f"Error processing {url}: {err}")

    print("Processing complete")

# Run the batch only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


[93mimport tensorrt_llm failed, if do not use tensorrt, ignore this message[0m
[93mimport lmdeploy failed, if do not use lmdeploy, ignore this message[0m
Fetching PDF from https://www.responsibilityreports.com//Click/4101...


[32m2025-03-09 13:04:01.824[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m:[36m156[0m - [1mlang: None[0m


Extracting text from document_1 (in-memory)...


[32m2025-03-09 13:04:09.127[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m67[0m - [1mcid_count: 0, text_len: 9246, cid_chars_radio: 0.0[0m
[32m2025-03-09 13:04:09.144[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m78[0m - [1mDocAnalysis init, this may take some times, layout_model: doclayout_yolo, apply_formula: True, apply_ocr: False, apply_table: True, table_model: rapid_table, lang: None[0m
[32m2025-03-09 13:04:09.145[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m92[0m - [1musing device: cuda[0m
[32m2025-03-09 13:04:09.146[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m96[0m - [1musing models_dir: C:\Users\harry\.cache\huggingface\hub\models--opendatalab--PDF-Extract-Kit-1.0\snapshots\60416a2cabad3f7b7284b43ce37a99864484fba2/models[0m


CustomVisionEncoderDecoderModel init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
CustomMBartForCausalLM init
CustomMBartDecoder init


2025-03-09 13:04:33,044 - DownloadModel - DEBUG: C:\Users\harry\anaconda3\envs\MinerU\Lib\site-packages\rapid_table\models\slanet-plus.onnx already exists
[2025-03-09 13:04:33,044] [   DEBUG] download_model.py:34 - C:\Users\harry\anaconda3\envs\MinerU\Lib\site-packages\rapid_table\models\slanet-plus.onnx already exists
[32m2025-03-09 13:04:43.923[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m174[0m - [1mDocAnalysis init done![0m
[32m2025-03-09 13:04:43.923[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mcustom_model_init[0m:[36m128[0m - [1mmodel init cost: 34.79082369804382[0m
[32m2025-03-09 13:04:43.925[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mdoc_analyze[0m:[36m184[0m - [1mgpu_memory: 8 GB, batch_ratio: 2[0m
[32m2025-03-09 13:05:11.417[0m | [1mINFO    [0m | [36mmagic_pdf.model.batch_analyze[0m:[36m__call__[0m:[36m74[0m - [1mlayout time: 12

Saved extracted text to output\document_1.txt

Processing complete
