In [3]:
import os
import yaml
import fitz
import pandas as pd

from tqdm import tqdm
from layout.layout_detect_yolo import LayoutDetectionYOLO

## Convert PDF to Images


In [3]:
# Render every page of the input PDF to a PNG under images/<file_name>/ and
# collect the resulting paths in `image_list` (in page order).
file_name = "75YearsofEngineering"
input_pdf = f"inputs/{file_name}.pdf"
image_dir = f"images/{file_name}"

# The document handle stays open under the name `pdf`, as before, so any
# later cell that still reads from it keeps working.
pdf = fitz.open(input_pdf)

# makedirs(exist_ok=True) creates the parent "images" folder when it is
# missing and is a no-op when the target already exists. The previous
# os.listdir("images") check raised FileNotFoundError on a fresh checkout.
os.makedirs(image_dir, exist_ok=True)

image_list = []

for page in tqdm(pdf, desc="Converting PDF to images"):
    path_out = f"{image_dir}/{page.number}.png"

    # Pages rendered by a previous run are reused instead of re-rendered.
    if not os.path.exists(path_out):
        page.get_pixmap().save(path_out)

    image_list.append(path_out)

Converting PDF to images: 100%|██████████| 301/301 [00:00<00:00, 4776.53it/s]


## Layout Prediction


In [4]:
# Read the layout-detector settings from the YAML config file.
with open('configs/layout_yolo.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Build the detector from the "model_config" section of that file.
model_config = config["model_config"]
layout_detector = LayoutDetectionYOLO(model_config)

# Run detection over every page image.
# NOTE(review): the second argument is presumably an output location for the
# detector's own results - confirm against LayoutDetectionYOLO.predict.
results_path = f"results/{file_name}"
predictions = layout_detector.predict(image_list, results_path)

Predicting layout: 100%|██████████| 301/301 [04:21<00:00,  1.15it/s]


## Save Bounding Boxes


In [None]:
# Flatten the per-page predictions into two record lists:
#   doc_images - one record per page holding the full page image
#   doc_bbs    - one record per detected box holding its crop and class info
doc_bbs = []
doc_images = []

# Show the class-id -> class-name mapping reported by the first prediction.
print(predictions[0].names)

for page_idx, prediction in enumerate(predictions):
    class_names = prediction.names
    page_image = prediction.orig_img

    page_record = {
        "id": f"{file_name}_{page_idx}",
        "page": page_idx,
        "image": page_image,
    }
    doc_images.append(page_record)

    for box_idx, box in enumerate(prediction.boxes):
        # tolist() yields a nested list; its first entry holds the four
        # corner coordinates, truncated to ints so they can be used as
        # slice bounds on the page image.
        corners = box.xyxy.tolist()
        left, top, right, bottom = (int(value) for value in corners[0])
        class_id = int(box.cls)

        box_record = {
            "id": f"{file_name}_{page_idx}_{box_idx}",
            "page": page_idx,
            "xyxy": corners,
            "image_array": page_image[top:bottom, left:right],
            "type": class_id,
            "type_str": class_names[class_id],
            "type_conf": float(box.conf),
        }
        doc_bbs.append(box_record)

# Persist both tables as pickles under results/.
df_bbs = pd.DataFrame(doc_bbs)
df_bbs.to_pickle(f"results/{file_name}_bbs.pkl")

df_images = pd.DataFrame(doc_images)
df_images.to_pickle(f"results/{file_name}.pkl")

{0: 'title', 1: 'plain text', 2: 'abandon', 3: 'figure', 4: 'figure_caption', 5: 'table', 6: 'table_caption', 7: 'table_footnote', 8: 'isolate_formula', 9: 'formula_caption'}


## OCR


In [None]:
from paddleocr import PaddleOCR

# Layout class ids whose crops are NOT sent through OCR. Per the class map
# printed by the bounding-box cell these are: 3 = figure, 4 = figure_caption,
# 6 = table_caption, 9 = formula_caption.
# NOTE(review): captions are skipped while table (5) and isolate_formula (8)
# crops are OCR'd - confirm this selection is intended.
SKIP_OCR_TYPES = {3, 4, 6, 9}

# Marker stored for rows that were skipped (shows up as NaN in the frame).
MISSING_TEXT = float("nan")

ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

# Results are gathered in a list and assigned to the frame once after the
# loop. Writing into df_bbs while iterrows() is walking it is discouraged by
# pandas, and the old per-row write never created the "ocr_text" column when
# every row was skipped, which made the print below raise KeyError.
ocr_texts = []

with tqdm(total=df_bbs.shape[0], desc="OCR") as pbar:
    for _, row in df_bbs.iterrows():
        pbar.update(1)

        crop = row["image_array"] if "image_array" in row else None

        # Skip rows without a crop, rows of an excluded layout class, and
        # crops that have zero height or zero width.
        if (
            crop is None
            or row["type"] in SKIP_OCR_TYPES
            or crop.shape[0] < 1
            or crop.shape[1] < 1
        ):
            ocr_texts.append(MISSING_TEXT)
            continue

        # `or []` guards against the OCR call returning None; empty/None
        # entries inside the result are skipped by the `if line` filter.
        results = ocr.ocr(crop, cls=True) or []
        words = [word[1][0] for line in results if line for word in line]

        # Joining with single spaces avoids the leading space that the
        # previous `string += f" {word}"` accumulation put on every text.
        text = " ".join(words)

        pbar.set_postfix({"Text": text[:10]})
        ocr_texts.append(text)

# One entry was appended per row, so the list lines up positionally.
df_bbs["ocr_text"] = ocr_texts

print(df_bbs["ocr_text"].head())

df_bbs.to_pickle(f"results/{file_name}_ocr.pkl")
df_bbs.to_excel(f"results/{file_name}_ocr.xlsx")

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to C:\Users\33237/.paddleocr/whl\det\en\en_PP-OCRv3_det_infer\en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:00<00:00, 5.37MiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to C:\Users\33237/.paddleocr/whl\rec\en\en_PP-OCRv4_rec_infer\en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:01<00:00, 9.46MiB/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to C:\Users\33237/.paddleocr/whl\cls\ch_ppocr_mobile_v2.0_cls_infer\ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:00<00:00, 4.00MiB/s]
OCR: 100%|██████████| 1988/1988 [31:22<00:00,  1.06it/s, Text=]          


0                         NaN
1                            
2     75 Years of Engineering
3     75 Years of Engineering
4                         NaN
Name: ocr_text, dtype: object


: 

In [None]:
file_name = "75YearsofEngineering"

# Reload the bounding-box table written by the "Save Bounding Boxes" cell.
# The whole frame is kept in df_bbs; previously `.head()` was applied before
# the assignment, which silently shrank df_bbs to five rows for every cell
# that ran afterwards.
# NOTE: this replaces any in-memory df_bbs (including an "ocr_text" column)
# with the pre-OCR table stored in the *_bbs.pkl file.
# Only load pickle files produced by this notebook - unpickling untrusted
# files can execute arbitrary code.
df_bbs = pd.read_pickle(f"results/{file_name}_bbs.pkl")

# Preview only: the first rows are displayed as the cell's output.
df_bbs.head()

Unnamed: 0,id,page,xyxy,image_array,type,type_str,type_conf
0,75YearsofEngineering_0_0,0,"[[24.594837188720703, 190.00843811035156, 547....","[[[180, 133, 111], [180, 133, 111], [182, 135,...",3,figure,0.828285
1,75YearsofEngineering_0_1,0,"[[245.9947509765625, 20.625307083129883, 324.8...","[[[178, 129, 111], [177, 127, 109], [178, 128,...",2,abandon,0.756994
2,75YearsofEngineering_0_2,0,"[[77.5875244140625, 125.02650451660156, 501.71...","[[[180, 130, 112], [181, 130, 113], [179, 129,...",0,title,0.474311
3,75YearsofEngineering_0_3,0,"[[77.5875244140625, 125.02650451660156, 501.71...","[[[180, 130, 112], [181, 130, 113], [179, 129,...",1,plain text,0.294476
4,75YearsofEngineering_1_0,1,"[[188.08712768554688, 209.71755981445312, 412....","[[[255, 255, 255], [255, 255, 255], [255, 255,...",3,figure,0.907374


In [11]:
# Display the first five bounding-box rows as this cell's output.
df_bbs.head(n=5)

Unnamed: 0,id,page,xyxy,image_array,type,type_str,type_conf
0,75YearsofEngineering_0_0,0,"[[24.594837188720703, 190.00843811035156, 547....","[[[180, 133, 111], [180, 133, 111], [182, 135,...",3,figure,0.828285
1,75YearsofEngineering_0_1,0,"[[245.9947509765625, 20.625307083129883, 324.8...","[[[178, 129, 111], [177, 127, 109], [178, 128,...",2,abandon,0.756994
2,75YearsofEngineering_0_2,0,"[[77.5875244140625, 125.02650451660156, 501.71...","[[[180, 130, 112], [181, 130, 113], [179, 129,...",0,title,0.474311
3,75YearsofEngineering_0_3,0,"[[77.5875244140625, 125.02650451660156, 501.71...","[[[180, 130, 112], [181, 130, 113], [179, 129,...",1,plain text,0.294476
4,75YearsofEngineering_1_0,1,"[[188.08712768554688, 209.71755981445312, 412....","[[[255, 255, 255], [255, 255, 255], [255, 255,...",3,figure,0.907374


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier passed to both the model and the tokenizer loaders.
model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"

# dtype and device placement are both left on "auto".
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two-message conversation: a fixed system message followed by the user prompt.
prompt = "write a quick sort algorithm."
system_message = {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}
user_message = {"role": "user", "content": prompt}
messages = [system_message, user_message]

# Render the conversation through the chat template as text (tokenize=False)
# with the generation prompt appended, then tokenize it as a batch of one
# and move the tensors to the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate at most 512 new tokens.
output_sequences = model.generate(**model_inputs, max_new_tokens=512)

# From each output sequence, drop the leading positions that correspond to
# the length of its input, keeping only what follows the prompt.
generated_ids = [
    sequence[len(prompt_ids):]
    for prompt_ids, sequence in zip(model_inputs.input_ids, output_sequences)
]

# Decode the batch and keep the first (only) entry.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards:   0%|          | 0/14 [02:12<?, ?it/s]


KeyboardInterrupt: 