In [1]:
!pip install paddleocr paddlepaddle



In [2]:
import pandas as pd
from paddleocr import PaddleOCR

def parse_paddleocr_results(results):
    """
    Parse PaddleOCR output for a single image into a DataFrame.

    Only the first element of ``results`` (the first image/page) is parsed.
    Three layouts of that element are understood:

    * a mapping holding parallel lists under ``rec_texts`` / ``rec_scores`` /
      ``rec_polys`` (falling back to ``rec_boxes`` when ``rec_polys`` is absent);
    * a list of per-line dicts with ``transcription`` / ``score`` / ``poly`` keys;
    * the legacy list of ``(coords, (text, score))`` pairs.

    Lines in any other shape are skipped instead of raising.

    Args:
        results: The sequence returned by the OCR call. ``None``, an empty
            sequence, or a first element of ``None`` are treated as "no text
            found".

    Returns:
        pandas.DataFrame with columns ``['text', 'confidence', 'coords']``,
        one row per recognised text line. The columns are present even when
        there are no rows.
    """
    columns = ["text", "confidence", "coords"]
    parsed_data = []

    # results[0] is the result for a single image; guard against missing/empty output.
    page = results[0] if results else None
    if page is None:
        return pd.DataFrame(parsed_data, columns=columns)

    # Mapping of parallel lists. Iterating such a mapping would only yield its
    # string keys, so it has to be handled before the per-line loop below.
    if isinstance(page, dict) and "rec_texts" in page:
        scores = page.get("rec_scores")
        coords_seq = page.get("rec_polys")
        if coords_seq is None:  # explicit None test: the value may be an array
            coords_seq = page.get("rec_boxes")

        for idx, text in enumerate(page["rec_texts"]):
            has_score = scores is not None and idx < len(scores)
            has_coords = coords_seq is not None and idx < len(coords_seq)
            parsed_data.append({
                "text": text,
                "confidence": scores[idx] if has_score else 0.0,
                "coords": coords_seq[idx] if has_coords else [],
            })
        return pd.DataFrame(parsed_data, columns=columns)

    for line in page:
        if isinstance(line, dict):
            # Per-line dict format
            text = line.get('transcription', '')
            confidence = line.get('score', 0.0)
            coords = line.get('poly', [])
        elif isinstance(line, (list, tuple)) and len(line) == 2:
            # Legacy format: (coords, (text, score))
            coords, info = line
            if not (isinstance(info, (list, tuple)) and len(info) == 2):
                continue  # malformed (text, score) pair: skip rather than raise
            text, confidence = info
        else:
            continue  # unknown format: skip to avoid raising

        parsed_data.append({
            "text": text,
            "confidence": confidence,
            "coords": coords
        })

    return pd.DataFrame(parsed_data, columns=columns)


# ==== Example usage ====
# Image to run OCR on. Kept in one named constant so the notebook can be pointed
# at a different file without editing the call below.
IMAGE_PATH = "/Users/danchenluo/Desktop/1.jpg"

# Build the OCR pipeline and run it on the image; `results` is consumed by later cells.
ocr = PaddleOCR(use_textline_orientation=True)
results = ocr.predict(IMAGE_PATH)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/danchenluo/.paddlex/official_models/PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/danchenluo/.paddlex/official_models/UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/danchenluo/.paddlex/official_models/PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/danchenluo/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mModel files already exist. Usi

In [5]:
# Take the first entry of `results`: a single image path was passed to predict(),
# and the following cell reads this entry's "rec_*" fields.
result = results[0]

In [7]:
# Walk the three parallel "rec_*" sequences together and print one summary line
# per recognised text region (confidence shown with two decimals).
recognised_lines = zip(
    result["rec_texts"],
    result["rec_scores"],
    result["rec_boxes"],
)
for rec_text, rec_score, rec_box in recognised_lines:
    print(f"Text: {rec_text}, Confidence: {rec_score:.2f}, Box: {rec_box}")

Text: tan woon yann, Confidence: 0.99, Box: [53 ... 33]
Text: BOOKTAK(TAMAN DAYA)SDNBHD, Confidence: 0.95, Box: [49 ... 93]
Text: 789417-W, Confidence: 0.99, Box: [200 ... 120]
Text: NO.5:55,57 &59,JALAN SAGU18, Confidence: 0.91, Box: [ 93 ... 152]
Text: TAMAN DAYA,, Confidence: 0.95, Box: [182 ... 175]
Text: 81100 JOHOR BAHRU,, Confidence: 0.91, Box: [148 ... 201]
Text: JOHOR., Confidence: 0.98, Box: [210 ... 221]
Text: Document No:TD01167104, Confidence: 0.95, Box: [ 27 ... 356]
Text: Date:, Confidence: 0.93, Box: [ 23 ... 392]
Text: 25/12/20188:13:39PM, Confidence: 0.98, Box: [150 ... 393]
Text: Cashier:, Confidence: 0.96, Box: [ 25 ... 420]
Text: MANIS, Confidence: 1.00, Box: [152 ... 415]
Text: Member:, Confidence: 0.95, Box: [ 24 ... 447]
Text: CASH BILL, Confidence: 0.96, Box: [182 ... 486]
Text: CODE/DESC, Confidence: 1.00, Box: [  1 ... 538]
Text: PRICE, Confidence: 0.98, Box: [190 ... 533]
Text: Disc, Confidence: 0.97, Box: [275 ... 530]
Text: AMOUN, Confidence: 0.95, Box: [3