In [1]:
import cv2
from paddleocr import PaddleOCR, draw_ocr
from pdf2image import convert_from_path
import os

from paddlex import create_pipeline

import os
import glob
import json

from pdf2image import convert_from_path

import json
import os


from dotenv import load_dotenv

from openai import OpenAI



In [2]:
property = "finance"

# Change PDF to IMG

In [23]:

pdf_path_list = glob.glob(f'競賽資料集/reference/{property}/*')
output_folder_prefix = f'pdf_images/{property}'

for pdf_path in pdf_path_list:

    file_name = os.path.basename(pdf_path)
    output_folder = os.path.join(output_folder_prefix, file_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    images = convert_from_path(os.path.join(pdf_path))
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f'page_{i+1}.jpg')
        image.save(image_path, 'JPEG')


# OCR

In [68]:
import os
from paddlex import create_pipeline
from opencc import OpenCC
from tqdm import tqdm

cc = OpenCC('s2t')  

pipeline = create_pipeline(pipeline="OCR")

img_dir = f'pdf_images/{property}'

output_text_dict = {}

start_index = 750
for i, (root, dirs, files) in enumerate(tqdm(list(os.walk(img_dir)), desc="Processing directories")):
    if i < start_index:
        continue
    files.sort()
    for file in files:
        if file.endswith(('.jpg', '.jpeg', '.png')):
            img_path = os.path.join(root, file)
            
            parent_folder = os.path.basename(os.path.dirname(img_path))

            if parent_folder not in output_text_dict:
                output_text_dict[parent_folder] = []
                print(parent_folder)
            else:
                break

            output = pipeline.predict(img_path)
            for res in output:
                traditional_text = [cc.convert(text) for text in res["rec_text"]]
                output_text_dict[parent_folder].append({"text":traditional_text, "input_path":res["input_path"]})

[32mUsing official model (PP-OCRv4_mobile_det), the model files will be be automatically downloaded and saved in /Users/llin07/.paddlex/official_models.[0m
[32mUsing official model (PP-OCRv4_mobile_rec), the model files will be be automatically downloaded and saved in /Users/llin07/.paddlex/official_models.[0m
Processing directories: 100%|██████████| 1036/1036 [00:00<00:00, 1312779.14it/s]


In [67]:
output_file_path = f"output/ocr_{property}_output.json"
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(output_text_dict, f, ensure_ascii=False, indent=4)

print(f"OCR results saved to {output_file_path}")

OCR results saved to output/ocr_finance_output.json


# convert ocr result to mackdown

In [114]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [115]:
with open(f"output/ocr_{property}_output.json", 'r', encoding='utf-8') as f:
    ocr_output_dict = json.load(f)

In [116]:
tasks = []
file_paths = [] 

for documents in ocr_output_dict:
    for page in ocr_output_dict[documents]:
        text_list = page["text"]
        text_path = page["input_path"].replace('pdf_images/', 'pdf_md/').replace('.jpg', '.md')
        
    

        prompt = (
                        f"Convert the OCR results into Markdown format. "
                        f"The input is a list of text extracted by the OCR model, containing both free text and tables from {property}-related documents. "
                        f"Please transform this list into Markdown format, correcting any typos you find.\n"
                        f"Output the result in JSON format, like this: {{'markdown': 'generate markdown content here'}}.\n"
                        f"Input text: {text_list}\n"
                    )
        
        task = {
            "custom_id": text_path,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "temperature": 0.1,
                "response_format": { 
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
            }
        }
        
        tasks.append(task)

In [112]:
file_name = "batch_tasks_ocr2md.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

In [117]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)
print(batch_file)

In [119]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

## Get result from onenapi

In [123]:
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_6717a41d61f881909087b5ab379ee8cb', completion_window='24h', created_at=1729602589, endpoint='/v1/chat/completions', input_file_id='file-PEWgHdaExlbbwYX77ZPZ8B3P', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1729608922, error_file_id='file-ixliCTzQoOKUZww8adXlHKeU', errors=None, expired_at=None, expires_at=1729688989, failed_at=None, finalizing_at=1729607701, in_progress_at=1729602595, metadata=None, output_file_id='file-bnGjb49yAOuLSyhcnoUSnROd', request_counts=BatchRequestCounts(completed=4473, failed=2, total=4475))


In [124]:
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

In [126]:
result_file_name = "batch_job_results_ocr2md.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [138]:
def extract_markdown(s):
    s = s.strip("'").rstrip(",").strip()
    try:
        data = json.loads(s)
        markdown_content = data.get('markdown', '')
        markdown_content = markdown_content.replace('\\n', '\n')
        return markdown_content
    except json.JSONDecodeError as e:
        print("解析 JSON 時發生錯誤：", e)
        return ''


In [139]:
results = []
with open(result_file_name, 'r') as file:
    for line in file:
        json_object = json.loads(line.strip())

        file_path = json_object["custom_id"]
        try:
            markdown_content = json.loads(json_object["response"]["body"]["choices"][0]["message"]["content"])["markdown"]
        except:
            markdown_content = extract_markdown(json_object["response"]["body"]["choices"][0]["message"]["content"])

        dir_path = os.path.dirname(file_path)
        os.makedirs(os.path.dirname(dir_path), exist_ok=True)
        
        with open(file_path, 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_content)

        results.append({
            'file_path': file_path,
            'markdown_content': markdown_content
        })

print(f"Processed and saved {len(results)} files.")

解析 JSON 時發生錯誤： Unterminated string starting at: line 2 column 15 (char 16)
解析 JSON 時發生錯誤： Unterminated string starting at: line 2 column 15 (char 16)
Processed and saved 4473 files.


# extraction keyword

In [10]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [53]:
import os
import glob

tasks = []
file_paths = [] 

# Define the directory path
pdf_md_finance_dir = 'pdf_md/finance/*'

# Read all files in the pdf_md/finance directory
file_paths = glob.glob(os.path.join(pdf_md_finance_dir, '*'))

for file in file_paths:

    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    
    keyword_extraction_prompt = (
        f"Please extract keywords from the input text, which may be in English or Traditional Chinese. "
        f"The input text could be a sentence or content in Markdown format. "
        f"Extract the keywords and classify them into the following categories: Person, Place, Company, Time, or {property} terminology."
        f"Category Descriptions:"
        f"Person: Names of individuals"
        f"Place: e.g., Taipei"
        f"Company: Names of companies in Taiwan, e.g., 長榮, 亞德客-KY, or 光寶科技股份有限公司"
        f"Time: e.g., 2022年第3季"
        f"{property} terminology: Specific terms, e.g., 要保人, 受益人, 綜合損益總額, 淨現金流出, 合併權益變動表, 資產總額, 合併資產總額, 營業利益"
        f"\nPlease output in JSON format, like this example: {{'Person': [], 'Place': [], 'Company': ['長榮'], 'Time': [], '{{property}} terminology': ['綜合損益總額']}}"
        f"The input text is {text}"
    )

    
    task = {
            "custom_id": file,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "temperature": 0.1,
                "response_format": { 
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "user",
                        "content": keyword_extraction_prompt
                    }
                ],
            }
        }
        
    tasks.append(task)
    


In [58]:
# Split tasks into 3 parts
num_tasks = len(tasks)
chunk_size = num_tasks // 3
chunks = [tasks[i:i + chunk_size] for i in range(0, num_tasks, chunk_size)]

# Ensure we have exactly 3 chunks by adjusting the last chunk if necessary
if len(chunks) > 3:
    chunks[2].extend(chunks[3:])
    chunks = chunks[:3]

for i, chunk in enumerate(chunks, 1):
    file_name = f"batch_tasks_md2keyword_part{i}.jsonl"
    with open(file_name, 'w') as file:
        for obj in chunk:
            file.write(json.dumps(obj) + '\n')

In [76]:
file_name = "batch_tasks_md2keyword_part3.jsonl"

batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)
print(batch_file)

FileObject(id='file-pEXNrFf9skPR8aQyvKK9EEOE', bytes=7053968, created_at=1729849706, filename='batch_tasks_md2keyword_part3.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [77]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [78]:
batch_job.id

'batch_671b696ca2288190830d764842fcbf36'

## get results

In [84]:
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_671b696ca2288190830d764842fcbf36', completion_window='24h', created_at=1729849708, endpoint='/v1/chat/completions', input_file_id='file-pEXNrFf9skPR8aQyvKK9EEOE', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1729851327, error_file_id=None, errors=None, expired_at=None, expires_at=1729936108, failed_at=None, finalizing_at=1729850622, in_progress_at=1729849709, metadata=None, output_file_id='file-6Ab7dKU2m4ztk5fHw3R5jhBQ', request_counts=BatchRequestCounts(completed=1491, failed=0, total=1491))


In [85]:
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

In [86]:
result_file_name = "batch_job_results3_md2kw.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [94]:
from collections import defaultdict

split_file_list = ["batch_job_results1_md2kw.jsonl", "batch_job_results2_md2kw.jsonl", "batch_job_results3_md2kw.jsonl"]

results = defaultdict(list)
for result_file_name in split_file_list:
    with open(result_file_name, 'r') as file:
        for line in file:
            json_object = json.loads(line.strip())

            file_path = json_object["custom_id"]
            try:
                json_content = json.loads(json_object["response"]["body"]["choices"][0]["message"]["content"])
            except:
                print(f"ERROR to conver {file_path}")

            dir_path = os.path.dirname(file_path)

            results[os.path.dirname(file_path).replace('pdf_md/', '')].append({
                'page': os.path.basename(file_path),
                'kw': json_content
            })

print(f"Processed and saved {len(results)} files.")

Processed and saved 1035 files.


In [97]:
# Save the results dictionary as a JSON file
output_file = 'output/finance_kw_extraction.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Results saved to {output_file}")

Results saved to output/finance_kw_extraction.json


# Extraction Query KW

In [None]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [98]:
file_path = '競賽資料集/dataset/preliminary/questions_example.json'

with open(file_path, 'r', encoding='utf-8') as file:
    questions = json.load(file)

In [101]:
questions["questions"][0]

{'qid': 1,
 'source': [442, 115, 440, 196, 431, 392, 14, 51],
 'query': '匯款銀行及中間行所收取之相關費用由誰負擔?',
 'category': 'insurance'}

In [112]:
import os
import glob

tasks = []
file_paths = [] 

# Define the directory path
pdf_md_finance_dir = 'pdf_md/finance/*'

# Read all files in the pdf_md/finance directory
file_paths = glob.glob(os.path.join(pdf_md_finance_dir, '*'))

for data in questions["questions"]:

    text = data["query"]
    
    keyword_extraction_prompt = (
        f"Please extract keywords from the input text, which may be in English or Traditional Chinese. "
        f"The input text could be a sentence or content in Markdown format. "
        f"Extract the keywords and classify them into the following categories: Person, Place, Company, Time, or {property} terminology."
        f"Category Descriptions:"
        f"Person: Names of individuals"
        f"Place: e.g., Taipei"
        f"Company: Names of companies in Taiwan, e.g., 長榮, 亞德客-KY, or 光寶科技股份有限公司"
        f"Time: e.g., 2022年第3季"
        f"{property} terminology: Specific terms, e.g., 要保人, 受益人, 綜合損益總額, 淨現金流出, 合併權益變動表, 資產總額, 合併資產總額, 營業利益"
        f"\nPlease output in JSON format, like this example: {{'Person': [], 'Place': [], 'Company': ['長榮'], 'Time': [], '{{property}} terminology': ['綜合損益總額']}}"
        f"The input text is {text}"
    )

    
    task = {
            "custom_id": str(data["qid"]),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "temperature": 0.1,
                "response_format": { 
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "user",
                        "content": keyword_extraction_prompt
                    }
                ],
            }
        }
        
    tasks.append(task)
    


In [113]:
file_name = "question2kw_batch.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

In [114]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)
print(batch_file)

FileObject(id='file-lRpYDjSe88bO8ovyv2S8hHmL', bytes=208652, created_at=1729952106, filename='question2kw_batch.jsonl', object='file', purpose='batch', status='processed', status_details=None)


In [115]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

## Get result

In [118]:
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_671cf96c72008190b0a2d95f05e1e314', completion_window='24h', created_at=1729952108, endpoint='/v1/chat/completions', input_file_id='file-lRpYDjSe88bO8ovyv2S8hHmL', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1729952186, error_file_id=None, errors=None, expired_at=None, expires_at=1730038508, failed_at=None, finalizing_at=1729952173, in_progress_at=1729952108, metadata=None, output_file_id='file-hUqKbSLBOTqWQXBF9VKBvvOk', request_counts=BatchRequestCounts(completed=150, failed=0, total=150))


In [119]:
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

In [121]:
result_file_name = "question2kw.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [122]:
results = {}
with open(result_file_name, 'r') as file:
    for line in file:
        json_object = json.loads(line.strip())

        custom_id = json_object["custom_id"]
        try:
            content = json.loads(json_object["response"]["body"]["choices"][0]["message"]["content"])
        except:
            print(f"Error convert json {custom_id}")



        results[custom_id] = content

print(f"Processed and saved {len(results)} files.")

Processed and saved 150 files.


In [124]:
with open('output/finance_question_kw.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print(f"Results saved to output/finance_question_kw.json")

Results saved to output/finance_question_kw.json
