In [1]:
pip install pdfplumber numpy openai redis tabulate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Cell 1: Import Libraries and Setup
import pdfplumber
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from openai import OpenAI
from redis import Redis
from redis.commands.search.field import VectorField, TextField
import numpy as np
import os

# OpenAI API setup
API_KEY = "sk-1b0e5d114e3b4bb1ac9dbef07a531b10"  # Replace with your actual API key
client = OpenAI(api_key=API_KEY, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")

# Redis setup
r = Redis()  # Assumes Redis is running on default host and port
INDEX_NAME = "PDFData_VLforTable"
VECTOR_DIM = 1024
DISTANCE_METRIC = "COSINE"

pdf_path = "ICBC_2024_FYR.pdf"


In [3]:
# Cell 2: PDF Content Extraction
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = "".join(page.extract_text() or "" for page in pdf.pages)
    return text

def detect_table_pages(pdf_path):
    """Detect pages containing tables using pdfplumber."""
    table_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            if page.extract_tables():  # Check if page has tables
                table_pages.append(page_num)
    return table_pages

def save_page_as_image(pdf_path, page_num, output_path):
    """Save a PDF page as a PNG image."""
    pdf_document = fitz.open(pdf_path)
    page = pdf_document[page_num]
    pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
    pix.save(output_path)
    pdf_document.close()

def extract_images_from_pdf(pdf_path):
    """Extract standalone images from PDF (excluding table pages)."""
    pdf_document = fitz.open(pdf_path)
    images = []
    table_pages = set(detect_table_pages(pdf_path))  # Avoid extracting images from table pages
    for page_num in range(len(pdf_document)):
        if page_num in table_pages:
            continue  # Skip pages with tables
        page = pdf_document[page_num]
        image_list = page.get_images(full=True)
        for img in image_list:
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            images.append(base_image["image"])
    pdf_document.close()
    return images

# Extract text and detect table pages
text = extract_text_from_pdf(pdf_path)
table_pages = detect_table_pages(pdf_path)
images = extract_images_from_pdf(pdf_path)

print(f"Extracted text length: {len(text)} characters")
print(f"Detected {len(table_pages)} pages with tables")
print(f"Extracted {len(images)} standalone images")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Extracted text length: 338123 characters
Detected 136 pages with tables
Extracted 17 standalone images


In [6]:
import base64
# Cell 3: Table Extraction with VL Model and Document Splitting
def split_text(text, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)
# Split text 
text_chunks = split_text(text)
print("First 20 text chunks:")
for chunk in text_chunks[:20]:
    print(chunk[:500] + "...")

def encode_image(image_path):
    """Encode image to base64."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def extract_table_from_page_image(page_num):
    """Use VL model to extract table from page image in Markdown format."""
    image_path = f"table_page_{page_num}.png"
    save_page_as_image(pdf_path, page_num, image_path)
    base64_image = encode_image(image_path)
    
    try:
        response = client.chat.completions.create(
            model="qwen2.5-vl-7b-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract the table from the image and return it in Markdown format. If no table is present, return an empty string. Do not include additional explanations."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            temperature=0.2
        )
        table_markdown = response.choices[0].message.content
        # Clean up image file
        os.remove(image_path)
        return table_markdown if table_markdown.strip() else ""
    except Exception as e:
        print(f"Error processing table on page {page_num}: {str(e)}")
        os.remove(image_path)
        return ""

# Process tables from detected table pages
table_chunks = []
for page_num in table_pages:
    table_markdown = extract_table_from_page_image(page_num)
    if table_markdown:
        table_chunks.append(table_markdown)
        print(f"Extracted table from page {page_num}: {table_markdown[:500]}...")

# 为所有保存的图片生成描述
image_descriptions = []
    
for i in range(len(images)):
    image_path = f"image_{i}.png"  # 与之前保存的图片路径一致
    base64_image = encode_image(image_path)
    try:
        response = client.chat.completions.create(
            model="qwen2.5-vl-7b-instruct",
            messages=[
                {"role": "user", "content": [
                    {"type": "text", "text": "提取图片中的信息，需要精准描述，不要漏掉信息，但是也不需要额外解释。若图片为饼状图、折现图和柱状图等，请使用饼状图、折现图和柱状图等关键词，并以 json 格式返回。若图片为表格，请使用表格的关键词，并以 markdown 格式返回。若图片为架构图、流程图等，请使用架构图、流程图等关键词，并以 mermaid 格式返回。"},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
            ],}]
        )
        description = response.choices[0].message.content
        image_descriptions.append(description)
        print(f"Generated description for image {i}: {description}")
    except Exception as e:
        print(f"Error processing image {i}: {str(e)}")
        image_descriptions.append("")
print(f"Split text into {len(text_chunks)} chunks")
print(f"Converted {len(table_chunks)} tables to Markdown")
print(f"Generated {len(image_descriptions)} image descriptions")

First 20 text chunks:
中国工商银行股份有限公司
（股票代码：601398）
2024 年度报告公司简介
中国工商银行成立于 1984年1月1日。2005年10月28日，本行整体改制
为股份有限公司。2006 年 10 月 27 日，本行成功在上交所和香港联交所同日挂
牌上市。
本行致力于建设中国特色世界一流现代金融机构，拥有优质的客户基础、多
元的业务结构、强劲的创新能力和市场竞争力。本行将服务作为立行之本，坚持
以服务创造价值，向全球超1,300万对公客户和 7.66亿个人客户提供丰富的金融
产品和优质的金融服务，以自身高质量发展服务经济社会高质量发展。本行自觉
将社会责任融入发展战略和经营管理活动，在服务制造业、发展普惠金融、支持
乡村振兴、发展绿色金融、支持公益事业等方面受到广泛赞誉。
本行始终聚焦主业，坚持服务实体经济的本源，与实体经济共荣共存、共担
风雨、共同成长；始终坚持风险为本，牢牢守住底线，不断提高控制和化解风险
的能力；始终坚持对商业银行经营规律的把握与遵循，致力于成为基业长青的银
行；始终坚持稳中求进、创新求进，持续深化重点发展战略，积极发展金融科技，
加快数字化转型；始终坚持专业专注，...
动能、多元化结构、生态化基础等“五化”转型。
中国工商银行股份有限公司2024年度报告（A股） 2使 命
提供卓越金融服务
服务客户 回报股东 成就员工 奉献社会
愿 景
全面建设具有中国特色的世界一流现代金融企业，
成为基业长青的银行
价值观
工于至诚 行以致远
诚信 人本 稳健 创新 卓越
中国工商银行股份有限公司2024年度报告（A股） 3目录
1. 释义 ........................................................................................................................ 5
2. 2024 年主要排名与奖项 ....................................................................................... 7
3. 重要提示 .....................................................

In [None]:
from tqdm.notebook import tqdm  # Jupyter专用进度条

# Cell 4: Vectorization (with progress bars)
def get_embedding(text, model="text-embedding-v3"):
    if not text.strip():
        return None
    response = client.embeddings.create(
        input=text[:8192*2],  # Truncate to approximate token limit
        model=model,
        dimensions=VECTOR_DIM
    )
    return response.data[0].embedding

# Generate embeddings with progress bars
print("Generating text embeddings...")
text_embeddings = [get_embedding(chunk) for chunk in tqdm(text_chunks, desc="Text chunks")]

print("\nGenerating table embeddings...")
table_embeddings = [get_embedding(chunk) for chunk in tqdm([c for c in table_chunks if c], desc="Tables")]

print("\nGenerating image embeddings...")
image_embeddings = [get_embedding(desc) for desc in tqdm(image_descriptions, desc="Images")]


In [None]:
# Cell 5: Data Storage in Redis
# Define index schema
md_embedding_field = VectorField(
    "md_embedding", "FLAT",
    {"TYPE": "FLOAT32", "DIM": VECTOR_DIM, "DISTANCE_METRIC": DISTANCE_METRIC}
)
content_field = TextField("content")
type_field = TextField("type")
fields_for_index = [content_field, type_field, md_embedding_field]

# Create index
try:
    r.ft(INDEX_NAME).create_index(fields=fields_for_index)
    print(f"Index '{INDEX_NAME}' created successfully")
except Exception as e:
    print(f"Index creation failed: {e}")

# Store data
def store_data(chunks, embeddings, data_type):
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        key = f"{INDEX_NAME}:{data_type}:{i}"
        mapping_data = {
            "md_embedding": np.array(embedding, dtype=np.float32).tobytes(),
            "content": chunk,
            "type": data_type
        }
        r.hset(key, mapping=mapping_data)

store_data(text_chunks, text_embeddings, "text")
store_data(table_chunks, table_embeddings, "table")
store_data(image_descriptions, image_embeddings, "image")

print("Data stored in Redis successfully")

In [None]:
import numpy as np
import json
from redis.commands.search.query import Query
# Search
# user_question = "工商银行2024年海外布局？"
user_question = "工商银行为什么被成为”宇宙行“？"

# Helper functions
def json_gpt(input: str):
    completion = client.chat.completions.create(
        model="qwen2.5-7b-instruct",
        messages=[
            {"role": "system", "content": "Output only valid JSON"},
            {"role": "user", "content": input},
        ],
        temperature=0.2,
    )

    text = completion.choices[0].message.content
    parsed = json.loads(text)

    return parsed


HA_INPUT = f"""
You are a financial report analysis assistant.
You have access to a search API that returns relevant sections from a financial report.
Generate a search query by extracting key words from the user's question.

User question: {user_question}

Format: {{"searchQuery": "search query"}}
"""
query_str = json_gpt(HA_INPUT)["searchQuery"]
print(query_str)

query_embedding = client.embeddings.create(input=query_str, model="text-embedding-v3", dimensions=1024, encoding_format="float")
query_vec = np.array(query_embedding.data[0].embedding, dtype=np.float32).tobytes()
# Prepare the query
k_nearest = 3  # Retrieve top 3 relevant chunks
query_base = (Query(f"*=>[KNN {k_nearest} @md_embedding $vec as score]").sort_by("score").return_fields("score", "content", "type").dialect(2))
query_param = {"vec": query_vec}
try:
    query_results = r.ft(INDEX_NAME).search(query_base, query_param).docs
    print(f"\nRetrieved {len(query_results)} results:")
    for i, doc in enumerate(query_results):
        print(f"\n--- Result {i+1} ---")
        print(f"Type: {doc.type}")
        print(f"Score: {doc.score}")
        print(f"Content (first 200 chars): {doc.content[:200]}...")
except Exception as e:
    print(f"Error executing vector query: {e}")
    query_results = []
# Prepare context for the LLM
context = "\n\n".join([doc.content for doc in query_results])

In [None]:
system_prompt = "你是一个金融报告分析专家。请根据搜索结果回答用户提问，注意，请务必首先依赖搜索结果，而不是你自己已有的知识。如果搜索结果不包含能够回答用户提问的信息，你可以说“抱歉，我无法回答这个问题”。"
messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": "用户提问：" + user_question},
            {"role": "user", "content": "搜索结果：" + context}]
response = client.chat.completions.create(
                    messages=messages,
                    model="qwen2.5-32b-instruct", #"qwen-max",
                    max_tokens=1000
                )
print(response.choices[0].message.content)