In [1]:
pip install pdfplumber numpy openai redis tabulate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Cell 1: Import Libraries and Setup
import pdfplumber
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from openai import OpenAI
from redis import Redis
from redis.commands.search.field import VectorField, TextField
import numpy as np
import os

# OpenAI API setup
API_KEY = "sk-1b0e5d114e3b4bb1ac9dbef07a531b10"  # Replace with your actual API key
client = OpenAI(api_key=API_KEY, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")

# Redis setup
r = Redis()  # Assumes Redis is running on default host and port
INDEX_NAME = "PDFData"
VECTOR_DIM = 1024
DISTANCE_METRIC = "COSINE"

pdf_path = "联想集团ESG解决方案手册.pdf"


In [4]:
# Cell 2: PDF Content Extraction
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = "".join(page.extract_text() or "" for page in pdf.pages)
    return text

def detect_table_pages(pdf_path):
    """Detect pages containing tables using pdfplumber."""
    table_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            if page.extract_tables():  # Check if page has tables
                table_pages.append(page_num)
    return table_pages

def save_page_as_image(pdf_path, page_num, output_path):
    """Save a PDF page as a PNG image."""
    pdf_document = fitz.open(pdf_path)
    page = pdf_document[page_num]
    pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
    pix.save(output_path)
    pdf_document.close()

def extract_images_from_pdf(pdf_path):
    """Extract standalone images from PDF (excluding table pages)."""
    pdf_document = fitz.open(pdf_path)
    images = []
    table_pages = set(detect_table_pages(pdf_path))  # Avoid extracting images from table pages
    for page_num in range(len(pdf_document)):
        if page_num in table_pages:
            continue  # Skip pages with tables
        page = pdf_document[page_num]
        image_list = page.get_images(full=True)
        for img in image_list:
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            images.append(base_image["image"])
    pdf_document.close()
    return images

# Extract text and detect table pages
text = extract_text_from_pdf(pdf_path)
table_pages = detect_table_pages(pdf_path)
images = extract_images_from_pdf(pdf_path)

print(f"Extracted text length: {len(text)} characters")
print(f"Detected {len(table_pages)} pages with tables")
print(f"Extracted {len(images)} standalone images")


Extracted text length: 22002 characters
Detected 34 pages with tables
Extracted 6 standalone images


In [5]:
import base64
# Cell 3: Table Extraction with VL Model and Document Splitting
def split_text(text, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    return splitter.split_text(text)
# Split text 
text_chunks = split_text(text)
print("First 20 text chunks:")
for chunk in text_chunks[:20]:
    print(chunk[:500] + "...")

def encode_image(image_path):
    """Encode image to base64."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")  


def extract_table_from_page_image(page_num):
    """Use VL model to extract table from page image in Markdown format."""
    image_path = f"table_page_{page_num}.png"
    save_page_as_image(pdf_path, page_num, image_path)
    base64_image = encode_image(image_path)
    
    try:
        response = client.chat.completions.create(
            model="qwen2.5-vl-7b-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract the table from the image and return it in Markdown format. If no table is present, return an empty string. Do not include additional explanations."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            temperature=0.2
        )
        table_markdown = response.choices[0].message.content
        return table_markdown if table_markdown.strip() else ""
    except Exception as e:
        print(f"Error processing table on page {page_num}: {str(e)}")
        return ""

# Process tables from detected table pages
table_chunks = []
for page_num in table_pages:
    table_markdown = extract_table_from_page_image(page_num)
    if table_markdown:
        table_chunks.append(table_markdown)
        print(f"Extracted table from page {page_num}: {table_markdown[:500]}...")

# 为所有保存的图片生成描述
image_descriptions = []
for i, image in enumerate(images):
    image_path = f"image_{i}.png"
    with open(image_path, "wb") as f:
        f.write(image)
    base64_image = encode_image(image_path)
    try:
        response = client.chat.completions.create(
            model="qwen2.5-vl-32b-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "提取图片中的信息，需要精准描述，不要漏掉信息，但是也不需要额外解释。若图片为饼状图、折现图和柱状图等，请使用饼状图、折现图和柱状图等关键词，并以 json 格式返回。若图片为表格，请使用表格的关键词，并以 markdown 格式返回。若图片为架构图、流程图等，请使用架构图、流程图等关键词，并以 mermaid 格式返回。"
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            temperature=0.2
        )
        description = response.choices[0].message.content
        image_descriptions.append(description)
        print(f"Generated description for image {i}: {description[:200]}...")
    except Exception as e:
        print(f"Error processing image {i}: {str(e)}")
        image_descriptions.append("")

print(f"Split text into {len(text_chunks)} chunks")
print(f"Converted {len(table_chunks)} tables to Markdown")
print(f"Generated {len(image_descriptions)} image descriptions")

First 20 text chunks:
联想可持续发展解决方案及服务
增强企业社会责任、共创绿色可持续未来
联想 智慧中国戴 炜
联想集团高级副总裁
中国方案服务业务群总经理
人工智能犹如历史上的蒸汽机、电力、计算机和互联网等通用技术一样，正在成为影响人类未来的
基础动力。近十年来，人工智能正以史无前例的进化速度，不断实现自我进化，同时也在迅速改变着
社会生产力结构和经济发展，逐步成为驱动新一轮科技和产业变革的重要源泉，使得第四次工业革
命也就此进入了全新阶段。
企业的数字化进程持续了几十年，从全球的范围看，各个行业中的领先企业也快速进入了智能化阶
AI
段，人工智能的应用在企业市场寻找到了更具价值的场景，而随着生成式 的诞生和全球范围的应
用，企业及社会机构的数字化、智能化进程也发生了巨大的改变。企业的信息系统架构及应用的开
发、部署、实施和应用都因生成式人工智能的发展面临着前所未有的变革机遇。
AI
企业的信息系统架构经过了传统架构到云架构，发展至今正在面临第三代架构的变革挑战，基于
AI
原生的企业信息架构是支持未来企业 应用及大模型应用的基础的原则，企业信息系统的全面智
能化，将由此开启全新的产业进程。
联想从上世纪...
化运营水平和数字化创新能力，更大的价值正是基于企业大模型的应用，更加充分地释放企业的生
产力，激活企业潜能，成为我国新质生产发展的组成部分。
AI
联想方案服务希望能通过对自身服务及产品的介绍，在 新时代，助力我们的客户，更好地实现企
业的智能化转型。3S 2
一 联想 战略
2
二 联想方案服务历程
ESG 3
三 联想 实践
4
四 行业概述及挑战
五 联想可持续发展解决方案及服务
5
联想可持续发展解决方案及服务
ESG 6
联想 咨询服务
ESG Navigator 7
联想 乐循解决方案
环境
09
联想零碳服务
10
联想零立方服务
IT 11
联想 设备再生服务
IT ARS 11
联想 资产回收服务
IT ATS 12
联想 资产环保处置
PC 14
联想 官方翻新服务
15
联想零碳智慧园区解决方案
19
联想低碳数据中心解决方案
21
联想温水水冷服务器
社会
25
联想全球学习中心
公司治理
27
联想合规治理服务
29
六 最佳实践
ESG 33
七 联想 社会价值2022 - 2023
联想方案服务发展历程


In [6]:
from tqdm.notebook import tqdm  # Jupyter专用进度条

# Cell 4: Vectorization (with progress bars)
def get_embedding(text, model="text-embedding-v3"):
    if not text.strip():
        return None
    response = client.embeddings.create(
        input=text[:8192*2],  # Truncate to approximate token limit
        model=model,
        dimensions=VECTOR_DIM
    )
    return response.data[0].embedding

# Generate embeddings with progress bars
print("Generating text embeddings...")
text_embeddings = [get_embedding(chunk) for chunk in tqdm(text_chunks, desc="Text chunks")]

print("\nGenerating table embeddings...")
table_embeddings = [get_embedding(chunk) for chunk in tqdm([c for c in table_chunks if c], desc="Tables")]

print("\nGenerating image embeddings...")
image_embeddings = [get_embedding(desc) for desc in tqdm(image_descriptions, desc="Images")]


Generating text embeddings...


Text chunks:   0%|          | 0/28 [00:00<?, ?it/s]


Generating table embeddings...


Tables:   0%|          | 0/33 [00:00<?, ?it/s]


Generating image embeddings...


Images:   0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
# Cell 5: Data Storage in Redis
# Define index schema
md_embedding_field = VectorField(
    "md_embedding", "FLAT",
    {"TYPE": "FLOAT32", "DIM": VECTOR_DIM, "DISTANCE_METRIC": DISTANCE_METRIC}
)
content_field = TextField("content")
type_field = TextField("type")
fields_for_index = [content_field, type_field, md_embedding_field]

# Create index
try:
    r.ft(INDEX_NAME).create_index(fields=fields_for_index)
    print(f"Index '{INDEX_NAME}' created successfully")
except Exception as e:
    print(f"Index creation failed: {e}")

# Store data
def store_data(chunks, embeddings, data_type):
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        key = f"{INDEX_NAME}:{data_type}:{i}"
        mapping_data = {
            "md_embedding": np.array(embedding, dtype=np.float32).tobytes(),
            "content": chunk,
            "type": data_type
        }
        r.hset(key, mapping=mapping_data)

store_data(text_chunks, text_embeddings, "text")
store_data(table_chunks, table_embeddings, "table")
store_data(image_descriptions, image_embeddings, "image")

print("Data stored in Redis successfully")

Index creation failed: Index already exists
Data stored in Redis successfully


In [14]:
import numpy as np
import json
from redis.commands.search.query import Query
# Search
# user_question = "工商银行2024年海外布局？"
user_question = "工商银行为什么被成为”宇宙行“？"

# Helper functions
def json_gpt(input: str):
    completion = client.chat.completions.create(
        model="qwen2.5-7b-instruct",
        messages=[
            {"role": "system", "content": "Output only valid JSON"},
            {"role": "user", "content": input},
        ],
        temperature=0.2,
    )

    text = completion.choices[0].message.content
    parsed = json.loads(text)

    return parsed


HA_INPUT = f"""
You are a financial report analysis assistant.
You have access to a search API that returns relevant sections from a financial report.
Generate a search query by extracting key words from the user's question.

User question: {user_question}

Format: {{"searchQuery": "search query"}}
"""
query_str = json_gpt(HA_INPUT)["searchQuery"]
print(query_str)

query_embedding = client.embeddings.create(input=query_str, model="text-embedding-v3", dimensions=1024, encoding_format="float")
query_vec = np.array(query_embedding.data[0].embedding, dtype=np.float32).tobytes()
# Prepare the query
k_nearest = 3  # Retrieve top 3 relevant chunks
query_base = (Query(f"*=>[KNN {k_nearest} @md_embedding $vec as score]").sort_by("score").return_fields("score", "content", "type").dialect(2))
query_param = {"vec": query_vec}
try:
    query_results = r.ft(INDEX_NAME).search(query_base, query_param).docs
    print(f"\nRetrieved {len(query_results)} results:")
    for i, doc in enumerate(query_results):
        print(f"\n--- Result {i+1} ---")
        print(f"Type: {doc.type}")
        print(f"Score: {doc.score}")
        print(f"Content (first 200 chars): {doc.content[:200]}...")
except Exception as e:
    print(f"Error executing vector query: {e}")
    query_results = []
# Prepare context for the LLM
context = "\n\n".join([doc.content for doc in query_results])

工商银行 宇宙行

Retrieved 3 results:

--- Result 1 ---
Type: text
Score: 0.391143679619
Content (first 200 chars): 电话：0755-82246400 传真：0571-87808207
传真：0755-82246247
四川分行 工银瑞信基金管理有限公司
地址：四川省成都市锦江区总府路45号 地址：北京市西城区金融大街5号新盛大厦
邮编：610020 A座
电话：028-82866000 邮编：100033
传真：028-82866025 电话：010-66583349
传真：010-66583158
天津分行
...

--- Result 2 ---
Type: text
Score: 0.393015921116
Content (first 200 chars): 有限公司
工银国际控股有限公司 投资银行 59.63亿港元 7,016.21 1,058.98 2.70
中国工商银行（澳门）
商业银行 5.89亿澳门元 46,577.07 3,913.78 18.47
股份有限公司
中国工商银行（印度尼 3.71万亿
商业银行 3,147.68 449.06 29.07
西亚）有限公司 印尼盾
中国工商银行马来西亚
商业银行 8.33亿林吉特 1,190.99...

--- Result 3 ---
Type: text
Score: 0.39324349165
Content (first 200 chars): 已发行股本/
股权比例% 实收资本面值
2024年 2023年 2024年
公司名称 12月31日 12月31日 12月31日 本行投资额 注册地及成立日期 业务性质
通过设立或投资等方式
取得的主要子公司：
中国工商银行马来西 马来西亚吉隆坡
亚有限公司 100 100 8.33亿林吉特 8.33亿林吉特 2010年1月28日 商业银行
中国工商银行（阿拉木 哈萨克斯坦阿拉木图
图）股份公司 1...


In [15]:
system_prompt = "你是一个金融报告分析专家。请根据搜索结果回答用户提问，注意，请务必首先依赖搜索结果，而不是你自己已有的知识。如果搜索结果不包含能够回答用户提问的信息，你可以说“抱歉，我无法回答这个问题”。"
messages = [{"role": "system", "content": system_prompt},
            {"role": "user", "content": "用户提问：" + user_question},
            {"role": "user", "content": "搜索结果：" + context}]
response = client.chat.completions.create(
                    messages=messages,
                    model="qwen2.5-32b-instruct", #"qwen-max",
                    max_tokens=1000
                )
print(response.choices[0].message.content)

抱歉，我无法回答这个问题。提供的搜索结果中并没有关于工商银行被称为"宇宙行"的原因的信息。
