In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 加载PDF文件（请确保文件路径正确）
loader = PyPDFLoader("WHITEPAPER_Future_of_Sustainability_2025.pdf")
docs = loader.load()

# 使用文本切分器进行切分
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

# 提取每个块的文本内容
text_lines = [chunk.page_content for chunk in chunks]

# 打印前几个文本块以验证结果
for i, text in enumerate(text_lines[:3]):
    print(f"块 {i+1}:\n{text}\n{'-'*40}\n")



块 1:
The future of 
sustainability
Navigating trends 
and innovations for 
a sustainable tomorrow
FEBRUARY 2025 | MICHAEL HANF, LEAD SUSTAINABLE BUSINESS, VTT
----------------------------------------

块 2:
Michael Hanf (2025), The future of sustainability - Navigating 
trends and innovations for a sustainable tomorrow, 
VTT Technical Research Centre of Finland, Espoo, Finland.
Author: Michael Hanf
Contributors: Maria Akerman, Sajad Ashouri, Arash Hajikhani, 
Kalle Kantola, Tiina Koljonen, Sofi Kurki, Annu Markkula, 
Maaria Nuutinen, Hanna Pihkola, Antti-Jussi Tahvanainen, 
Nina Wessberg
For enquiries, please contact the author, Michael Hanf, 
at michael.hanf@vtt.fi
© VTT Technical Research Centre of Finland, 2025
----------------------------------------

块 3:
The future of sustainability: 
Navigating trends and innovations  
for a sustainable tomorrow
0/ Executive summary  . . . . . . . . . . . . . . . . . . . . . . . 5
Methodology & approach . . . . . . . . . . . . . . . . . . . . . .

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

# 加载预训练的嵌入模型
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 为每个文本块生成嵌入
embeddings = embedding_model.encode(text_lines)

# 检查生成的嵌入维度和部分结果
print("嵌入的形状:", np.array(embeddings).shape)
print("第一条嵌入示例:", embeddings[0][:10])  # 打印第一条嵌入的前10个数值



嵌入的形状: (387, 384)
第一条嵌入示例: [ 0.02845221  0.02056801  0.05199016 -0.03178968  0.04272494  0.01241189
 -0.10353621 -0.0057232  -0.05912128  0.01031158]


In [3]:
from pymilvus import connections

connections.connect(host="localhost", port="19530")
print("成功连接到 Milvus!")


成功连接到 Milvus!


In [4]:
from sentence_transformers import SentenceTransformer
from pymilvus import MilvusClient
import numpy as np

# 1. 加载轻量级高性能嵌入模型
embedding_model = SentenceTransformer("intfloat/e5-small-v2")  # 非常适合本地 CPU 环境

# 2. 定义嵌入函数（注意加前缀 "passage:"，这是 e5 模型推荐用法）
def emb_text(text):
    return embedding_model.encode([f"passage: {text}"])[0]

# 3. 连接 Milvus
milvus_client = MilvusClient(uri="http://localhost:19530")
collection_name = "rag_collection"

# 4. 删除旧集合（如果存在）
if collection_name in milvus_client.list_collections():
    milvus_client.drop_collection(collection_name)
    print(f"⚠️ 旧集合 {collection_name} 已删除")

# 5. 创建新集合（e5-small-v2 是 384维）
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=384,
    metric_type="IP",
    consistency_level="Strong"
)
print(f"✅ 创建集合 {collection_name} 成功")



⚠️ 旧集合 rag_collection 已删除
✅ 创建集合 rag_collection 成功


In [5]:
# 6. 生成新嵌入（推荐带进度条）
from tqdm import tqdm
data = []
for i, line in enumerate(tqdm(text_lines, desc="⏳ 生成嵌入")):
    vector = emb_text(line)
    data.append({"id": i, "vector": vector, "text": line})

# 7. 插入向量数据
milvus_client.insert(collection_name=collection_name, data=data)
print(f"✅ 向量插入完成，共 {len(data)} 条")


⏳ 生成嵌入: 100%|██████████| 387/387 [04:16<00:00,  1.51it/s]


✅ 向量插入完成，共 387 条


In [16]:
# 8. 检索函数（top_k 可调）
def retrieve_context(query, top_k=15):
    query_vector = embedding_model.encode([f"query: {query}"])[0]
    search_res = milvus_client.search(
        collection_name=collection_name,
        data=[query_vector],
        limit=top_k,
        search_params={"metric_type": "IP", "params": {}},
        output_fields=["text"]
    )
    return "\n".join([hit["entity"]["text"] for hit in search_res[0]])

# 9. 调试型问答函数（打印中间过程）
def answer_question_debug(question):
    context = retrieve_context(question, top_k=10)
    print("\n🔍 [Context 检索结果前500字]:\n", context[:500], "\n...")
    prompt = build_prompt(context, question)
    print("\n📜 [Prompt 构造]:\n", prompt)
    answer = generate_answer(prompt, tokenizer, model)
    print("\n🤖 [最终回答]:\n", answer)
    return answer


In [17]:
PROMPT = """
You are a helpful assistant with expert knowledge in sustainability. Your task is to answer the user's question using **only** the factual content provided in the context.

Avoid repeating the title or general phrases — extract meaningful, structured information when available.

<context>
{context}
</context>

<question>
{question}
</question>

Instructions:
- Answer clearly and concisely.
- Use **bullet points or numbered lists** when listing multiple facts or concepts.
- Do **not** add any information not found in the context.
- If the answer is **not** in the context, say: "Not found in the provided context."
"""


def build_prompt(context, question):
    return PROMPT.format(context=context, question=question)

In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-large"  # 可升级为 flan-t5-large
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_answer(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(
        **inputs,
        max_length=512,
        do_sample=False,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

model.safetensors:  61%|######    | 1.90G/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
answer_question_debug("What is the purpose of the “Future of Sustainability” report?")



🔍 [Context 检索结果前500字]:
 The future of sustainability: Navigating trends and innovations for a sustainable tomorrow
6
 
Key findings & strategic 
recommendations 
The future of sustainability  report offers a detailed, forward-looking analysis 
that equips stakeholders with the knowledge and tools needed to navigate the 
complexities of the modern world. By proactively embracing these trends and 
recommendations, organisations can drive positive change and ensure long-
term success in a rapidly evolving sustainability l 
...

📜 [Prompt 构造]:
 
You are a helpful assistant with expert knowledge in sustainability. Your task is to answer the user's question using **only** the factual content provided in the context.

Avoid repeating the title or general phrases — extract meaningful, structured information when available.

<context>
The future of sustainability: Navigating trends and innovations for a sustainable tomorrow
6
 
Key findings & strategic 
recommendations 
The future of sustainab

'The Future of sustainability report provides a comprehensive analysis of the evolving landscape of sustainability. The study aims to forecast key trends and innovations that will shape the future, offering actionable insights and strategic recommendations for businesses, policymakers, and researchers'

In [12]:
answer_question_debug("How many sustainability trends are identified in the report?")



🔍 [Context 检索结果前500字]:
 The future of sustainability: Navigating trends and innovations for a sustainable tomorrow
5
 
0 /  Executive summary
The Future of sustainability report  provides a comprehensive analysis of the 
evolving landscape of sustainability. The study aims to forecast key trends and 
innovations that will shape the future, offering actionable insights and strategic 
recommendations for businesses, policymakers, and researchers.
Methodology & approach
Trend analysis:  The report identifies 87 sustainabi 
...

📜 [Prompt 构造]:
 
Use the information enclosed in <context> tags to answer the user's question.
Only use factual data found in the context — do not make up anything.

<context>
The future of sustainability: Navigating trends and innovations for a sustainable tomorrow
5
 
0 /  Executive summary
The Future of sustainability report  provides a comprehensive analysis of the 
evolving landscape of sustainability. The study aims to forecast key trends and 
innovations th

'Sustainable urbanisation and resilient infrastructure examines the development of climate-resilient cities, smart infrastructure, and sustainable urban planning to meet growing popu - lation needs'

In [None]:
answer_question_debug("What are the four key dimensions used in the trend radar?")


In [None]:
answer_question_debug("Name the five sustainability megatrends mentioned in the report.")


In [None]:
answer_question_debug("What role does green finance play in sustainability?")
