In [6]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the PDF (make sure the file path is correct for the notebook's working directory).
loader = PyPDFLoader("WHITEPAPER_Future_of_Sustainability_2025.pdf")
docs = loader.load()

# Split the loaded documents using chunk_size=1000 and chunk_overlap=200.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(docs)

# Collect the text content of every chunk, in chunk order.
text_lines = []
for chunk in chunks:
    text_lines.append(chunk.page_content)

# Print the first few chunks as a sanity check of the split.
preview = text_lines[:3]
for i, text in enumerate(preview):
    print(f"块 {i+1}:\n{text}\n{'-'*40}\n")



块 1:
The future of 
sustainability
Navigating trends 
and innovations for 
a sustainable tomorrow
FEBRUARY 2025 | MICHAEL HANF, LEAD SUSTAINABLE BUSINESS, VTT
----------------------------------------

块 2:
Michael Hanf (2025), The future of sustainability - Navigating 
trends and innovations for a sustainable tomorrow, 
VTT Technical Research Centre of Finland, Espoo, Finland.
Author: Michael Hanf
Contributors: Maria Akerman, Sajad Ashouri, Arash Hajikhani, 
Kalle Kantola, Tiina Koljonen, Sofi Kurki, Annu Markkula, 
Maaria Nuutinen, Hanna Pihkola, Antti-Jussi Tahvanainen, 
Nina Wessberg
For enquiries, please contact the author, Michael Hanf, 
at michael.hanf@vtt.fi
© VTT Technical Research Centre of Finland, 2025
----------------------------------------

块 3:
The future of sustainability: 
Navigating trends and innovations  
for a sustainable tomorrow
0/ Executive summary  . . . . . . . . . . . . . . . . . . . . . . . 5
Methodology & approach . . . . . . . . . . . . . . . . . . . . . .

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the pretrained sentence-embedding model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Produce one embedding per text chunk.
embeddings = embedding_model.encode(text_lines)

# Inspect the overall shape and the first 10 values of the first embedding.
embeddings_shape = np.array(embeddings).shape
first_embedding_head = embeddings[0][:10]
print("嵌入的形状:", embeddings_shape)
print("第一条嵌入示例:", first_embedding_head)



嵌入的形状: (387, 384)
第一条嵌入示例: [ 0.02845221  0.02056801  0.05199016 -0.03178968  0.04272494  0.01241189
 -0.10353621 -0.0057232  -0.05912128  0.01031158]


In [19]:
from pymilvus import connections

# Open a pymilvus connection to the Milvus server at localhost:19530.
milvus_host, milvus_port = "localhost", "19530"
connections.connect(host=milvus_host, port=milvus_port)
print("成功连接到 Milvus!")


成功连接到 Milvus!


In [None]:
from pymilvus import MilvusClient

# Initialise the client (MilvusClient is used here instead of the Collection API).
milvus_client = MilvusClient(uri="http://localhost:19530")

collection_name = "rag_collection"

# Read the vector size from the embedding model instead of hard-coding it, so the
# collection cannot silently disagree with the model if the checkpoint is changed.
embedding_dim = embedding_model.get_sentence_embedding_dimension()

# Drop the old collection (if it exists) so that re-running this cell starts clean.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)
    print(f"⚠️ 旧集合 {collection_name} 已删除")

# Create the new collection.
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,  # embedding vector dimension
    metric_type="IP",  # inner product; the search in retrieve_context uses the same metric
    consistency_level="Strong",
)
print(f"✅ 创建集合 {collection_name} 成功")


In [None]:
def emb_text(text):
    """Return the embedding of a single string.

    The string is wrapped in a one-element list for ``embedding_model.encode``
    and the first (only) entry of the result is returned.
    """
    encoded_batch = embedding_model.encode([text])
    return encoded_batch[0]

# Embed every chunk in one batched encode() call rather than one model call per chunk.
chunk_vectors = embedding_model.encode(text_lines)

# Build the rows to insert: the chunk's position in text_lines is its "id",
# stored next to its vector and its raw text.
data = [
    {"id": i, "vector": vector, "text": line}
    for i, (line, vector) in enumerate(zip(text_lines, chunk_vectors))
]

# Insert the rows into the collection.
insert_res = milvus_client.insert(
    collection_name=collection_name,
    data=data,
)
print("✅ 插入完成，数量：", len(data))

In [None]:
def retrieve_context(query, top_k=10):
    """Fetch the stored text of the best-matching chunks for ``query``.

    The query is embedded with ``emb_text`` and searched against
    ``collection_name`` using the inner-product ("IP") metric, asking for at
    most ``top_k`` hits and their "text" field.

    Returns:
        The "text" values of the hits for the single query vector, in the
        order the search returned them, joined with newlines.
    """
    query_embedding = emb_text(query)
    search_kwargs = {
        "collection_name": collection_name,
        "data": [query_embedding],
        "limit": top_k,
        "search_params": {"metric_type": "IP", "params": {}},
        "output_fields": ["text"],
    }
    # One query vector was sent, so only the first result list is relevant.
    hits = milvus_client.search(**search_kwargs)[0]

    passages = []
    for hit in hits:
        passages.append(hit["entity"]["text"])
    return "\n".join(passages)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint name used for the answer-generation model.
model_name = "google/flan-t5-base"

# Load the tokenizer and the seq2seq model from the same checkpoint name;
# both are later passed to generate_answer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Prompt template with two placeholders, {context} and {question}.
# The context block comes first and the question follows it.
PROMPT = """
Use the information enclosed in <context> tags to answer the user's question.
Only use factual data found in the context — do not make up anything.

<context>
{context}
</context>

<question>
{question}
</question>

Answer with a clear list of bullet points if appropriate. Respond "Not found in the provided context" if the context doesn't contain the answer.
"""


def build_prompt(context, question):
    """Fill the ``PROMPT`` template with the given context and question.

    Args:
        context: Text placed between the <context> tags.
        question: Text placed between the <question> tags.

    Returns:
        The formatted prompt string.
    """
    fields = {"context": context, "question": question}
    return PROMPT.format(**fields)

In [None]:
def generate_answer(prompt, tokenizer, model, max_length=256, temperature=0.7,
                    top_p=0.9, do_sample=True):
    """Generate a text answer for ``prompt`` with a seq2seq model.

    The prompt is tokenized with ``truncation=True``, so anything beyond the
    tokenizer's ``model_max_length`` is cut from the END of the prompt. A
    warning is emitted when that happens, because the truncated tail can
    contain the part of the prompt that matters most (for example the
    question that follows a long context).

    Args:
        prompt: Full prompt text.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Model object providing ``generate``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature; only forwarded when ``do_sample``.
        top_p: Nucleus-sampling threshold; only forwarded when ``do_sample``.
        do_sample: Whether to sample instead of decoding deterministically.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    import warnings

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    # Make silent truncation visible: compare the untruncated token count with
    # the tokenizer's limit (skipped when the tokenizer does not expose one).
    limit = getattr(tokenizer, "model_max_length", None)
    if limit is not None:
        prompt_tokens = len(tokenizer(prompt, truncation=False)["input_ids"])
        if prompt_tokens > limit:
            warnings.warn(
                f"Prompt is {prompt_tokens} tokens but the tokenizer limit is {limit}; "
                "the end of the prompt was truncated before generation.",
                stacklevel=2,
            )

    generation_kwargs = {"max_length": max_length, "do_sample": do_sample}
    if do_sample:
        # Sampling-only settings are passed only when sampling is enabled.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    outputs = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def answer_question(question):
    """Answer ``question`` from the indexed document.

    Retrieves context with ``retrieve_context``, fills the prompt template via
    ``build_prompt`` and returns the text produced by ``generate_answer`` using
    the notebook-level ``tokenizer`` and ``model``.
    """
    retrieved = retrieve_context(question)
    filled_prompt = build_prompt(retrieved, question)
    return generate_answer(filled_prompt, tokenizer, model)

In [None]:
# End-to-end demo: retrieve the context, build the prompt, then generate the answer,
# printing each intermediate result.
question = "What is the 2030 emission reduction target mentioned in the decarbonisation section?"

context = retrieve_context(question)
prompt = build_prompt(context, question)

print("📚 Context:\n", context)
print("🧾 Prompt:\n", prompt)

answer = generate_answer(prompt, tokenizer, model)
print("🤖 Answer:", answer)