In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load PDF file
loader = PyPDFLoader("WHITEPAPER_Future_of_Sustainability_2025.pdf")
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1800, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

# Extract the text content of each block
text_lines = [chunk.page_content for chunk in chunks]

# Print
for i, text in enumerate(text_lines[:3]):
    print(f"chunk {i+1}:\n{text}\n{'-'*40}\n")

chunk 1:
The future of 
sustainability
Navigating trends 
and innovations for 
a sustainable tomorrow
FEBRUARY 2025 | MICHAEL HANF, LEAD SUSTAINABLE BUSINESS, VTT
----------------------------------------

chunk 2:
Michael Hanf (2025), The future of sustainability - Navigating 
trends and innovations for a sustainable tomorrow, 
VTT Technical Research Centre of Finland, Espoo, Finland.
Author: Michael Hanf
Contributors: Maria Akerman, Sajad Ashouri, Arash Hajikhani, 
Kalle Kantola, Tiina Koljonen, Sofi Kurki, Annu Markkula, 
Maaria Nuutinen, Hanna Pihkola, Antti-Jussi Tahvanainen, 
Nina Wessberg
For enquiries, please contact the author, Michael Hanf, 
at michael.hanf@vtt.fi
© VTT Technical Research Centre of Finland, 2025
----------------------------------------

chunk 3:
The future of sustainability: 
Navigating trends and innovations  
for a sustainable tomorrow
0/ Executive summary  . . . . . . . . . . . . . . . . . . . . . . . 5
Methodology & approach . . . . . . . . . . . . . . . .

In [16]:
from sentence_transformers import SentenceTransformer
from pymilvus import MilvusClient
import numpy as np
from tqdm import tqdm

# Embedding model
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Define Embedding model
def emb_text(text):
    return embedding_model.encode([f"passage: {text}"])[0]

# Get the dimension
vector_dim = emb_text("test input").shape[0]
print(f"The current dimension of embedding: {vector_dim}")

# Connect to Milvus and deal with collection
milvus_client = MilvusClient(uri="http://localhost:19530")
collection_name = "rag_collection"

if collection_name in milvus_client.list_collections():
    milvus_client.drop_collection(collection_name)
    print(f"Delate the old collection {collection_name}")

milvus_client.create_collection(
    collection_name=collection_name,
    dimension=vector_dim,
    metric_type="IP",
    consistency_level="Strong"
)
print(f"Create a new collection {collection_name} successfully")

The current dimension of embedding: 768
Delate the old collection rag_collection
Create a new collection rag_collection successfully


In [17]:
# Generate vectors and insert 
data = []
for i, line in enumerate(tqdm(text_lines, desc="⏳ Generate embeddings")):
    vector = emb_text(line)
    data.append({"id": i, "vector": vector, "text": line})

milvus_client.insert(collection_name=collection_name, data=data)
print(f"The vector insertion is complete. There are {len(data)}.")


⏳ Generate embeddings: 100%|██████████| 215/215 [01:54<00:00,  1.88it/s]


The vector insertion is complete. There are 215.


In [60]:
# Retrieve context function
def retrieve_context(query, top_k=15):
    query_vector = embedding_model.encode([f"query: {query}"])[0]
    search_res = milvus_client.search(
        collection_name=collection_name,
        data=[query_vector],
        limit=top_k,
        search_params={"metric_type": "IP", "params": {}},
        output_fields=["text"]
    )
    return "\n".join([hit["entity"]["text"] for hit in search_res[0]])


PROMPT = """
You are a helpful assistant with expert knowledge in sustainability. Your task is to answer the user's question using **only** the factual content provided in the context.

Avoid repeating the title or general phrases — extract meaningful, structured information when available.

<context>
{context}
</context>

<question>
{question}
</question>

Instructions:
- Answer clearly and concisely.
- Use **bullet points or numbered lists** when listing multiple facts or concepts.
- Do **not** add any information not found in the context.
- If the answer is **not** in the context, say: "Not found in the provided context."

"""

def build_prompt(context, question):
    return PROMPT.format(context=context, question=question)


In [None]:
PROMPT = """
### ROLE ###
You are a sustainability expert AI assistant that ONLY responds based on verified information.

### INSTRUCTION STEPS ###
1. STRICTLY ANALYZE the user's question type:
   - Factual question (what/where/when/who)
   - Explanatory question (why/how)
   - Significance question (importance/impact)

2. CONTEXT PROCESSING:
   Read the provided context EXACTLY as written. 
   Identify SPECIFIC sentences that DIRECTLY relate to the question.

3. RESPONSE GENERATION RULES:
   [REQUIRED] Answer MUST be grounded in explicit context statements
   [REQUIRED] For "why" questions: 
       - Only explain reasons EXPLICITLY stated in context
       - If no causation mentioned, use predefined response
   [REQUIRED] When information is:
       a) Fully available → Concise 1-3 sentence answer
       b) Partially related → "I cannot find..." response
       c) Missing → "I cannot find..." response
   [PROHIBITED] Never:
       - Assume unstated connections
       - Combine information from different sections
       - Use examples not in context

### FORMAT CONTROL ###
- No markdown of any kind
- Avoid transitional phrases ("However", "Additionally")
- Use bullet points ONLY when listing explicit items from context

### CONTEXT ###
{context}

### QUESTION ###
{question}

### SAFETY PROTOCOL ###
If uncertain about ANY part of the response, immediately fallback to: 
"I cannot find relevant information in the provided documents."

### RESPONSE ###
"""

def build_prompt(context: str, question: str) -> str:
    sanitized_context = " ".join(context.strip().splitlines())
    sanitized_question = question.strip().replace("\n", " ")
    
    max_context_length = 3000
    truncated_context = (sanitized_context[:max_context_length] + "...") if len(sanitized_context) > max_context_length else sanitized_context
    
    return PROMPT.format(context=truncated_context, question=sanitized_question)

In [55]:
# Generate answers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-large" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_answer(prompt, tokenizer, model):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(
        **inputs,
        max_length=512,
        do_sample=False,        
        num_beams=4,            
        early_stopping=True   
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Question and answer
def answer_question_debug(question):
    context = retrieve_context(question, top_k=10)
    print("\n🔍 [Context]:\n", context[:500], "\n...")
    prompt = build_prompt(context, question)
    print("\n📜 [Prompt]:\n", prompt)
    answer = generate_answer(prompt, tokenizer, model)
    print("\n🤖 [Answer]:\n", answer)
    return answer

#### Answer questions

In [64]:
answer_question_debug("What is the role of artificial intelligence in sustainability?")


🔍 [Context]:
 547-566. doi: 10.1108/AAAJ-04-2021-5233.
UNEP (2022). How artificial intelligence is helping tackle 
environmental challenges. United Nations Environ -
mental Programme website, accessed 9.1.2025.
Sustainability megatrends 
Adger WN, Jordan A, eds. (2009). Governing Sustaina -
bility. Cambridge University Press; 2009.
Aminetzah D., Katz J., Mannion P . (2020). Feeding the 
world sustainably. McKinsey Quarterly, June 2020.
Araújo, O.Q.F ., de Medeiros, J.L. (2022). Sustainable 
and equitable deca 
...

📜 [Prompt]:
 
### ROLE ###
You are a sustainability expert AI assistant that ONLY responds based on verified information.

### INSTRUCTION STEPS ###
1. STRICTLY ANALYZE the user's question type:
   - Factual question (what/where/when/who)
   - Explanatory question (why/how)
   - Significance question (importance/impact)

2. CONTEXT PROCESSING:
   Read the provided context EXACTLY as written. 
   Identify SPECIFIC sentences that DIRECTLY relate to the question.

3. RESPONSE 

'How artificial intelligence is helping tackle environmental challenges. United Nations Environ - mental Programme website, accessed 9.1.2025. Sustainability megatrends'

In [None]:
answer_question_debug("What is the purpose of the “Future of Sustainability” report?")



🔍 [Context]:
 The future of sustainability: Navigating trends and innovations for a sustainable tomorrow
10
 
The conclusion  synthesizes the report’s findings, highlighting the intercon -
nected nature of trends and megatrends while offering forward-looking per -
spectives for businesses, policymakers, and organisations. It underscores the 
need for collaboration, innovation, and systemic change to address sustaina -
bility challenges and seize new opportunities.
Finally, the appendices  provide supporting i 
...

📜 [Prompt]:
 
You are a helpful assistant with expert knowledge in sustainability. Your task is to answer the user's question using **only** the factual content provided in the context.

Avoid repeating the title or general phrases — extract meaningful, structured information when available.

<context>
The future of sustainability: Navigating trends and innovations for a sustainable tomorrow
10
 
The conclusion  synthesizes the report’s findings, highlighting the intercon -

'European perspective that shapes much of this analysis, the study focuses on insights relevant to regions with advanced economies and established regulatory frameworks'

In [66]:
answer_question_debug("What are some examples of green finance mentioned in the report?")


🔍 [Context]:
 resources.54
3. Financial systems for sustainable growth:  The transformation of finan -
cial markets is central to this new model. Sustainable finance (1.1.4) and 
the rise of green finance (1.2.1) are encouraging investors to prioritise ESG 
performance, pushing capital towards sustainable industries and projects. 
Green bonds, impact investing, and sustainability-linked loans are exam -
ples of how finance is evolving to support businesses that contribute to cli-
mate resilience, social inclu 
...

📜 [Prompt]:
 
### ROLE ###
You are a sustainability expert AI assistant that ONLY responds based on verified information.

### INSTRUCTION STEPS ###
1. STRICTLY ANALYZE the user's question type:
   - Factual question (what/where/when/who)
   - Explanatory question (why/how)
   - Significance question (importance/impact)

2. CONTEXT PROCESSING:
   Read the provided context EXACTLY as written. 
   Identify SPECIFIC sentences that DIRECTLY relate to the question.

3. RESPONSE 

'Green bonds, impact investing, and sustainability-linked loans are exam - ples of how finance is evolving to support businesses that contribute to cli- mate resilience, social inclusion, and environmental restoration.'