In [1]:
import logging
import sys
import torch
from typing import List, Union, Tuple
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import Settings

# Make newly created torch tensors default to the CUDA device.
torch.set_default_device('cuda')

# Send log records to stdout so they appear in the notebook cell output.
# basicConfig() attaches a single stdout handler to the root logger. The
# previous version then added a second StreamHandler on the same stream,
# which printed every record twice; that duplicate handler is removed.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly. WARNING is the level that was effectively in
# force before; use logging.ERROR to show only errors, logging.INFO to show
# all logs.
logging.getLogger().setLevel(logging.WARNING)
# Only let ERROR and above through from the sentence_transformers package.
logging.getLogger('sentence_transformers').setLevel(logging.ERROR)

def load_llm_embed(llm_args: dict, embed_path: str) -> Tuple[HuggingFaceLLM, HuggingFaceEmbeddings]:
    """Instantiate the LLM and the embedding model.

    Args:
        llm_args: Keyword arguments unpacked into ``HuggingFaceLLM(...)``.
        embed_path: Value passed as ``model_name`` to ``HuggingFaceEmbeddings``.

    Returns:
        A ``(llm, embed_model)`` tuple, in that order.
    """
    # The LLM is constructed first, then the embedding model.
    return (
        HuggingFaceLLM(**llm_args),
        HuggingFaceEmbeddings(model_name=embed_path),
    )

def load_documents(file_paths: Union[str, List[str]]) -> List:
    """Read documents through ``SimpleDirectoryReader``.

    Args:
        file_paths: One of

            * a ``str`` naming an existing file: loaded as a one-item
              ``input_files`` list (previously such a string was always
              forwarded as ``input_dir``);
            * any other ``str``: treated as a directory and forwarded as
              ``input_dir``;
            * a ``list`` of file paths: forwarded as ``input_files``.

    Returns:
        Whatever ``SimpleDirectoryReader(...).load_data()`` returns.

    Raises:
        ValueError: If ``file_paths`` is neither a ``str`` nor a ``list``.
    """
    # Local import: the notebook's import cell does not bring ``os`` into scope.
    import os

    if isinstance(file_paths, str):
        if os.path.isfile(file_paths):
            # A single file given as a plain string.
            reader = SimpleDirectoryReader(input_files=[file_paths])
        else:
            reader = SimpleDirectoryReader(input_dir=file_paths)
    elif isinstance(file_paths, list):
        reader = SimpleDirectoryReader(input_files=file_paths)
    else:
        raise ValueError("Invalid input. Please provide a string or list of strings.")

    return reader.load_data()

def rag_qa(index: VectorStoreIndex, query_str: str) -> object:
    """Run one query against ``index`` using its default query engine.

    Args:
        index: Index to query; ``index.as_query_engine()`` is called with no
            arguments.
        query_str: The question passed to the query engine.

    Returns:
        The object returned by ``query_engine.query(query_str)``, unchanged.
        It is not a plain ``str`` (the earlier ``-> str`` annotation was
        inaccurate); the notebook obtains the answer text by printing it.
    """
    query_engine = index.as_query_engine()
    return query_engine.query(query_str)

In [2]:
# ---- Prompts ----------------------------------------------------------------
# System message handed to HuggingFaceLLM through llm_args["system_prompt"].
system_prompt = (
    "You are a Q&A assistant in financial domain. "
    "Your goal is to answer questions as accurately as possible "
    "based on the instructions and context provided."
)
# Wrapper applied around llama-index's internal prompts; {query_str} is the slot
# that gets filled in.
# NOTE(review): confirm the <|USER|>/<|ASSISTANT|> markers match the chat
# format expected by the checkpoint selected in llm_path.
query_wrapper_prompt = "<|USER|>{query_str}<|ASSISTANT|>"

# ---- Model checkpoints (local paths) ------------------------------------------
mistral_v2 = "/home/whatx/SusGen/ckpts/Mistral-7B-Instruct-v0.2-hf"
llama3 = "/home/whatx/SusGen/ckpts/Meta-Llama-3-8B-Instruct-hf"
llm_path = mistral_v2  # switch to llama3 to use the other checkpoint
embed_model_path = "/home/whatx/SusGen/ckpts/all-mpnet-base-v2"

# ---- Source documents ---------------------------------------------------------
# Either a folder path (str) or an explicit list of files can be selected.
docu_files = ["/home/whatx/SusGen/data/raw_data/raw_pdf/3M_2020_ESG.pdf"]
docu_folder = "RAG/data"
documents_path = docu_files
# Later assigned to Settings.chunk_size.
text_chunking = 1024

# ---- LLM construction arguments -----------------------------------------------
# NOTE(review): max_new_tokens (4096) is most of context_window (5120);
# presumably this leaves little room for retrieved context - TODO confirm.
llm_args = dict(
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    device_map="auto",
    context_window=5120,
    max_new_tokens=4096,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16},
    model_name=llm_path,
    tokenizer_name=llm_path,
)

# Instantiate the LLM and the embedding model from the settings above.
llm, embed_model = load_llm_embed(llm_args, embed_model_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [8]:
# Load the documents selected in the configuration cell.
documents = load_documents(documents_path)

# Register the models and chunk size on llama-index's global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = text_chunking

# Build the vector index. The former ``settings=Settings`` keyword was dropped:
# from_documents() has no such parameter, and the global Settings configured
# just above is what takes effect.
index = VectorStoreIndex.from_documents(documents)

In [5]:
# Query the index and print the answer.
# Alternative question kept for reference:
#   "What is the revenue of the company in 2020?"
query_str = (
    "You are an expert in tcfd report, please describe the "
    "climate-related risks and opportunities of this company."
)
response = rag_qa(index, query_str)
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Based on the context provided in the 3M 2020 Sustainability Report, the company identifies several climate-related risks and opportunities.

Risks:
1. Severe weather and long-term trends could impact 3M's operations and supply chain. The company addresses these risks through a formal Enterprise Risk Management Analysis, a Business Continuity and Planning Process, and a Crisis Management Program.
2. Natural disasters, infectious diseases, employee safety, protection of assets, customer service, and other business continuity requirements are also considered in their risk management plans.
3. Climate change and energy conservation present challenges that could affect the company's operations and the communities it serves.

Opportunities:
1. 3M is a diversified technology company providing innovative solutions to address climate change and energy conservation.
2. Access to dependable energy supplies and energy efficiency are crucial for all businesses and communities, and 3M's operations b

In [9]:
# TCFD refinement prompt followed by one numbered question about GHG emissions.
# The literal is only split across lines; its value is unchanged (including the
# absence of a space between "opportunities." and "Answer").
query_str = (
    "As a specialist in the TCFD framework, elaborate on how to refine an "
    "organization\u2019s TCFD report, specifically addressing Governance, "
    "Strategy, Risk Management, and Metrics & Targets to better capture "
    "climate-related risks and opportunities."
    "Answer the following questions: \n"
    "1. Disclose Scope 1, Scope 2, and, if appropriate, Scope 3 greenhouse "
    "gas (GHG) emissions, and the related risks.\n"
)
response = rag_qa(index, query_str)
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


To refine an organization's TCFD report, it is essential to provide a detailed and transparent disclosure of Scope 1, Scope 2, and, if appropriate, Scope 3 greenhouse gas (GHG) emissions and the related risks. The organization should also establish clear and comparable metrics and targets related to climate-related risks and opportunities.

Regarding governance, the organization should describe its governance structure and processes related to climate risk management. This includes information on the board of directors' role in overseeing climate risk, the existence of a climate risk management policy, and the responsibilities of key personnel. The organization should also disclose how it integrates climate risk into its decision-making processes, such as through the use of scenario analysis or stress testing.

In terms of risk management, the organization should outline its climate-related risks and opportunities and how they are managed. Climate-related risks can include physical ris