In [1]:
"""
Susgen2024 for nus and A star.
Author: Xuan Wang
2024/05/26
"""

"""Get the PDF file names from a JSON file, then copy those files from the raw directory to the destination directory."""

import json
import os
import shutil

def get_pdf_list(json_file_path):
    """Return the PDF file names listed in a QA JSON file.

    Args:
        json_file_path: Path to a JSON file whose top level is an iterable of
            records. Dict records that carry a 'file' key name one PDF each.

    Returns:
        list: The 'file' value of every dict record that has one, in the
        order the records appear in the JSON file.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Only dict records can carry a 'file' key. Without the isinstance guard a
    # plain string containing "file" would pass the `in` test and then fail on
    # item['file'], and a number would fail on the `in` test itself.
    return [
        item['file']
        for item in data
        if isinstance(item, dict) and 'file' in item
    ]

def copy_pdfs_to_destination(pdf_list, source_dir, destination_dir):
    """Copy the named PDFs from source_dir into destination_dir.

    Files already present in the destination are left untouched. Files that
    are missing from the source are reported and skipped.

    Args:
        pdf_list: Iterable of PDF file names, relative to source_dir.
        source_dir: Directory holding the raw PDFs.
        destination_dir: Directory to copy into; created if it does not exist.

    Returns:
        list: Destination paths of the PDFs that are available in
        destination_dir after the call (already present or just copied).
        PDFs missing from source_dir are not included, so the result can be
        handed directly to a loader without pointing at nonexistent files.
    """
    dest_files = []

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(destination_dir, exist_ok=True)

    for pdf in pdf_list:
        src_path = os.path.join(source_dir, pdf)
        dest_path = os.path.join(destination_dir, pdf)

        if os.path.exists(dest_path):
            print(f"File {dest_path} already exsit.")
        elif os.path.exists(src_path):
            shutil.copy(src_path, dest_path)
            print(f"Copied {src_path} to {dest_path}")
        else:
            print(f"File {src_path} does not exist")
            # Nothing was copied, so do not report a destination path for it.
            continue

        dest_files.append(dest_path)

    return dest_files


# Locations used for staging: the QA metadata file, the raw PDF pool, and the
# working directory the rest of the notebook reads from.
json_file_path = "./jsons/qa_dict.json"
source_dir = "../raw_data/raw_pdf/"
destination_dir = "./pdfs/"

# Collect the PDF names referenced by the QA file, then stage them locally.
pdf_list = get_pdf_list(json_file_path)
dest_files = copy_pdfs_to_destination(
    pdf_list=pdf_list,
    source_dir=source_dir,
    destination_dir=destination_dir,
)

# Last expression of the cell: display the staged paths.
dest_files

File ./pdfs/Wolfspeed_2022_TCFD.pdf already exsit.
File ./pdfs/Siemens_2019_ESG.pdf already exsit.
File ./pdfs/Siemens_2020_ESG.pdf already exsit.
File ./pdfs/Prudential_2020_ESG.pdf already exsit.
File ./pdfs/Novartis_2020_ESG.pdf already exsit.
File ./pdfs/Novatek_2023_ESG.pdf already exsit.
File ./pdfs/HSBC_2022_TCFD.pdf already exsit.
File ./pdfs/Mizuho Financial Group_2020_TCFD.pdf already exsit.
File ./pdfs/NatWest_2020_TCFD.pdf already exsit.
File ./pdfs/CPP Investments_2020_ESG.pdf already exsit.
File ./pdfs/TD Bank_2020_TCFD.pdf already exsit.
File ./pdfs/TSMC_2022_TCFD.pdf already exsit.
File ./pdfs/Standard Chartered_2020_TCFD.pdf already exsit.
File ./pdfs/Nedbank_2020_TCFD.pdf already exsit.


['./pdfs/Wolfspeed_2022_TCFD.pdf',
 './pdfs/Siemens_2019_ESG.pdf',
 './pdfs/Siemens_2020_ESG.pdf',
 './pdfs/Prudential_2020_ESG.pdf',
 './pdfs/Novartis_2020_ESG.pdf',
 './pdfs/Novatek_2023_ESG.pdf',
 './pdfs/HSBC_2022_TCFD.pdf',
 './pdfs/Mizuho Financial Group_2020_TCFD.pdf',
 './pdfs/NatWest_2020_TCFD.pdf',
 './pdfs/CPP Investments_2020_ESG.pdf',
 './pdfs/TD Bank_2020_TCFD.pdf',
 './pdfs/TSMC_2022_TCFD.pdf',
 './pdfs/Standard Chartered_2020_TCFD.pdf',
 './pdfs/Nedbank_2020_TCFD.pdf']

In [None]:
import logging
import sys
import torch
from typing import List, Union, Tuple
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import Settings

# Prefer the GPU, but fall back to the CPU so tensor creation does not fail on
# machines where CUDA is unavailable.
torch.set_default_device('cuda' if torch.cuda.is_available() else 'cpu')

# Root log level: use logging.ERROR to show only errors, logging.INFO to show
# all logs. basicConfig already attaches a stdout handler when none is
# configured; adding a second StreamHandler on top of it would emit every
# record twice, so only one handler is installed here.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
# basicConfig does nothing if the root logger already has handlers, so the
# level is also set explicitly.
logging.getLogger().setLevel(logging.WARNING)
# Set the log level for the sentence_transformers package
logging.getLogger('sentence_transformers').setLevel(logging.ERROR)

In [None]:
def load_llm_embed(llm_args: dict, embed_path: str) -> Tuple[HuggingFaceLLM, HuggingFaceEmbeddings]:
    """Instantiate the generator LLM and the embedding model.

    Args:
        llm_args: Keyword arguments forwarded unchanged to HuggingFaceLLM.
        embed_path: Value passed as model_name to HuggingFaceEmbeddings.

    Returns:
        A (llm, embed_model) tuple, with the LLM constructed first.
    """
    return HuggingFaceLLM(**llm_args), HuggingFaceEmbeddings(model_name=embed_path)

In [None]:
def load_documents(file_paths: Union[str, List[str]]) -> List:
    """Load documents through SimpleDirectoryReader.

    Args:
        file_paths: A string is passed to the reader as input_dir; a list is
            passed as input_files.

    Returns:
        Whatever the reader's load_data() returns.

    Raises:
        ValueError: If file_paths is neither a string nor a list.
    """
    # Reject unsupported inputs up front, before any reader is constructed.
    if not isinstance(file_paths, (str, list)):
        raise ValueError("Invalid input. Please provide a string or list of strings.")

    # Pick the reader keyword that matches the input kind.
    reader_kwarg = "input_dir" if isinstance(file_paths, str) else "input_files"
    return SimpleDirectoryReader(**{reader_kwarg: file_paths}).load_data()

In [None]:
def rag_qa(index: VectorStoreIndex, query_str: str) -> object:
    """Run a single query against the index using its default query engine.

    Args:
        index: Index to query; a fresh query engine is created from it on
            every call via as_query_engine().
        query_str: The question or instruction to send to the query engine.

    Returns:
        The object returned by the query engine's query() call. This is the
        engine's response object rather than a plain string (the previous
        ``-> str`` annotation was inaccurate); pass it to print() to show the
        answer text, as the calling cells do.
    """
    query_engine = index.as_query_engine()
    return query_engine.query(query_str)

In [None]:
# Persona text supplied to the LLM as its system prompt.
system_prompt = (
    "You are a Q&A assistant in financial domain. "
    "Your goal is to answer questions as accurately as possible "
    "based on the instructions and context provided."
)

# Template that wraps the default prompts internal to llama-index.
# NOTE(review): these <|USER|>/<|ASSISTANT|> tags may not match the chat
# template of the selected checkpoint (Mistral / Llama 3) - confirm.
query_wrapper_prompt = "<|USER|>{query_str}<|ASSISTANT|>"

# Local checkpoint directories.
# NOTE(review): absolute, user-specific paths; consider a configurable root.
mistral_v2 = "/home/whatx/SusGen/ckpts/Mistral-7B-Instruct-v0.2-hf"
llama3 = "/home/whatx/SusGen/ckpts/Meta-Llama-3-8B-Instruct-hf"
embed_model_path = "/home/whatx/SusGen/ckpts/all-mpnet-base-v2"
llm_path = mistral_v2  # assign llama3 here to switch checkpoints

# Keyword arguments forwarded to HuggingFaceLLM by load_llm_embed.
# NOTE(review): context_window (5120) minus max_new_tokens (4096) leaves 1024
# tokens; confirm that is enough room for the prompt plus retrieved context.
llm_args = dict(
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    device_map="auto",
    context_window=5120,
    max_new_tokens=4096,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs={"torch_dtype": torch.float16},
    model_name=llm_path,
    tokenizer_name=llm_path,
)

# Build the LLM and the embedding model from the settings above.
llm, embed_model = load_llm_embed(llm_args, embed_model_path)

In [None]:
# Candidate documents: either a folder path (str) or a list of file paths.
# Assign dest_files instead to index every staged PDF.
docu_files = ["./pdfs/Wolfspeed_2022_TCFD.pdf"]

documents_path = docu_files
text_chunking = 1024

# Load the documents
documents = load_documents(documents_path)
# Report only the count; printing the whole list dumps the full text of every
# loaded document into the cell output.
print(f"Loaded {len(documents)} document object(s) from {documents_path}")

# Global llama-index settings. VectorStoreIndex picks these up on its own, so
# they are not passed to from_documents (it has no documented `settings`
# argument).
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = text_chunking
index = VectorStoreIndex.from_documents(documents)

# RAG_QA
# Adjacent string literals are used instead of backslash continuations inside
# one literal: the continuations embedded each following line's indentation
# (a long run of spaces) in the prompt text.
query_str = (
    "You are an expert in tcfd report, "
    "please read the provided PDF document and summarize the company's introduction into a single paragraph of approximately 200 words. "
    "Ensure the summary is concise, informative, and captures the essence of the company's identity and operations."
)
response = rag_qa(index, query_str)
print(response)

In [None]:
# RAG_QA: re-run the company-introduction query against the existing index.
# Adjacent string literals are used instead of backslash continuations inside
# one literal: the continuations embedded each following line's indentation
# (a long run of spaces) in the prompt text.
query_str = (
    "You are an expert in tcfd report, "
    "please read the provided PDF document and summarize the company's introduction into a single paragraph of approximately 200 words. "
    "Ensure the summary is concise, informative, and captures the essence of the company's identity and operations."
)
response = rag_qa(index, query_str)
print(response)

In [None]:
# Save the generated text into the JSON file, where each entry has the format {file: <***.pdf>, summary: <intro of the company>, content: <qa pair>}

import json

def add_summaries_to_json(json_file_path, summaries):
    """Attach company summaries to matching entries of a JSON file, in place.

    The file is read, every dict entry whose 'file' value is a key of
    `summaries` gets a 'summary' field set to the mapped text, and the whole
    structure is written back to the same path with 4-space indentation.

    Args:
        json_file_path: Path to a JSON file whose top level is an iterable of
            entries. The file is overwritten.
        summaries: Mapping from PDF file name to summary text.

    Returns:
        int: Number of entries that received a summary, so callers can detect
        a run in which nothing matched.
    """
    # Explicit encoding on both read and write so the round trip does not
    # depend on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    updated = 0
    for entry in data:
        # Non-dict entries have no 'file' key; leave them as they are.
        if not isinstance(entry, dict):
            continue
        file_name = entry.get('file')
        if file_name in summaries:
            entry['summary'] = summaries[file_name]
            updated += 1

    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

    print("Summaries added successfully.")
    return updated

json_file_path = './sum_qa_dict.json'  # path to the JSON file

# Company introductions keyed by PDF file name. Replace each None with the
# generated summary text. (The values were previously left blank after the
# colon, which is a syntax error and prevented the cell from running.)
summaries = {
    "Wolfspeed_2022_TCFD.pdf": None,
    "Siemens_2019_ESG.pdf": None,
}

# Only write entries that actually have text, so placeholders never end up in
# the JSON file.
ready_summaries = {name: text for name, text in summaries.items() if text}
missing_summaries = sorted(set(summaries) - set(ready_summaries))
if missing_summaries:
    print(f"No summary provided yet for: {missing_summaries}")

if ready_summaries:
    add_summaries_to_json(json_file_path, ready_summaries)


In [None]:
# query_str = "As a specialist in the TCFD framework, elaborate on how to refine an organization\u2019s TCFD report, specifically addressing Governance, Strategy, Risk Management, and Metrics & Targets to better capture climate-related risks and opportunities.Answer the following questions: \n1. Disclose Scope 1, Scope 2, and, if appropriate, Scope 3 greenhouse gas (GHG) emissions, and the related risks.\n"
# response = rag_qa(index, query_str)
# print(response)