In [5]:
from dotenv import load_dotenv
import os

In [6]:
load_dotenv("../.env")

True

In [7]:
KEY = os.getenv("KEY")

### First part: Load the document(s) and split

In [144]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

In [145]:
loader = DirectoryLoader('./data/', loader_cls=PyPDFLoader, show_progress=True) 
pages = loader.load_and_split()

100%|██████████| 5/5 [21:09<00:00, 253.81s/it]


In [147]:
pg_no = 0
print(f"Content:\n {pages[pg_no].page_content[:200]} \nmetadata:\n {pages[pg_no].metadata}")
pg_no = -1
print(f"Content:\n {pages[pg_no].page_content[:200]} \nmetadata:\n {pages[pg_no].metadata}")

Content:
 1
CLIMATE CHANGE 2023
Synthesis Report
A Report of the Intergovernmental Panel on Climate Change 
metadata:
 {'source': 'data/IPCC_AR6_SYR_FullVolume.pdf', 'page': 0}
Content:
 and planetary health can be realised. Prospects for climate resilient development are increased by inclusive processes involving local 
knowledge and Indigenous Knowledge as well as processes that coo 
metadata:
 {'source': 'data/IPCC_AR6_WGII_SummaryForPolicymakers.pdf', 'page': 32}


### Create the embeddings for the splitted doc

In [1]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [2]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persist_dir = "db"

In [174]:
db = Chroma.from_documents(pages, embedding_function, persist_directory=persist_dir)

In [151]:
db.persist()
db = None

In [3]:
db = Chroma(persist_directory=persist_dir, embedding_function=embedding_function)

In [5]:
query = "What are Observed Changes to Hazards and Extreme Events?"
docs = db.similarity_search(query)
for doc in docs:
    print(doc.metadata)

{'page': 1627, 'source': 'data/IPCC_AR6_WGI_FullReport.pdf'}
{'page': 1949, 'source': 'data/IPCC_AR6_WGII_FullReport.pdf'}
{'page': 1538, 'source': 'data/IPCC_AR6_WGI_FullReport.pdf'}
{'page': 219, 'source': 'data/IPCC_AR6_WGII_FullReport.pdf'}


### QA chain

In [4]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

In [8]:
llm_src = ChatOpenAI(temperature=0, model="gpt-4", openai_api_key=KEY)

In [9]:
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
The answer should be easy to be understood by anyone. Always consider the question to be asked related to climate change. Your source is IPCC reports.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}

In [10]:
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm_src,
    chain_type="stuff",
    retriever=db.as_retriever(
        search_kwargs={'k': 6}
    ),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [11]:
query = "How much CO2 a car amits per km?"
result = retrieval_qa({"query": query})

In [17]:
print(result["source_documents"][3].metadata)

{'page': 77, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}


In [213]:
sources = result["source_documents"]
for source in sources:
    print(source.metadata)

{'page': 1068, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
{'page': 59, 'source': 'data/IPCC_AR6_SYR_FullVolume.pdf'}
{'page': 243, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
{'page': 77, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
{'page': 319, 'source': 'data/IPCC_AR6_WGI_FullReport.pdf'}
{'page': 77, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}


In [185]:
docs = db.similarity_search(query, kwargs={'k':3})
docs2 = db.similarity_search_with_relevance_scores(query, k=10)
for doc in docs:
    print(doc.metadata)
for doc in docs2:
    print(doc)

{'page': 451, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
{'page': 754, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
{'page': 737, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
{'page': 653, 'source': 'data/IPCC_AR6_WGIII_FullReport.pdf'}
(Document(page_content='439\nMitigation and Development Pathways in the Near to Mid-term Chapter 44\nproject approvals and construction, and more stringent passive \nsafety measures, which increases the complexity of systems. After \nthe Fukushima Daiichi accident in Japan, nuclear programs in several \ncountries have been phased out or cancelled (Carrara 2020; Huenteler \net al. 2012; Kharecha and Sato 2019; Hoffman and Durlak 2018). \nAlso the compatibility of conventional prresurised water reactors and \nboiling water reactors with large proportion of renewable energy in \nthe grid it is yet to be fully understood.\nAccelerated mitigation scenarios offer contrasting views on the \nshare of nuclear in power generation. In the USA, (Victor e