# Setup

In [2]:
import os
from dotenv import load_dotenv

In [3]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail early with an actionable message: assigning the `None` that os.getenv
# returns for a missing variable to os.environ raises an opaque TypeError.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in your .env file or shell environment."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Data Loading

In [4]:
from langchain.document_loaders import PyPDFLoader

# Load the source PDF. `load()` returns a list of Document objects (one per
# page in this run), each with page-level metadata and the extracted text in
# `page_content`.
file_path = "./data/SDG.pdf"
loader = PyPDFLoader(file_path)
data = loader.load()
# Display the loaded pages; note that the first two pages have empty text.
data

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2017-12-26T16:03:25-05:00', 'moddate': '2017-12-27T15:29:10-05:00', 'trapped': '/False', 'source': './data/SDG.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2017-12-26T16:03:25-05:00', 'moddate': '2017-12-27T15:29:10-05:00', 'trapped': '/False', 'source': './data/SDG.pdf', 'total_pages': 24, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign CC 13.0 (Macintosh)', 'creationdate': '2017-12-26T16:03:25-05:00', 'moddate': '2017-12-27T15:29:10-05:00', 'trapped': '/False', 'source': './data/SDG.pdf', 'total_pages': 24, 'page': 2, 'page_label': '3'}, page_content='IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD \nCAME TOGETHER TO 

In [5]:
# Number of Document objects returned by the loader.
len(data)

24

# Data Extraction

In [6]:
# Concatenate the text of every page, newline-terminated, into one string.
# `str.join` builds the result in a single pass instead of re-copying the
# accumulated text on every `+=`.
qustion_gen = "".join(page.page_content + "\n" for page in data)

# Preview only; the full text is still available in `qustion_gen`.
print(qustion_gen[:1000])  # Print the first 1000 characters to check the content



IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD 
CAME TOGETHER TO FACE THE FUTURE.
And what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. 
Not just in some faraway place, but in their own cities and towns and villages.
They knew things didn’t have to be this way. They knew we had enough 
food to feed the world, but that it wasn’t getting shared. They knew there 
were medicines for HIV and other diseases, but they cost a lot. They knew 
that earthquakes and floods were inevitable, but that the high death 
tolls were not. 
They also knew that billions of people worldwide shared their hope for a 
better future.
So leaders from these countries created a plan called the Sustainable 
Development Goals (SDGs). This set of 17 goals imagines a future just 15 years 
off that would be rid of poverty and hunger, and safe from the worst effects of 
climate change. It’s an ambitious plan. 
But there’s ample evidence that we can succeed. In the past 15 years, the 
inter

# Splitting Data into Chunks

In [7]:
from langchain.text_splitter import TokenTextSplitter

In [10]:
# Token-based splitter for the question-generation pass: large chunks
# (10,000 tokens with a 200-token overlap) so the text stays in as few
# pieces as possible.
# NOTE(review): `model_name` presumably selects the tokenizer used to count
# tokens — confirm against the TokenTextSplitter documentation.
splitter_ques_gen = TokenTextSplitter(
    model_name="gpt-4o-mini",
    chunk_size=10000,
    chunk_overlap=200,
)

In [11]:
# Split the concatenated PDF text into a list of plain-string chunks.
chunks_ques_gen = splitter_ques_gen.split_text(qustion_gen)

In [12]:
# Inspect the resulting chunk strings.
chunks_ques_gen

['\n\nIN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD \nCAME TOGETHER TO FACE THE FUTURE.\nAnd what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. \nNot just in some faraway place, but in their own cities and towns and villages.\nThey knew things didn’t have to be this way. They knew we had enough \nfood to feed the world, but that it wasn’t getting shared. They knew there \nwere medicines for HIV and other diseases, but they cost a lot. They knew \nthat earthquakes and floods were inevitable, but that the high death \ntolls were not. \nThey also knew that billions of people worldwide shared their hope for a \nbetter future.\nSo leaders from these countries created a plan called the Sustainable \nDevelopment Goals (SDGs). This set of 17 goals imagines a future just 15 years \noff that would be rid of poverty and hunger, and safe from the worst effects of \nclimate change. It’s an ambitious plan. \nBut there’s ample evidence that we can succeed. In the past 1

In [13]:
# How many chunks the large-chunk splitter produced.
len(chunks_ques_gen)

1

In [14]:
from langchain.docstore.document import Document

# Wrap every raw text chunk in a Document so it can be passed to
# document-based splitters and chains.
documents_ques_gen = [
    Document(page_content=chunk_text)
    for chunk_text in chunks_ques_gen
]

In [15]:
# Sanity check: confirm the list holds Document objects.
type(documents_ques_gen[0])

langchain_core.documents.base.Document

In [16]:
# Token-based splitter for the answer-generation pass: much smaller chunks
# (1,000 tokens with a 100-token overlap) than the question-generation splitter.
splitter_ans_gen = TokenTextSplitter(
    model_name="gpt-4o-mini",
    chunk_size=1000,
    chunk_overlap=100,
)

In [17]:
# Re-split the question-generation documents into the smaller documents that
# are later embedded and indexed for retrieval.
documents_ans_gen = splitter_ans_gen.split_documents(documents_ques_gen)
# Number of small documents produced.
len(documents_ans_gen)

4

# Question Generation

In [27]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the interview questions. Requests go to the
# OpenAI-compatible endpoint given in `base_url` rather than the default
# OpenAI API.
# NOTE(review): presumably authenticated with the OPENAI_API_KEY set in the
# Setup cell — confirm the token is valid for this endpoint.
llm_ques_gen_pipeline = ChatOpenAI(
    model="openai/gpt-4o-mini",
    base_url="https://models.github.ai/inference",
    temperature=0.3,
)

In [28]:
# Initial prompt for the refine chain: asks for 10 questions about the chunk
# substituted for the `{text}` placeholder.
prompt_template = """
You are an expert in generating questions based on the provided text.
Your goal is to prepare a person for a interview by generating relevant questions.
You do this by asking questions that are relevant to the text provided.

--------------
{text}
--------------

Create a list of 10 questions based on the text above.
Make sure not to lose any important information.

Questions:
"""


In [29]:
from langchain.prompts import PromptTemplate
# Wrap the initial prompt string; its only placeholder is `text`.
prompt_questions = PromptTemplate(
    input_variables=["text"],
    template=prompt_template,
)

In [41]:
# Follow-up prompt for the refine chain. `{existing_answer}` receives the
# questions produced so far and `{text}` the next chunk of source text.
refine_template = """
You are an expert in generating questions based on the provided text.
Your goal is to prepare a person for a interview by generating relevant questions.
We have received some practice questions to a certain extent: {existing_answer}
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below

--------------
{text}
--------------

Give the new content with 10 questions only, refine the original questions in English.
If the context is not helpful, please provide the original questions.

Questions:

"""

In [42]:
# Wrap the refine prompt string; it takes the previous questions
# (`existing_answer`) and the next chunk (`text`).
refine_prompt_questions = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

In [43]:
from langchain.chains.summarize import load_summarize_chain
# Build a "refine" chain: the question prompt runs first, then the refine
# prompt runs with the running result plus additional text.
# `verbose=True` prints every formatted prompt, which makes the cell output
# very long.
question_gen_chain = load_summarize_chain(
    llm=llm_ques_gen_pipeline,
    chain_type="refine",
    verbose=True,
    question_prompt=prompt_questions,
    refine_prompt=refine_prompt_questions,
)

In [44]:
# Run the refine chain over the documents and keep the final question list.
# `Chain.run` is deprecated; `invoke` takes the chain's input key
# ("input_documents") and returns a dict whose "output_text" entry holds the
# generated text.
# NOTE(review): this feeds the small answer-generation chunks
# (`documents_ans_gen`) to the question chain, while `documents_ques_gen` was
# prepared with the question-generation splitter — confirm which is intended.
ques = question_gen_chain.invoke({"input_documents": documents_ans_gen})["output_text"]
print(ques)  # Print the generated questions



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert in generating questions based on the provided text.
Your goal is to prepare a person for a interview by generating relevant questions.
You do this by asking questions that are relevant to the text provided.

--------------


IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD 
CAME TOGETHER TO FACE THE FUTURE.
And what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. 
Not just in some faraway place, but in their own cities and towns and villages.
They knew things didn’t have to be this way. They knew we had enough 
food to feed the world, but that it wasn’t getting shared. They knew there 
were medicines for HIV and other diseases, but they cost a lot. They knew 
that earthquakes and floods were inevitable, but that the high death 
tolls were not. 
They also knew that billions of people worldwide shared their ho

# Vector Embeddings

In [45]:
# Import from `langchain_openai` (already used in this notebook for ChatOpenAI)
# instead of the deprecated `langchain.embeddings` re-export.
from langchain_openai import OpenAIEmbeddings

# Embedding model used to index the answer-generation documents.
# NOTE(review): this endpoint differs from the `base_url` used by the chat
# models in this notebook — confirm that both are intended.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_base="https://models.inference.ai.azure.com",
)

In [46]:
from langchain.vectorstores import FAISS

In [47]:
# Build a FAISS vector store from the small answer-generation documents,
# using the embedding model configured above.
vectorstore = FAISS.from_documents(
    documents=documents_ans_gen,
    embedding=embeddings,
)

# Answer Generation

In [56]:
# Chat model used to answer the generated questions: a different model than
# the question-generation one, with a lower temperature (0.1 vs 0.3).
llm_answer_gen = ChatOpenAI(
    model="openai/gpt-4o",
    base_url="https://models.github.ai/inference",
    temperature=0.1,
)

In [49]:
# Inspect the raw model output (one string) before parsing it into questions.
ques

'Based on the new content provided, here are 10 refined questions that incorporate the themes of environmental sustainability, peace, and global cooperation related to the Sustainable Development Goals (SDGs):\n\n1. How do the Sustainable Development Goals (SDGs) address the issue of ocean health, particularly in relation to carbon dioxide levels and plastic pollution?\n\n2. What specific targets do the SDGs set for the protection and sustainable use of terrestrial ecosystems, and why is this important for human survival and biodiversity?\n\n3. Can you discuss the alarming trends related to land degradation and biodiversity loss, and how the SDGs aim to combat these issues by 2030?\n\n4. In what ways do peace and justice contribute to sustainable development, and what measures do the SDGs propose to promote these values globally?\n\n5. How can strengthening the rule of law and reducing violence contribute to a country’s development, according to the principles outlined in the SDGs?\n\n

In [52]:
import re

# Extract the individual questions from the raw model output.
# Slicing off the first and last paragraph (`split("\n\n")[1:-1]`) silently
# drops real questions whenever the model omits its introduction or closing
# remark, so select lines by shape instead: anything numbered "1." / "1)".
# The numbering prefix is kept so downstream output is unchanged.
numbered_question = re.compile(r"^\d+[.)]\s+\S")
ques_lines = [line.strip() for line in ques.splitlines() if line.strip()]
ques_cleaned = [line for line in ques_lines if numbered_question.match(line)]
if not ques_cleaned:
    # Fallback for unnumbered output: keep every line phrased as a question.
    ques_cleaned = [line for line in ques_lines if line.endswith("?")]
ques_cleaned

['1. How do the Sustainable Development Goals (SDGs) address the issue of ocean health, particularly in relation to carbon dioxide levels and plastic pollution?',
 '2. What specific targets do the SDGs set for the protection and sustainable use of terrestrial ecosystems, and why is this important for human survival and biodiversity?',
 '3. Can you discuss the alarming trends related to land degradation and biodiversity loss, and how the SDGs aim to combat these issues by 2030?',
 '4. In what ways do peace and justice contribute to sustainable development, and what measures do the SDGs propose to promote these values globally?',
 '5. How can strengthening the rule of law and reducing violence contribute to a country’s development, according to the principles outlined in the SDGs?',
 '6. What role does global cooperation play in achieving the Sustainable Development Goals, and how can countries work together to implement these ambitious targets?',
 '7. How does the interconnectedness of 

In [57]:
from langchain.chains import RetrievalQA
# Retrieval-augmented QA chain: documents fetched from the FAISS store are
# combined with the "stuff" chain type and passed to the answer model.
# NOTE(review): `as_retriever()` is called with its defaults — confirm the
# number of retrieved documents suits the 1,000-token chunks.
answer_generation_chain = RetrievalQA.from_chain_type(
    llm=llm_answer_gen,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [58]:
# Answer each question and save to a file.
# The file is opened once for the whole loop instead of once per question.
# NOTE: append mode is kept, so re-running this cell adds a second copy of
# every answer; delete ./data/answers.txt first for a clean run.
with open("./data/answers.txt", "a") as f:
    for question in ques_cleaned:
        # `Chain.run` is deprecated; `invoke` takes the chain's input key
        # ("query") and returns a dict whose "result" entry is the answer.
        answer = answer_generation_chain.invoke({"query": question})["result"]
        f.write(f"Q: {question}\nA: {answer}\n\n")

In [60]:
# Read the saved question/answer pairs back from disk and show them.
with open("./data/answers.txt") as answers_file:
    answers = answers_file.read()
print(answers)  # Visual check of the file contents

Q: 1. How do the Sustainable Development Goals (SDGs) address the issue of ocean health, particularly in relation to carbon dioxide levels and plastic pollution?
A: The Sustainable Development Goals (SDGs) address the issue of ocean health through **Goal 14: Conserve and sustainably use the oceans, seas, and marine resources for sustainable development**. This goal highlights the critical role oceans play in supporting life on Earth and outlines specific challenges and solutions related to ocean health, particularly in relation to carbon dioxide levels and plastic pollution:

1. **Carbon Dioxide Levels and Ocean Acidification**:
   - Oceans absorb about 30% of the carbon dioxide (CO2) produced by human activities. However, the increasing levels of CO2 emissions are causing ocean acidification, which has risen by 26% since the start of the industrial revolution. This acidification threatens marine ecosystems, particularly coral reefs and shellfish populations.
   - The SDGs emphasize th