In [5]:
import os  # Standard-library access to the process environment.
from dotenv import load_dotenv  # Reads KEY=value pairs from a .env file.

# Pull the variables defined in the .env file into this process's environment.
load_dotenv()

# Sanity check: report only WHETHER the key is set, never its value, so the
# secret cannot end up in the saved notebook output. A missing key and an
# empty string are both reported as False.
api_key_present = bool(os.environ.get("OPENAI_API_KEY"))
print("Key loaded:", api_key_present)

Key loaded: True


In [8]:
from langchain.document_loaders import PyPDFLoader  # Loads and parses the content of PDF files.
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Splits long text into smaller chunks for embedding and search.

# Load the PDF document. The path is relative to the notebook's working directory.
pdf_path = "COMMUNITY+ARCHETYPES_Final.08.22.24.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()  # LangChain Document objects holding the extracted text.

# Chunking configuration:
#   chunk_size=500   -> each chunk holds roughly 500 characters
#   chunk_overlap=50 -> consecutive chunks share 50 characters so context
#                       is not lost at chunk boundaries
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

# Break the loaded Documents into the smaller Document chunks that the
# embedding, vector-search and LLM steps below operate on.
docs = splitter.split_documents(documents)

Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
Ignoring wrong pointing object 92 0 (offset 0)
Ignoring wrong pointing object 116 0 (offset 0)
Ignoring wrong pointing object 118 0 (offset 0)
Ignoring wrong pointing object 131 0 (offset 0)
Ignoring wrong pointing object 137 0 (offset 0)
Ignoring wrong pointing object 139 0 (offset 0)
Ignoring wrong pointing object 141 0 (offset 0)
Ignoring wrong pointing object 143 0 (offset 0)
Ignoring wrong pointing object 152 0 (offset 0)


In [9]:
from langchain.embeddings import OpenAIEmbeddings  # Generates vector embeddings (numerical representations of text).

# Embeddings are what make semantic (meaning-based) search possible.
embeddings = OpenAIEmbeddings()  # Embedding client instance.

# Collect the raw text of every chunk produced from the PDF ...
chunk_texts = [chunk.page_content for chunk in docs]

# ... and send it to OpenAI's embedding API via embed_documents.
# The result is a list of vectors, one per chunk, in the same order as `docs`.
doc_embeddings = embeddings.embed_documents(chunk_texts)

In [11]:
#Using FAISS (Facebook AI Similarity Search): a fast, simple in-memory vector search engine

In [13]:
from langchain.vectorstores import FAISS  # FAISS-backed vector store wrapper.

# Build the FAISS index from the vectors that were ALREADY computed in the
# embedding cell (`doc_embeddings`). FAISS.from_documents(docs, embeddings)
# would call the embedding API a second time for every chunk, doubling the
# cost and latency for an identical index.
chunk_texts = [doc.page_content for doc in docs]
chunk_metadatas = [doc.metadata for doc in docs]

# Guard against stale kernel state: `docs` is rebound to the search results
# by a later cell, so re-running this cell out of order would otherwise pair
# texts with the wrong vectors without any error.
if len(doc_embeddings) != len(chunk_texts):
    raise ValueError(
        f"docs ({len(chunk_texts)}) and doc_embeddings ({len(doc_embeddings)}) "
        "are out of sync; re-run the PDF splitting and embedding cells first."
    )

# A concrete list (not a lazy zip iterator) is passed because the pairs may be
# iterated more than once inside from_embeddings.
# `embeddings` is still required: the store uses it to embed incoming queries.
vectorstore = FAISS.from_embeddings(
    text_embeddings=list(zip(chunk_texts, doc_embeddings)),
    embedding=embeddings,
    metadatas=chunk_metadatas,
)

In [14]:
query = 'What is High O&G Production?'

# Semantic search: the query is passed to the vector store, which compares it
# against the stored chunk vectors; k=3 keeps the three most similar chunks.
# NOTE(review): this rebinds `docs` (previously the full list of PDF chunks) to
# the three search hits. The QA cell below relies on that, but re-running the
# embedding / indexing cells after this one would then see only these hits.
docs = vectorstore.similarity_search(query, k=3)

# Show each hit, numbered from 1, followed by the raw text of the chunk.
for rank, match in enumerate(docs, start=1):
    print(f'\n-- Match {rank}--')
    print(match.page_content)


-- Match 1--
High oil and gas production (4) High renewable energy capacity (8) Very small populations and population loss (17) High % of residents <HS education (9) High unemployment and High % <HS education (26)    Small population, high GDP and O&G production (1)  Very high population gain (1)

-- Match 2--
• Exceptionally high O&G production 4.5 – 7.4 billion BOE/year • Very low to average installed RE capacity 0.0 – 159 MW • Average to high employment in natural  resources, construction, and maintenance 14 – 27% • Average to high percent of residents  without a HS diploma 15 – 30% • Average to very high unemployment rates 5.7 – 9.6% • Average to high GDP per capita $142 thousand – $1.3 million High RE Capacity 8 • Exceptionally high installed RE capacity 970 – 2,800 MW/year • Low to average

-- Match 3--
(BOE) remaining—“twice the volume produced during the first 100 years of hydrocarbon production” (p. 1099). Direct O&G jobs doubled between 2009 and 2019 to toal >87,600.However,

In [15]:
from langchain.chat_models import ChatOpenAI #LangChain wrapper around OpenAI's GPT chat models, which we call in the code below
from langchain.chains.question_answering import load_qa_chain #Prebuilt QA chain in LangChain --> takes the query and docs and feeds them to the LLM for an answer

In [18]:
# Chat model: GPT-4 with a low temperature (0.2) to keep answers less random.
llm = ChatOpenAI(
    model_name='gpt-4',
    temperature=0.2,
)

# 'stuff' chain: all supplied documents are concatenated and handed to the
# model as a single input.
qa_chain = load_qa_chain(llm, chain_type='stuff')

# Ask the question, grounding the model on the top-matching chunks that the
# similarity search cell stored in `docs`.
response = qa_chain.run(
    input_documents=docs,
    question=query,
)

# Natural-language answer generated by the model.
print('\nAnswer:\n', response)


Answer:
 High Oil and Gas (O&G) production refers to a significant level of extraction and processing of oil and natural gas. In the context provided, it is quantified as 4.5 – 7.4 billion Barrels of Oil Equivalent (BOE) per year. This indicates a high level of activity in the oil and gas sector, contributing to the energy supply and economy of the region.
