In [2]:
from langchain.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import WebBaseLoader

# Cell purpose: load one glossary web page and ask a local Ollama model to
# list its term/definition pairs through a LangChain question-answering chain.

# Load the web page content
# NOTE(review): only this single URL is passed to the loader, while the query
# below asks about "all the glossary pages" — confirm whether further pages
# need to be loaded for the question to be answerable.
loader = WebBaseLoader("https://www.eia.gov/tools/glossary/")
documents = loader.load()

# Initialize Ollama LLM and embeddings
llm = Ollama(model="mistral:7b-instruct")
# NOTE(review): `embeddings` is not referenced again anywhere in this cell.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Create the question-answering chain
# NOTE(review): chain_type="stuff" presumably places every loaded document into
# one prompt — confirm against the LangChain docs. The answer recorded for this
# cell stops mid-sentence, which may indicate a context/output length limit.
chain = load_qa_chain(llm, chain_type="stuff")

# Ask a question
# The prompt is a triple-quoted literal, so the line break inside it is part of
# the text sent to the model.
query = """What are all the terms and definitions
present on all the glossary pages of the website? The pattern is term:definition. For example, ACBM:  Acronym for asbestos-containing building material."""
# Run the chain with the loaded documents as input and `query` as the question.
answer = chain.run(input_documents=documents, question=query)

print(answer)


 Here are the terms and their respective definitions from the provided list:

1. Average production per miner per hour: The ratio of the total production at a mining operation to the total direct labor hours worked at the operation.
2. Average Recovery Percentage (coal): The percentage of coal that can be recovered from known coal reserves at reporting mines, weight averaged for all mines in the reported geographic area.
3. Average revenue per kilowatthour: The average revenue per kilowatthour of electricity sold by sector (residential, commercial, industrial, or other) and geographic area (State, Census division, and national) is calculated by dividing the total monthly revenue by the corresponding total monthly sales for each sector and geographic area.
4. Average stream flow: The rate, usually expressed in cubic feet per second, at which water passes a given point in a stream over a set period of time.
5. Average vehicle fuel consumption: A ratio estimate defined as total gallons of

In [3]:
from langchain.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import WebBaseLoader
import re
import pandas as pd

# Load the web page content
# Same URL and loader call as in the earlier cell (In [2]); running this cell
# rebinds the notebook-level names `loader` and `documents`.
loader = WebBaseLoader("https://www.eia.gov/tools/glossary/")
documents = loader.load()

# Create a function to extract terms and definitions
def extract_term_definitions(text):
    """Extract ``term: definition`` pairs from plain text, one pair per line.

    A line yields a pair when it has the shape ``<term>: <definition>``:

    * the term starts with a word character and runs up to the first colon
      on that line;
    * the definition is the non-empty remainder of the same line.

    Matching is confined to a single line. A term never absorbs preceding
    lines, and a line ending in a bare colon (for example a heading) does not
    take the following line as its definition.

    Args:
        text: The text to scan. ``None`` or an empty string yields no pairs.

    Returns:
        A list of ``(term, definition)`` tuples in order of appearance, with
        surrounding whitespace stripped from both parts.
    """
    if not text:
        return []

    # re.MULTILINE anchors ^/$ at every line boundary. The term excludes both
    # ':' and '\n' so it stops at the first colon and cannot span lines. Only
    # horizontal whitespace ([ \t]) is allowed around the colon, and the
    # definition must begin with a non-whitespace character on the same line.
    pattern = re.compile(
        r'^[ \t]*(\w[^:\n]*?)[ \t]*:[ \t]*(\S[^\n]*)$',
        re.MULTILINE,
    )
    return [
        (match.group(1).strip(), match.group(2).strip())
        for match in pattern.finditer(text)
    ]

# Extract terms and definitions from all documents
# `all_terms` accumulates the (term, definition) tuples returned for each
# loaded document, in document order.
all_terms = []
for doc in documents:
    # Each document's `page_content` is passed to the extractor defined earlier in this cell.
    terms = extract_term_definitions(doc.page_content)
    all_terms.extend(terms)

# Convert the list of tuples to a DataFrame
# Each tuple becomes one row; its two positions map to the "Term" and
# "Definition" columns respectively.
df = pd.DataFrame(all_terms, columns=["Term", "Definition"])

# Save to an Excel file
# index=False keeps the DataFrame's row index out of the written sheet.
# NOTE(review): the pairs are written as extracted — there is no filtering or
# de-duplication step between extraction and export.
df.to_excel("glossary_terms.xlsx", index=False)

# NOTE(review): this message is printed unconditionally, including when
# `all_terms` is empty.
print("All terms and definitions saved to glossary_terms.xlsx")


All terms and definitions saved to glossary_terms.xlsx
