In [2]:
from langchain_chroma import Chroma
from transformers import AutoTokenizer
from langchain_ibm import ChatWatsonx, WatsonxLLM, WatsonxEmbeddings
from langchain_text_splitters import MarkdownTextSplitter
import os
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from app.utils import *
from langchain_core.documents import Document
from dotenv import load_dotenv
load_dotenv()
PAGE_BREAK = "<!-- page break -->"

file_paths = [ file for file in os.listdir("app/data/climate_edu") if file.endswith(".pdf") ]
tokenizer = AutoTokenizer.from_pretrained(
    "ibm-granite/granite-embedding-278m-multilingual"
)
max_length = tokenizer.model_max_length - 10

markdown_splitter = MarkdownTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    chunk_size=max_length-10,
    chunk_overlap=int(max_length/10)
)
# Due to outage
# embedder = WatsonxEmbeddings(
#     url=os.getenv("URL"),
#     apikey=os.getenv("API_KEY"),
#     project_id=os.getenv("PROJECT_ID"),
#     model_id="ibm/granite-embedding-278m-multilingual"
# ) 
from langchain_huggingface import HuggingFaceEndpointEmbeddings
model = "ibm-granite/granite-embedding-278m-multilingual"
hf = HuggingFaceEndpointEmbeddings(
    model=model,   
)
embedder = hf
vector_store = Chroma(
    collection_name="climate_edu",
    persist_directory="app/data/chroma_db",
    embedding_function=embedder
)


pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=12, device=AcceleratorDevice.AUTO
)
converter = DocumentConverter(
            format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        })


for file_path in file_paths:
    print(f"Processing file: {file_path}")
    file_path = os.path.join("app/data/climate_edu", file_path)    
    result = converter.convert(file_path)
    doc = result.document
    doc_name = doc.name
    doc_md = doc.export_to_markdown(page_break_placeholder=PAGE_BREAK, image_placeholder="",)
    pages = doc_md.split(PAGE_BREAK)
    pages = [markdown_cleanup(page) for page in pages]
    chunked_docs = []
    for i, page in enumerate(pages):
        chunks = markdown_splitter.split_text(page)
        for j, chunk in enumerate(chunks):
            print(f"processing page {i + 1} of {doc_name}")
            document = Document(
                page_content=chunk,
                embeddings=tokenizer.encode(chunk, add_special_tokens=False),
                metadata={
                    "file_name": doc_name,
                    "page": i + 1,
                    "chunk": j + 1
                }
            )
            chunked_docs.append(document)
    print("done processing file: ", file_path)
    vector_store.add_documents(chunked_docs)

Processing file: Greening curriculum guidance Teaching and learning for climate action.pdf


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  np.nanmean(
  np.nanmean(


processing page 1 of Greening curriculum guidance Teaching and learning for climate action
processing page 2 of Greening curriculum guidance Teaching and learning for climate action
processing page 2 of Greening curriculum guidance Teaching and learning for climate action
processing page 3 of Greening curriculum guidance Teaching and learning for climate action
processing page 4 of Greening curriculum guidance Teaching and learning for climate action
processing page 5 of Greening curriculum guidance Teaching and learning for climate action
processing page 5 of Greening curriculum guidance Teaching and learning for climate action
processing page 6 of Greening curriculum guidance Teaching and learning for climate action
processing page 6 of Greening curriculum guidance Teaching and learning for climate action
processing page 6 of Greening curriculum guidance Teaching and learning for climate action
processing page 7 of Greening curriculum guidance Teaching and learning for climate action

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  np.nanmean(


processing page 1 of unesco01_0
processing page 2 of unesco01_0
processing page 3 of unesco01_0
processing page 4 of unesco01_0
processing page 4 of unesco01_0
processing page 5 of unesco01_0
processing page 5 of unesco01_0
processing page 6 of unesco01_0
processing page 6 of unesco01_0
processing page 7 of unesco01_0
processing page 7 of unesco01_0
processing page 8 of unesco01_0
processing page 9 of unesco01_0
processing page 10 of unesco01_0
processing page 10 of unesco01_0
processing page 11 of unesco01_0
processing page 12 of unesco01_0
processing page 12 of unesco01_0
processing page 13 of unesco01_0
processing page 13 of unesco01_0
processing page 14 of unesco01_0
processing page 14 of unesco01_0
processing page 15 of unesco01_0
processing page 16 of unesco01_0
processing page 16 of unesco01_0
processing page 17 of unesco01_0
processing page 17 of unesco01_0
processing page 18 of unesco01_0
processing page 18 of unesco01_0
processing page 19 of unesco01_0
processing page 19 of u

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  np.nanmean(


processing page 1 of 389801eng
processing page 2 of 389801eng
processing page 3 of 389801eng
processing page 3 of 389801eng
processing page 4 of 389801eng
processing page 5 of 389801eng
processing page 6 of 389801eng
processing page 7 of 389801eng
processing page 8 of 389801eng
processing page 8 of 389801eng
processing page 9 of 389801eng
processing page 9 of 389801eng
processing page 9 of 389801eng
processing page 10 of 389801eng
processing page 11 of 389801eng
processing page 11 of 389801eng
processing page 11 of 389801eng
processing page 12 of 389801eng
processing page 12 of 389801eng
processing page 12 of 389801eng
processing page 13 of 389801eng
processing page 13 of 389801eng
processing page 13 of 389801eng
processing page 14 of 389801eng
processing page 14 of 389801eng
processing page 15 of 389801eng
processing page 15 of 389801eng
processing page 15 of 389801eng
processing page 16 of 389801eng
processing page 16 of 389801eng
processing page 16 of 389801eng
processing page 17 of

Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


processing page 32 of 389801eng
processing page 32 of 389801eng
processing page 32 of 389801eng
processing page 33 of 389801eng
processing page 33 of 389801eng
processing page 33 of 389801eng
processing page 34 of 389801eng
processing page 34 of 389801eng
processing page 34 of 389801eng
processing page 34 of 389801eng
processing page 35 of 389801eng
processing page 35 of 389801eng
processing page 35 of 389801eng
processing page 36 of 389801eng
processing page 36 of 389801eng
processing page 36 of 389801eng
processing page 37 of 389801eng
processing page 37 of 389801eng
processing page 38 of 389801eng
processing page 38 of 389801eng
processing page 39 of 389801eng
processing page 40 of 389801eng
processing page 40 of 389801eng
done processing file:  app/data/climate_edu/389801eng.pdf


In [10]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEndpointEmbeddings
model = "ibm-granite/granite-embedding-278m-multilingual"
embedder = HuggingFaceEndpointEmbeddings(
    model=model,   
)
vector_store = Chroma(
    collection_name="climate_edu",
    persist_directory="app/data/chroma_db",
    embedding_function=embedder
)


In [13]:
docs = vector_store.similarity_search_with_relevance_scores(
    "what is climate change?",
    k=5,
    score_threshold=0.65
)
docs

[(Document(id='dba63d24-542f-4408-977b-d94e4f877713', metadata={'page': 15, 'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'chunk': 1}, page_content="## Definitions of Weather, Climate, Climate Change, and Global Warming\nNOAA has provided an easy-to-understand explanation of some essential terms and relationships.\n## How is weather different from climate?\nWeather and climate describe the same thing-the state of the atmosphere-but at different time scales.\nchanges in temperature, precipitation, humidity, ocean heat, wind patterns, sea level, sea ice extent, and other factors, and how these changes affect life on Earth.\nWeather is what you experience when you step outside on any given day. In other words, it is the state of the atmosphere at a particular location over the short term. Climate is the average of the weather patterns in a location over a longer period of time, usually 30 years or more.\n## What is the difference between climate chang

In [None]:
vector_store.search(
    
)

In [70]:
vector_store = Chroma(
    collection_name="climate_edu",
    persist_directory="app/data/chroma_db",
    embedding_function=embedder
)

docs = vector_store.similarity_search_with_relevance_scores(
    "What is climate change",
    k=10,
    score_threshold=0.7
)
docs

[(Document(id='dba63d24-542f-4408-977b-d94e4f877713', metadata={'page': 15, 'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'chunk': 1}, page_content="## Definitions of Weather, Climate, Climate Change, and Global Warming\nNOAA has provided an easy-to-understand explanation of some essential terms and relationships.\n## How is weather different from climate?\nWeather and climate describe the same thing-the state of the atmosphere-but at different time scales.\nchanges in temperature, precipitation, humidity, ocean heat, wind patterns, sea level, sea ice extent, and other factors, and how these changes affect life on Earth.\nWeather is what you experience when you step outside on any given day. In other words, it is the state of the atmosphere at a particular location over the short term. Climate is the average of the weather patterns in a location over a longer period of time, usually 30 years or more.\n## What is the difference between climate chang

In [56]:
for doc in docs:
    print(doc[0].metadata)

{'chunk': 1, 'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'page': 15}
{'file_name': 'Greening curriculum guidance Teaching and learning for climate action', 'page': 12, 'chunk': 1}
{'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'page': 14, 'chunk': 1}
{'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'chunk': 1, 'page': 11}
{'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'page': 21, 'chunk': 2}
{'file_name': 'unesco01_0', 'chunk': 1, 'page': 87}
{'page': 51, 'chunk': 3, 'file_name': 'Greening curriculum guidance Teaching and learning for climate action'}


In [66]:
for i,doc in enumerate(docs):
    source_path = os.path.join("data/climate_edu", doc[0].metadata['file_name']+".pdf")
    page = doc[0].metadata['page']

In [61]:
for doc in docs:
    source_path = os.path.join("data/climate_edu", doc[0].metadata['file_name']+".pdf")
    page = doc[0].metadata['page']

In [41]:
docs[0][0].metadata['page']

15

In [46]:
docs[0][0].metadata.get("page")

15

In [None]:
(Document(id='dba63d24-542f-4408-977b-d94e4f877713', metadata={'chunk': 1, 'file_name': 'Guidelines-#7-Educating-for-Climate-Action-and-Justice-Accessible_0', 'page': 15}, page_content="## Definitions of Weather, Climate, Climate Change, and Global Warming\nNOAA has provided an easy-to-understand explanation of some essential terms and relationships.\n## How is weather different from climate?\nWeather and climate describe the same thing-the state of the atmosphere-but at different time scales.\nchanges in temperature, precipitation, humidity, ocean heat, wind patterns, sea level, sea ice extent, and other factors, and how these changes affect life on Earth.\nWeather is what you experience when you step outside on any given day. In other words, it is the state of the atmosphere at a particular location over the short term. Climate is the average of the weather patterns in a location over a longer period of time, usually 30 years or more.\n## What is the difference between climate change and global warming?\nClimate change refers to any significant change in the measure of climate for extended periods of time, usually over decades or longer. This includes major, long-term\nGlobal warming is one aspect of climate change. Specifically, it relates to the recent and ongoing rise in global average temperatures near Earth's surface (land, ocean, or both). Over the last 50 years, global warming has primarily been due to the increase of heat-trapping pollutants, called greenhouse gases, that humans are adding to the atmosphere primarily by burning fossil fuels.\nSource: NOAA. What's the difference between climate and weather? March 9, 2016. https://www.noaa.gov/explainers/ what-s-difference-between-climate-and-weather"), 0.803942680943024)


In [None]:
def export_relevant_pages(file_name, page_number):
    print("File name is:", file_name)
    print("Page number is:", page_number)
    source_path = os.path.join("data/climate_edu", file_name +".pdf")
    result = converter.convert(source_path)
    doc = result.document
    
    page_image = doc.pages[page_number-1].to_image()
    
    os.makedirs("output", exist_ok=True)
    
    output_path = f"./test{page_number}-{file_name}.png"

    page_image.save(output_path)
    return None

In [None]:
for j,i in enumerate(docs):
    if(j<1):
        export_relevant_pages((i[0].metadata['file_name']), (i[0].metadata['page']))
    