# In [None]:
from langchain_chroma import Chroma
from transformers import AutoTokenizer
from langchain_ibm import ChatWatsonx, WatsonxLLM, WatsonxEmbeddings
from langchain_text_splitters import MarkdownTextSplitter
import os
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from utils import *
from langchain_core.documents import Document

# Marker handed to docling's markdown export as the page-break placeholder;
# the exported text is later split on it to recover individual pages.
PAGE_BREAK = "<!-- page break -->"

# File names (not full paths) of the PDFs found directly in the corpus folder.
file_paths = list(
    filter(lambda name: name.endswith(".pdf"), os.listdir("data/climate_edu"))
)

# Hugging Face tokenizer used to measure chunk lengths in tokens.
# NOTE(review): presumably the same model as the watsonx embedder - confirm.
tokenizer = AutoTokenizer.from_pretrained(
    "ibm-granite/granite-embedding-278m-multilingual"
)

# Token budget: the tokenizer's reported maximum minus a 10-token margin.
max_length = tokenizer.model_max_length - 10

# Markdown-aware splitter whose lengths are counted with the tokenizer above.
# Chunk size sits a further 10 tokens below max_length; the overlap between
# neighbouring chunks is one tenth of max_length.
markdown_splitter = MarkdownTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    chunk_overlap=int(max_length / 10),
    chunk_size=max_length - 10,
)
# watsonx.ai embedding client; endpoint and credentials are read from the
# URL, API_KEY and PROJECT_ID environment variables (None when unset).
embedder = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url=os.environ.get("URL"),
    apikey=os.environ.get("API_KEY"),
    project_id=os.environ.get("PROJECT_ID"),
)

# Chroma collection "climate_edu" stored under data/chroma_db, using the
# embedder above as its embedding function.
vector_store = Chroma(
    embedding_function=embedder,
    collection_name="climate_edu",
    persist_directory="data/chroma_db",
)


# PDF pipeline settings: default options, with the accelerator set to
# automatic device selection and 12 threads.
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.AUTO,
    num_threads=12,
)

# Converter that applies the options above to PDF input.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)


# Ingest every PDF: convert it to markdown with docling, split the markdown
# into pages and token-bounded chunks, then add the chunks to the vector store.
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    # file_paths holds bare file names; build the path the converter needs.
    # file_path is rebound to the joined path, as before, so it holds the same
    # value after the loop as it did in the original cell.
    file_path = os.path.join("data/climate_edu", file_path)
    source_path = file_path
    result = converter.convert(source_path)
    doc = result.document
    doc_name = doc.name

    # Export with an explicit page marker so the text can be split per page;
    # images are replaced by an empty placeholder.
    doc_md = doc.export_to_markdown(
        page_break_placeholder=PAGE_BREAK,
        image_placeholder="",
    )
    pages = [markdown_cleanup(page) for page in doc_md.split(PAGE_BREAK)]

    chunked_docs = []
    for page_number, page in enumerate(pages, start=1):
        # Report progress once per page rather than once per chunk.
        print(f"processing page {page_number} of {doc_name}")
        chunks = markdown_splitter.split_text(page)
        for chunk_number, chunk in enumerate(chunks, start=1):
            # Only text and metadata are set here: Document has no embeddings
            # field, and the vector store was built with its own embedding
            # function, so no token IDs are attached to the document.
            chunked_docs.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "file_name": doc_name,
                        "page": page_number,
                        "chunk": chunk_number,
                    },
                )
            )
    print("done processing file: ", source_path)

    # A file may yield no chunks (e.g. no extractable text); avoid sending an
    # empty batch to the embedder / vector store in that case.
    if chunked_docs:
        vector_store.add_documents(chunked_docs)
    else:
        print(f"no text chunks extracted from {source_path}; nothing added")