In [16]:
import ollama
from langchain_unstructured import UnstructuredLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama import OllamaEmbeddings
import langchain_chroma
import langchain_community.vectorstores.utils

# Name of the Ollama model used to embed both the stored chunks and the
# text handed to the semantic splitter below.
EMBEDDING_MODEL = 'mxbai-embed-large'

# Ask Ollama to pull the model. A ResponseError is reported on stdout and the
# cell carries on; any other exception still propagates.
try:
    ollama.pull(EMBEDDING_MODEL)
except ollama.ResponseError as pull_error:
    print('Error:', pull_error.error)

embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Chroma vector store for the thesis material, persisted on local disk.
collection = langchain_chroma.Chroma(
    collection_name='thesis_research',
    persist_directory="./research_papers/embeddings.db",
    embedding_function=embeddings,
)

# Chunker driven by the same embeddings, using the standard-deviation
# breakpoint threshold.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="standard_deviation",
)

INFO: HTTP Request: POST http://127.0.0.1:11434/api/pull "HTTP/1.1 200 OK"


In [2]:
# PDFs to ingest, given as paths relative to the notebook's working directory.
filepaths = [
    "research_papers/report.pdf",
    "research_papers/n11037580_cab432_finished_a3_report.pdf",
]

In [11]:
def embedd_pdfs(filepaths, collection, splitter=None):
    """Load, chunk and store every PDF in ``filepaths`` into ``collection``.

    Each file is read with ``UnstructuredLoader`` using the ``hi_res``
    strategy. Every loaded element is passed through the splitter, and the
    resulting chunks are added to ``collection`` under IDs of the form
    ``"<filepath>:<n>"``, where ``n`` is the chunk's position within that file.

    Args:
        filepaths: Iterable of paths to the PDF files to ingest.
        collection: Vector store exposing ``add_documents(documents, ids=...)``.
        splitter: Object exposing ``split_documents(list_of_documents)``.
            When omitted, the notebook-level ``text_splitter`` is used.

    Returns:
        None. The only effect is the documents added to ``collection``.
    """
    if splitter is None:
        # Looked up at call time, so the setup cell only has to run before
        # this function is called, not before it is defined.
        splitter = text_splitter

    for filepath in filepaths:
        loader = UnstructuredLoader(file_path=filepath, strategy="hi_res")

        chunks = []
        for element in loader.lazy_load():
            # Split each loaded element so no single chunk is too long.
            chunks.extend(splitter.split_documents([element]))

        if not chunks:
            # Nothing was extracted from this file: skip it rather than hand
            # the store an empty batch (presumably rejected by the backend --
            # TODO confirm against the Chroma version in use).
            continue

        # Sanitise metadata once per file, after all chunks are collected,
        # instead of re-filtering the growing list on every element.
        chunks = langchain_community.vectorstores.utils.filter_complex_metadata(chunks)

        collection.add_documents(
            chunks,
            ids=[f'{filepath}:{position}' for position in range(len(chunks))],
        )

In [17]:
# Ingest every PDF listed in `filepaths` into the vector store.
embedd_pdfs(filepaths=filepaths, collection=collection)

INFO: Reading PDF for file: research_papers/report.pdf ...
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http

In [20]:
# Sample question run against the store; the matches are shown as the cell's output.
question = "How can the cost be reduced in a service?"
collection.similarity_search(question)

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


[Document(id='research_papers/n11037580_cab432_finished_a3_report.pdf:173', metadata={'category': 'NarrativeText', 'detection_class_prob': 0.9335010647773743, 'element_id': '11e07328f29333643ac9eed9d2a4f9be', 'file_directory': 'research_papers', 'filename': 'n11037580_cab432_finished_a3_report.pdf', 'filetype': 'application/pdf', 'last_modified': '2024-11-06T22:56:35', 'page_number': 7, 'parent_id': '4473c0bdc916f8e57b8c27de47d3b5e2', 'source': 'research_papers/n11037580_cab432_finished_a3_report.pdf'}, page_content='The cost at scale would prove prohibitive as services such as Lambda and SQS would no longer be in the free tier. Due to the larger scale it would become economically viable to move the transcoding service to an EC2 instance. At 10,000 concurrent users the load on the transcoding service should be nearly constant.'),
 Document(id='research_papers/n11037580_cab432_finished_a3_report.pdf:189', metadata={'category': 'NarrativeText', 'detection_class_prob': 0.9242711067199707,