In [2]:
import os
import glob
from dotenv import load_dotenv

load_dotenv()

MODEL = "llama3"

In [3]:
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from operator import itemgetter

# TODO: Using Memory search for now. Good idea to switch to vector database in future
from langchain_community.vectorstores import DocArrayInMemorySearch, Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader

import uuid
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_core.documents import Document
from langchain.document_loaders import DirectoryLoader, TextLoader

from tqdm import tqdm


In [4]:
model = Ollama(model=MODEL)
ollama_embeddings = OllamaEmbeddings(model=MODEL)
parser = StrOutputParser()

# Loading Files to create vectorstore

In [5]:
pdf_loader = DirectoryLoader("../data/", glob="*/*.pdf", loader_cls=PyPDFLoader,silent_errors=True)
pdf_documents = pdf_loader.load()
txt_loader = DirectoryLoader("../data/", glob="*/*.txt", loader_cls=TextLoader,silent_errors=True)
txt_documents = txt_loader.load()
# tl = TextLoader("..\data\test\HPP Mega Set Combined via Jacob Cole and RPC.txt",encoding='UTF-8')

invalid pdf header: b'<?xml'
EOF marker not found
Error loading file ..\data\HPP Papers (PDF) Via Liezl Puzon\10.1136___bcr-2019-233350.pdf: Stream has ended unexpectedly
invalid pdf header: b'<html'
EOF marker not found
Error loading file ..\data\HPP Papers (PDF) Via Liezl Puzon\10.1136___pgmj.53.618.204.pdf: Stream has ended unexpectedly
invalid pdf header: b'<html'
EOF marker not found
Error loading file ..\data\HPP Papers (PDF) Via Liezl Puzon\10.1136___pgmj.53.622.480.pdf: Stream has ended unexpectedly
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /90ms-RKSJ-V not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /90ms-RKSJ-V not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /90ms-RKSJ-V not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /90ms-RKSJ-V not implemented yet
Advanced encoding /90ms-RKSJ-H not implemented yet
Advanced encoding /90ms-RKSJ

In [6]:
def create_and_save_vector_store(docs, persist_directory):
    vectordb = Chroma.from_documents(docs, ollama_embeddings, persist_directory=persist_directory)
    # Persist the database to disk
    vectordb.persist()
    return vectordb
def load_persisted_db(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory)
    return vectordb

In [7]:
partial_docs = [*pdf_documents, *txt_documents]

# Summaries

In [23]:
some_docs = partial_docs[:10]
some_docs

[Document(page_content='Severe Cleidocranial dysplasia and Hypophosphatasia in a child\nwith microdeletion of the C-terminal region of RUNX2\nAreeg H. El-Gharbawy1,2, Joseph N. Peeden3, Ralph S. Lachman2,4, John M. Graham2,4,\nStephen R. Moore2, and David L. Rimoin2,4\n1Faculty of Medicine Cairo University, LA, California\n2Medical Genetics Institute, Cedars-Sinai Medical Center, LA, California\n3East Tennessee Children’s Hospital, Los Angeles California\n4David Geffen School of Medicine at UCLA, Los Angeles California\nAbstract\nCleidocranial dysplasia (CCD) is a rare autosomal dominant skeletal dysplasia due to mutations\ncausing haploinsufficiency of RUNX2 , an osteoblast transcription factor specific for bone and\ncartilage. The classic form of CCD is characterized by delayed closure of the fontanels,\nhypoplastic or aplastic clavicles and dental anomalies. Clinical reports suggest that a subset of\npatients with CCD have skeletal changes which mimic hypophosphatasia. Mutations in 

In [24]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | model
    | StrOutputParser()
)

summaries = chain.batch(some_docs, {"max_concurrency": 5})

In [25]:
from langchain.storage import InMemoryByteStore
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries",
                     embedding_function=ollama_embeddings)

# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
doc_ids = [str(uuid.uuid4()) for _ in some_docs]

# Docs linked to summaries
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

# Add
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, some_docs)))

In [26]:
query = "HPP"
sub_docs = vectorstore.similarity_search(query,k=1)
sub_docs[0]

Document(page_content="The document appears to be a figure from a medical study. Figure 1 shows an image of the patient's chest wall and back/spine at the age of 7 years. The image highlights two conditions: pectus (a congenital deformity of the breastbone) and kyphoscoliosis (a curvature of the spine).", metadata={'doc_id': 'c4f5fc13-dce5-450f-90ec-344bf92f7e80'})

In [22]:
retrieved_docs = retriever.get_relevant_documents(query,n_results=1)
retrieved_docs[0].page_content[0:500]

  warn_deprecated(
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


'are consistent features of this disorder [Otto et al., 2002; Quack et al., 1999]. Craniofacial\nfeatures include brachycephaly, delayed closure of the fontanelles and sutures, wormian\nbones, frontal and biparietal bossing, relative macrocephaly, depressed nasal bridge, midface\nhypoplasia, unerupted teeth with supernumerary permanent teeth and delayed union of the\nmandibular symphysis. Other skeletal features include short stature (mild to moderate), genu\nvalgum, delayed ossification of the pubic '