In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Name of the Ollama model; passed to both Ollama(...) and OllamaEmbeddings(...) below.
MODEL = "llama3"

In [2]:
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from operator import itemgetter

# TODO: Using an in-memory vector store for now; switch to a persistent vector database (e.g. Chroma) in the future.
from langchain_community.vectorstores import DocArrayInMemorySearch, Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [4]:
# LLM handle for the Ollama model named by MODEL.
model = Ollama(model=MODEL)
# NOTE(review): `parser` is not referenced by the chains in the visible cells below;
# they instantiate StrOutputParser() inline instead.
parser = StrOutputParser()

# Loading PDFs to create vectorstore

In [5]:
import os
import glob

def get_pdf_paths(directory):
  """
  Recursively collect the full paths of all PDF files under a directory.

  The result is sorted so that its order (and therefore any slice taken
  from it) is identical on every run and platform; ``os.walk`` on its own
  yields entries in an arbitrary, filesystem-dependent order.

  Args:
      directory (str): The root directory to search.
  Returns:
      list: Sorted full paths of every file whose name ends in ".pdf"
          (case-insensitive). Empty when no PDF is found or when the
          directory does not exist (``os.walk`` ignores that error).
  """
  pdf_paths = [
      os.path.join(root, filename)
      for root, _, files in os.walk(directory)
      for filename in files
      if filename.lower().endswith(".pdf")  # case-insensitive extension match
  ]
  pdf_paths.sort()  # deterministic order regardless of directory listing order
  return pdf_paths

# Folder holding the source PDFs. It can be overridden per machine through the
# RAREDISEASES_DATA_DIR environment variable (for example in the .env file read
# by load_dotenv() at the top of the notebook); the original hard-coded
# location remains the fallback so existing setups keep working.
folder_path = os.getenv(
    "RAREDISEASES_DATA_DIR",
    r"C:\Users\TechD\Documents\Marcus\LLMs\Rarediseases2024\data",
)

# Number of PDFs to index (presumably kept small to limit embedding time).
MAX_PDFS = 3
pdfs = get_pdf_paths(folder_path)[:MAX_PDFS]

In [6]:
# Embedding function backed by the Ollama model named by MODEL.
embeddings = OllamaEmbeddings(model=MODEL)

In [7]:
def load_multiple_pdfs(pdf_paths, embedding_model=None, chunk_size=300, chunk_overlap=50):
  """
  Load several PDFs, split them into overlapping chunks and index every chunk
  in an in-memory vector store.

  Args:
      pdf_paths (list): Full paths to the PDF files.
      embedding_model: Embeddings object used to embed the chunks. When
          omitted, ``OllamaEmbeddings(model=MODEL)`` is created.
      chunk_size (int): ``chunk_size`` passed to RecursiveCharacterTextSplitter.
      chunk_overlap (int): ``chunk_overlap`` passed to RecursiveCharacterTextSplitter.
  Returns:
      DocArrayInMemorySearch: Vector store containing the chunks of all
          given PDFs.
  Raises:
      ValueError: If ``pdf_paths`` is empty.
  """
  pdf_paths = list(pdf_paths)
  if not pdf_paths:
    # Fail early with a clear message instead of crashing later on an unbound name.
    raise ValueError("pdf_paths is empty: at least one PDF is required to build the vector store")

  # The splitter is stateless with respect to the input, so build it once.
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

  documents = []
  for path in pdf_paths:
    pages = PyPDFLoader(path).load()
    # extend (not append) keeps `documents` a flat list of chunk Documents.
    documents.extend(text_splitter.split_documents(pages))

  if embedding_model is None:
    embedding_model = OllamaEmbeddings(model=MODEL)

  # Index the chunks of ALL PDFs; the previous code indexed only the raw,
  # unsplit pages of the last PDF in the list.
  return DocArrayInMemorySearch.from_documents(documents, embedding=embedding_model)

In [8]:
# Build the vector store from the selected PDFs.
vectorstore = load_multiple_pdfs(pdfs) # TODO: embeddings and model as parameters

In [13]:
# Wrap the vector store as a retriever so it can be composed into the chains below.
retriever = vectorstore.as_retriever()

# Creating Prompts and Chain

In [9]:
# Input template
# Answering prompt: the model is told to answer from {context} and to reply
# "I don't know" when it cannot answer.
template = """Answer the question based on the context below. If you can't answer the question, reply "I don't know".
Context: {context}
Question: {question}"""
prompt = PromptTemplate.from_template(template = template)
# Smoke check that both placeholders are accepted. The formatted string is
# discarded because this is not the last expression of the cell.
prompt.format(context="Here is some context", question="Here is a question")

# Generate perspectives from question
# Query-expansion prompt: asks for five newline-separated rewordings of {question}.
prompt_perspectives_template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = PromptTemplate.from_template(template = prompt_perspectives_template)



In [10]:
def _split_queries(text):
    """Split the model's reply into one query per line, dropping blank lines.

    Blank or whitespace-only lines would otherwise be sent to the retriever
    as empty queries.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# Chain: question -> query-expansion prompt -> LLM -> plain string -> list of queries.
generate_queries = (
    prompt_perspectives
    | model
    | StrOutputParser()
    | _split_queries
)

In [14]:
# dependencies for multiquery
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the unique union of retrieved documents, in first-seen order.

    Args:
        documents: A list of result lists (one list per query).
    Returns:
        list: Each distinct document once, ordered by first appearance.
            Documents are compared through their ``dumps`` serialisation.
    """
    # Flatten the list of lists and serialise each document to a string so it is hashable.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping insertion order; a set() would
    # return the strings in an order that changes between interpreter runs.
    unique_docs = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_docs]

In [16]:
# Retrieve
question = "What is HPP?"
# Pipeline: expand the question into several queries, run the retriever on each
# query (retriever.map()), then merge the per-query results and de-duplicate them.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# One-off run of the retrieval pipeline for the sample question.
# NOTE(review): `docs` is not referenced again in the visible cells.
docs = retrieval_chain.invoke({"question":question})

In [17]:
# Full RAG pipeline: "context" is filled with the output of retrieval_chain and
# "question" is passed through unchanged from the input dict; both feed the
# answering prompt, then the LLM, then the string output parser.
# NOTE(review): the retrieved documents are handed to the prompt as returned by
# retrieval_chain (no explicit text formatting step) - confirm this is intended.
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | model
    | StrOutputParser()
)

In [18]:
# Questions to run through the full RAG chain.
questions = [
    "What is HPP?",
    "Can you give me a summary of HPP?",
]

# Print each question first, then the chain's answer for it.
for question in questions:
    print(f"Question: {question}")
    answer = final_rag_chain.invoke({"question": question})
    print(f"Answer: {answer}")

Question: What is HPP?
Answer: Based on the context, HPP stands for Hypophosphatasia, a genetic disorder characterized by impaired mineralization of bones and teeth.
Question: Can you give me a summary of HPP?
Answer: Based on the context provided, here is a summary of Hypophosphatasia (HPP):

* HPP is a rare genetic disorder caused by mutations in either the ALP (Akp2) or PHOSPHO1 gene.
* The main feature of HPP is impaired bone mineralization, leading to rickets and dental abnormalities.
* Two key enzymes involved in HPP are TNAP (alkaline phosphatase) and NPP1 (nucleoside triphosphate pyrophosphorylase 1).
* TNAP is essential for maintaining a proper concentration of phosphate (P) ions, which is necessary for bone mineralization.
* In the absence of TNAP, PP levels accumulate in the extracellular space, leading to HPP symptoms.
* NPP1 can act as a backup phosphatase in the absence of TNAP, but its activity is not sufficient to completely correct HPP symptoms.

Overall, HPP is charac