In [22]:
import os
import glob
from dotenv import load_dotenv

load_dotenv()

MODEL = "llama3"

In [23]:
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from operator import itemgetter

# TODO: Using Memory search for now. Good idea to switch to vector database in future
from langchain_community.vectorstores import DocArrayInMemorySearch, Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader


In [36]:
model = Ollama(model=MODEL)
ollama_embeddings = OllamaEmbeddings(model=MODEL)
parser = StrOutputParser()

# Langsmith

In [9]:
# from langsmith import Client
# os.environ['LANGCHAIN_TRACING_V2'] = 'true' # enables tracing 
# os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
# os.environ['LANGCHAIN_PROJECT'] = 'Test'
# client = Client()

In [10]:
# model.invoke(f"hello world")
# https://smith.langchain.com/o/a2673a5a-673b-57f0-a982-89ff7985f78f/projects/p/65623a8d-8bd5-40f6-9278-25a5bf53a021?timeModel=%7B%22duration%22%3A%227d%22%7D

# Loading PDFs to create vectorstore

In [44]:
def num_tokens_from_string(model, string: str) -> int:
    """Returns the number of tokens in a text string."""
    return model.get_num_tokens(string)

In [34]:
f = r"C:\Users\TechD\Documents\Marcus\LLMs\Rarediseases2024\data\HPP Papers (PDF) Via Liezl Puzon\10.1002___ajmg.a.33146.pdf"
loader = PyPDFLoader(f)
docs = loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
# chunks = text_splitter.split_text(docs)
RecursiveCharacterTextSplitter
docs_ = []
docs_.extend([*docs, *docs])
doc_text = [d.page_content for d in docs_]
doc_text

['Severe Cleidocranial dysplasia and Hypophosphatasia in a child\nwith microdeletion of the C-terminal region of RUNX2\nAreeg H. El-Gharbawy1,2, Joseph N. Peeden3, Ralph S. Lachman2,4, John M. Graham2,4,\nStephen R. Moore2, and David L. Rimoin2,4\n1Faculty of Medicine Cairo University, LA, California\n2Medical Genetics Institute, Cedars-Sinai Medical Center, LA, California\n3East Tennessee Children’s Hospital, Los Angeles California\n4David Geffen School of Medicine at UCLA, Los Angeles California\nAbstract\nCleidocranial dysplasia (CCD) is a rare autosomal dominant skeletal dysplasia due to mutations\ncausing haploinsufficiency of RUNX2 , an osteoblast transcription factor specific for bone and\ncartilage. The classic form of CCD is characterized by delayed closure of the fontanels,\nhypoplastic or aplastic clavicles and dental anomalies. Clinical reports suggest that a subset of\npatients with CCD have skeletal changes which mimic hypophosphatasia. Mutations in RUNX2\nare detected in

In [25]:
def is_valid_pdf(file_path):
    try:
        reader = PdfReader(file_path)
        reader.pages[0]  # Try accessing the first page
        return True
    except Exception:
        return False

def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    return loader.load()

def process_pdfs(pdf_paths):
    all_chunks = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    for path in pdf_paths:
        if is_valid_pdf(path):
            documents = load_pdf(path)
            chunks = text_splitter.split_text(documents)
            all_chunks.extend(chunks)
        else:
            print(f"Invalid PDF file: {path}")
    return all_chunks
def get_pdf_paths(directory):
  """  This function recursively searches a directory and its subdirectories for PDF files and returns a list of their full paths."""
  pdf_paths = []
  for root, _, files in os.walk(directory):
    for filename in files:
      if filename.lower().endswith(".pdf"):  # Ensure case-insensitive matching
        pdf_paths.append(os.path.join(root, filename))
  return pdf_paths

In [26]:
# Directory containing PDFs
pdf_directory = r"C:\Users\TechD\Documents\Marcus\LLMs\Rarediseases2024\data"
pdf_paths = get_pdf_paths(pdf_directory)
pdf_chunks = process_pdfs(pdf_paths)

TypeError: expected string or bytes-like object

In [35]:
# Print a few chunks to verify
for chunk in pdf_chunks[:1]:
    print(chunk)

Severe Cleidocranial dysplasia and Hypophosphatasia in a child
with microdeletion of the C-terminal region of RUNX2
Areeg H. El-Gharbawy1,2, Joseph N. Peeden3, Ralph S. Lachman2,4, John M. Graham2,4,
Stephen R. Moore2, and David L. Rimoin2,4
1Faculty of Medicine Cairo University, LA, California


In [12]:
def load_multiple_pdfs(pdf_paths):
  """
  Loads multiple PDF files into LangChain for question answering.
  Args:
      pdf_paths (list): A list of full paths to the PDF files.
  Returns:
      langchain.RetrievalQA: A LangChain model for retrieving information
          from the loaded PDFs.
  """

  documents = []
  embeddings = []

  # Load each PDF and extract text
  for path in pdf_paths:
    loader = PyPDFLoader(path)
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

    # for page in pages:
    text = text_splitter.split_documents(pages)  # Split page text
    documents.append(text)  # Store document text
  return documents

  # Create embeddings (replace with your preferred method)
  ollama_embeddings = OllamaEmbeddings(model=MODEL)
  vectorstore = DocArrayInMemorySearch.from_documents(pages, embedding=ollama_embeddings)
  return vectorstore
# vectorstore = load_multiple_pdfs(pdfs) # TODO: embeddings and model as parameters

In [13]:
# vectorstore = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)
documents = load_multiple_pdfs(pdfs)
documents[:1]

invalid pdf header: b'<?xml'
EOF marker not found


PdfStreamError: Stream has ended unexpectedly

# Using pinecone

In [None]:
import logging
import sys
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from pinecone import Pinecone, ServerlessSpec
pc_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pc_api_key)
pc.create_index(
    name="rarediseases2024",
    dimension=4096,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [None]:

retriever = vectorstore.as_retriever()# Using pinecone

# Creating Prompts and Chain

In [None]:
# Input template
template = """Answer the question based on the context below. If you can't answer the question, reply "I don't know".
Context: {context}
Question: {question}"""
prompt = PromptTemplate.from_template(template = template)
prompt.format(context="Here is some context", question="Here is a question")

# Generate perspectives from question
prompt_perspectives_template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = PromptTemplate.from_template(template = prompt_perspectives_template)



In [None]:
generate_queries = (
    prompt_perspectives 
    | model
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [None]:
# dependencies for multiquery
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

In [None]:
# Retrieve
question = "What is HPP?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})

In [None]:
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
questions = [
    # Input questions
    "What is HPP?",
    "Can you give me a summary of HPP?"
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {final_rag_chain.invoke({'question':question})}")