# RAG testing

In [1]:
import streamlit as st

# langchain
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.chains import MultiRetrievalQAChain

# streaming
from langchain_community.callbacks import StreamlitCallbackHandler

# RAG
from langchain_community.document_loaders import PyPDFDirectoryLoader # or use Unstructured - UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# os
import os

from dotenv import load_dotenv
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

# Helper Functions

In [2]:
def get_loaders(use_patient=True, patient="p1", use_kb=True, kb="cardiovascular"):
    loaders = []
    if use_patient:
        patient_loader = PyPDFDirectoryLoader(f"../patients/{patient}/")
        loaders.append(patient_loader)
    if use_kb:
        kb_loader = PyPDFDirectoryLoader(f"../knowledge_bases/{kb}/")
        loaders.append(kb_loader)

    return loaders

In [2]:
def get_retrievers(use_patient=True, patient="p1", use_kb=True, kb="cardiovascular"):
    retrievers = []

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)

    embeddings = HuggingFaceEmbeddings(
        model_name="thenlper/gte-large",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    if use_patient:
        patient_loader = PyPDFDirectoryLoader(f"../patients/{patient}/")
        documents = patient_loader.load()
        texts = text_splitter.split_documents(documents)
        pat_db = FAISS.from_documents(texts, embeddings)
        retrievers.append({
            "name": "Patient Chart",
            "description": "Good for answering questions about patient-specific data",
            "retriever": pat_db.as_retriever()
        })

    if use_kb:
        kb_loader = PyPDFDirectoryLoader(f"../knowledge_bases/{kb}/")
        documents = kb_loader.load()
        texts = text_splitter.split_documents(documents)
        kb_db = FAISS.from_documents(texts, embeddings)
        retrievers.append({
            "name": "Knowledge Base",
            "description": "Good for answering questions about disease/diagnosis specific information from knowledge base",
            "retriever": kb_db.as_retriever()
        })

    if len(retrievers) == 0:
        return None
    
    return retrievers

In [3]:
def get_llm(model_selected="gpt-3.5-turbo"):
    client = ChatOpenAI(model_name=model_selected,
                        temperature=1.0,
                        openai_api_key=openai_api_key)
    
    return client

# Testing Implementations

In [4]:
retrievers = get_retrievers(use_kb=False, use_patient=True)

llm = get_llm()

qa = MultiRetrievalQAChain.from_retrievers(
    llm=llm,
    retriever_infos=retrievers
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
qa.invoke("What can you tell me about the patient?")



{'input': 'What can you tell me about the patient?',
 'query': 'What information are you looking for about the patient?',
 'result': "The user is asking for information about the patient's medical conditions, medications, and plan for treatment."}

In [31]:
messages = [{"role": "assistant", "content": "How may I help you?"},
            {"role": "user", "content": "What can you tell me about the current patient? Main concerns, diagnoses, etc.?"}]

def st_messages_to_lc_messages(st_messages):
    lc_messages = []
    for message in st_messages:
        if message["role"] == "user":
            lc_messages.append(
                HumanMessage(content=message["content"])
            )
        elif message["role"] == "assistant":
            lc_messages.append(
                AIMessage(content=message["content"])
            )
        elif message["role"] == "system":
            lc_messages.append(
                SystemMessage(content=message["content"])
            )

    return lc_messages

In [32]:
st_messages_to_lc_messages(messages)

[AIMessage(content='How may I help you?'),
 HumanMessage(content='What is the capital of France?')]

In [35]:
response = qa.invoke(messages)



In [37]:
response['result']

'The capital of France is Paris. It is a beautiful city known for its rich history, stunning architecture, and vibrant culture. Paris is home to iconic landmarks such as the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral. It is also considered a global center for art, fashion, and cuisine.'

In [40]:
response = llm.invoke(messages)

In [45]:
response.content

'The capital of France is Paris.'

: 

In [3]:
loaders = get_loaders(use_patient=True, use_kb=False)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)

embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

print(len(loaders))

index = VectorstoreIndexCreator(
    vectorstore_cls=FAISS,
    embedding=embeddings,
    text_splitter=text_splitter
).from_loaders(loaders)

  from .autonotebook import tqdm as notebook_tqdm


1


In [5]:
index.query("chief complaint")

" The chief complaint is the main reason for the patient's visit, which in this case is chest pain on exertion, shortness of breath, and occasional palpitations."

In [7]:
llm = get_llm()

In [9]:
qa = RetrievalQA(llm=llm, chain_type="stuff", retriever=index)

ValidationError: 4 validation errors for RetrievalQA
combine_documents_chain
  field required (type=value_error.missing)
retriever
  Can't instantiate abstract class BaseRetriever with abstract method _get_relevant_documents (type=type_error)
chain_type
  extra fields not permitted (type=value_error.extra)
llm
  extra fields not permitted (type=value_error.extra)

In [11]:
help(index)

Help on VectorStoreIndexWrapper in module langchain.indexes.vectorstore object:

class VectorStoreIndexWrapper(pydantic.v1.main.BaseModel)
 |  VectorStoreIndexWrapper(*, vectorstore: langchain_core.vectorstores.VectorStore) -> None
 |  
 |  Wrapper around a vectorstore for easy access.
 |  
 |  Method resolution order:
 |      VectorStoreIndexWrapper
 |      pydantic.v1.main.BaseModel
 |      pydantic.v1.utils.Representation
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  query(self, question: str, llm: Optional[langchain_core.language_models.base.BaseLanguageModel] = None, retriever_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str
 |      Query the vectorstore.
 |  
 |  query_with_sources(self, question: str, llm: Optional[langchain_core.language_models.base.BaseLanguageModel] = None, retriever_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any) -> dict
 |      Query the vectorstore and get back sources.
 |  
 |  --------------------------------------

In [12]:
help(PyPDFDirectoryLoader)

Help on class PyPDFDirectoryLoader in module langchain_community.document_loaders.pdf:

class PyPDFDirectoryLoader(langchain_community.document_loaders.base.BaseLoader)
 |  PyPDFDirectoryLoader(path: str, glob: str = '**/[!.]*.pdf', silent_errors: bool = False, load_hidden: bool = False, recursive: bool = False, extract_images: bool = False)
 |  
 |  Load a directory with `PDF` files using `pypdf` and chunks at character level.
 |  
 |  Loader also stores page numbers in metadata.
 |  
 |  Method resolution order:
 |      PyPDFDirectoryLoader
 |      langchain_community.document_loaders.base.BaseLoader
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path: str, glob: str = '**/[!.]*.pdf', silent_errors: bool = False, load_hidden: bool = False, recursive: bool = False, extract_images: bool = False)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  load(self) -> List[langchain_core.documents.base.Document]
 |    

# streamlit testing

In [4]:
import streamlit as st
from langchain.llms import OpenAI

st.title('🦜🔗 Quickstart App')

openai_api_key = st.sidebar.text_input('OpenAI API Key', type='password')

def generate_response(input_text):
    llm = OpenAI(temperature=0.7, openai_api_key=openai_api_key)
    st.info(llm(input_text))

with st.form('my_form'):
    text = st.text_area('Enter text:', 'What are the three key pieces of advice for learning how to code?')
    submitted = st.form_submit_button('Submit')
    if not openai_api_key.startswith('sk-'):
        st.warning('Please enter your OpenAI API key!', icon='⚠')
    if submitted and openai_api_key.startswith('sk-'):
        generate_response(text)

2024-02-07 12:11:18.637 
  command:

    streamlit run C:\Users\kolton.hauck\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py [ARGUMENTS]
