In [16]:
import os

# Add OpenAI library
import openai

# Get Configuration Settings
from dotenv import load_dotenv
load_dotenv()

True

In [17]:
openai.__version__

'0.28.1'

In [18]:
# Configure OpenAI API using Azure OpenAI
openai.api_key = os.getenv("API_KEY")
openai.api_base = os.getenv("ENDPOINT")
openai.api_type = "azure"  # Necessary for using the OpenAI library with Azure OpenAI
openai.api_version = "2024-02-01"  # Latest / target version of the API

In [19]:
from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

In [20]:
# OpenAI Settings
model_deployment = "text-embedding-ada-002"
# SDK calls this "engine", but naming it "deployment_name" for clarity

model_name = "text-embedding-ada-002"

In [21]:
openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    openai_api_version = os.getenv("OPENAI_API_VERSION"), openai_api_key = os.getenv("API_KEY"),
    openai_api_base = os.getenv("ENDPOINT"), openai_api_type = "azure"
)

## Add items to vector store

In [22]:
from langchain_chroma import Chroma

vector_store_ellipsis = Chroma(
    collection_name="Ellipsis-Care-Docs",
    embedding_function=openai_embeddings,
    persist_directory="/Users/mac/Documents/Ellipsis-Care/Ellipsis-Care-Chroma-Vector-DB",  # Where to save data locally, remove if not neccesary
)

In [62]:
import zipfile, pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

def upsert_pdf_content(file:str) -> Exception:
        """
        This method is responsible for upserting PDF content.
        It loads the PDF file, splits the content into chunks, and then upserts the chunks into VecDB.
        """
        loader = PyPDFLoader(file)
        data = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200, separators=["\n\n", "\n", "(?<=\. )", " ", ""])
        docs = text_splitter.split_documents(data)
        # print(docs)
        return docs

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

def load_and_process_pdfs(pdf_folder_path):
        """
        This method is responsible for upserting PDF content.
        It loads the PDF file, splits the content into chunks, and then upserts the chunks into VecDB.
        """
        documents = []
        for file in os.listdir(pdf_folder_path):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(pdf_folder_path, file)
                loader = PyPDFLoader(pdf_path)
                documents.extend(loader.load())
        
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
        docs = text_splitter.split_documents(documents)
        return docs

In [8]:
pdf_folder_path =  "/Users/mac/Documents/Ellipsis-Care/data"
splits = load_and_process_pdfs(pdf_folder_path)

In [9]:
print(splits[30].page_content)

awareness for the problem of chronic diseases? Is it to build support for the development of an action framework to prevent and control chronic diseases? Is it to promote the already developed action framework to prevent and control chronic diseases? It is important to identify specifically a goal and objectives for the advocating campaign. Establish a clear long-term goal and SMART (specific, measurable, achievable, relevant and timely) objectives at the beginning of your advocacy work.   Step 2: Identify target audiences In advocacy work, the two main audiences will usually be decision-makers and influencers. The more specific you are in identifying the audience, the more effective your communications will be.   A. Potential decision-makers: B. Potential influencers: 1. Government (ministries and parliament): • presidents and prime ministers • health ministers and their deputies • budgetary decision makers (e.g. ministers of finance) • ministers of related sectors and their deputies


In [10]:
splits[0]

Document(metadata={'source': '/Users/mac/Documents/Ellipsis-Care/data/Action Framework for the Prevention and Control of Chronic Disease.pdf', 'page': 0}, page_content='1  Draft version 4, 3 August 2006.          WHO Action Framework  for the  Prevention and Control  of  Chronic Diseases    - Core package -')

In [11]:
# for i in range(100):
#     print(splits[i])
#     print(i)
#     print("\n\n")

In [56]:
# uuids = [str(uuid4()) for _ in range(len(splits))]

In [57]:
len(splits)

2227

In [11]:
# from uuid import uuid4
# import time

# delay = 70  # Delay in seconds between batches

# for file in os.listdir(pdf_folder_path):
#     if file.endswith(".pdf"):
#         pdf_path = os.path.join(pdf_folder_path, file)
#         docs = upsert_pdf_content(pdf_path)
#         for chunk in docs:
#             doc_id = str(uuid4())
#             vector_store_ellipsis.add_documents(documents=[chunk], ids=[doc_id])
#             time.sleep(delay)

In [14]:
import time
from uuid import uuid4

batch_size = 100  # Adjust this batch size based on your rate limit
delay = 70  # Delay in seconds between batches

for i in range(0, len(splits), batch_size):
    batch = splits[i:i+batch_size]
    uuids = [str(uuid4()) for _ in range(len(batch))]
    print(f"Upserting {i} documents")
    # try:
    response = vector_store_ellipsis.add_documents(documents=batch, ids=uuids)
    #     print(f"Response: {response}")
    # except Exception as e:
    #     print(e)
    time.sleep(delay)  # Delay to prevent hitting rate limits


Upserting 0 documents


Upserting 100 documents
Upserting 200 documents
Upserting 300 documents
Upserting 400 documents
Upserting 500 documents
Upserting 600 documents
Upserting 700 documents
Upserting 800 documents
Upserting 900 documents
Upserting 1000 documents
Upserting 1100 documents
Upserting 1200 documents
Upserting 1300 documents
Upserting 1400 documents
Upserting 1500 documents
Upserting 1600 documents
Upserting 1700 documents


Retrying langchain_community.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


Upserting 1800 documents
Upserting 1900 documents
Upserting 2000 documents
Upserting 2100 documents
Upserting 2200 documents


## Testing RAG with Chroma DB

In [23]:
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory, ConversationBufferWindowMemory
from langchain import PromptTemplate

In [24]:
template = """
Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
------
<ctx>
{context}
</ctx>
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""
prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=template,
)

In [25]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.llms import OpenAI

In [26]:
llm = ChatOpenAI(temperature = 0.6, openai_api_key = os.getenv("API_KEY"), openai_api_base = os.getenv("ENDPOINT"), model_name="gpt-35-turbo", engine="Voicetask")

                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


In [27]:
retriever = vector_store_ellipsis.as_retriever(search_kwargs={'k': 5,})

In [28]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt,
        "memory": ConversationBufferWindowMemory(
            k = 10,
            memory_key="history",
            input_key="question"),
            }
    )

In [29]:
import langchain
langchain.verbose = True

In [30]:
query = "What are the major causees of cervical cancer??"
# response = qa_stuff.run(query)
print(qa_stuff.run(query))



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
------
<ctx>
worsensAssess likelihood for cervical cancer
 ■Assess signs and symptoms (i.e. history, intensity, duration, progression)
 ■Identify relevant risk factors: age (30 years old and above)
 ■Speculum examination
 ■Differential diagnosis: abortion in pre-menopausal women, infections (e.g. Chlamydiae, 
gonococcal, etc.), genital ulcers, cervical inflammation, uterine polyps, dysfunctional uterus 
hemorrhage, endometrial or vaginal cancerWomen who present the following persistent and unexplained signs and symptoms should seek consultation at a PHC:
a) Abnormal vaginal bleeding (i.e.