In [4]:
import os

# Add OpenAI library
import openai

# Get Configuration Settings
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
openai.__version__

'0.28.1'

### Extracting PDF content with Azure Document Intelligence

In [6]:
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

In [27]:
def extract_contents_from_doc(files, dir="../extracted_data"):
    """
    Extract the text of each file with Azure Document Intelligence and save it as .txt.

    Every file is sent to the "prebuilt-read" model, the recognised lines of all
    pages are joined with newlines, and the text is written to
    ``<dir>/<base name of file.name>_extracted.txt`` (UTF-8).

    Args:
        files: Iterable of binary file-like objects. Each one must provide
            ``read()`` and a ``name`` attribute (used to build the output name).
        dir (str): Directory in which the extracted text files are stored.
            It is created when it does not exist yet.

    Returns:
        List of paths of the text files that were written. A file whose
        extraction or saving fails is logged and skipped, so the list can be
        shorter than ``files``. An empty list is returned (with a warning) when
        DOCUMENT_INTELLIGENCE_ENDPOINT or DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY
        is not set in the environment.
    """
    import logging  # local import keeps this cell runnable on its own

    logger = logging.getLogger(__name__)

    # Credentials for Azure Document Intelligence
    DI_ENDPOINT = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")
    DOCUMENT_INTELLIGENCE_KEY = os.getenv('DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY')

    if not DI_ENDPOINT or not DOCUMENT_INTELLIGENCE_KEY:
        logger.warning(
            "Document Intelligence endpoint/key not configured; nothing was extracted."
        )
        return []

    document_intelligence_client = DocumentAnalysisClient(
        endpoint=DI_ENDPOINT,
        credential=AzureKeyCredential(DOCUMENT_INTELLIGENCE_KEY)
    )

    # Without this, every write below fails on a fresh checkout and the
    # failure used to be swallowed silently.
    os.makedirs(dir, exist_ok=True)

    extracted_file_paths = []

    for file in files:
        try:
            file_content = file.read()

            # Perform content extraction using Azure's "prebuilt-read" model
            poller = document_intelligence_client.begin_analyze_document("prebuilt-read", file_content)
            result = poller.result()

            # One line of output per recognised line, across all pages
            extracted_content = "".join(
                f"{line.content}\n" for page in result.pages for line in page.lines
            )

            # Only the base name is kept, so files always land directly in `dir`
            base = os.path.splitext(os.path.basename(file.name))[0]
            file_path = os.path.join(dir, f"{base}_extracted.txt")  # .txt for easier reading

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(extracted_content)

            extracted_file_paths.append(file_path)

        except Exception as exc:
            # Best effort: keep going with the next file, but leave a trace of
            # what failed instead of dropping the error.
            logger.warning(
                "Skipping %s: extraction failed (%s)", getattr(file, "name", file), exc
            )
            continue

    return extracted_file_paths

In [28]:
# Run the extraction for every PDF in the data folder and show where the text was saved.
pdf_folder_path =  "../data"
for file in os.listdir(pdf_folder_path):
    if file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, file)
        # The context manager closes the PDF handle once it has been read
        with open(pdf_path, "rb") as pdf_file:
            path1 = extract_contents_from_doc([pdf_file])
        # The extractor returns an empty list when it could not process the file
        if path1:
            print(path1[0])
        else:
            print(f"No text extracted from {pdf_path}")
        

../extracted_data/Action Framework for the Prevention and Control of Chronic Disease_extracted.txt
../extracted_data/WHO_Package of Essential Noncommunicable (PEN) disease interventions for primary health care in low-resou_extracted.txt
../extracted_data/WHO_DIET, NUTRITION AND THE PREVENTION OF CHRONIC DISEASES_extracted.txt
../extracted_data/WHO model list of essential medicines_extracted.txt
../extracted_data/GUIDELINES FOR THE PREVENTION, CARE AND TREATMENT OF PERSONS WITH CHRONIC HEPATITIS B INFECTION_extracted.txt


### Vector Store Setup

In [29]:
# Configure the OpenAI SDK (module-level settings) to talk to Azure OpenAI
openai.api_key = os.getenv("API_KEY")
openai.api_base = os.getenv("ENDPOINT")
openai.api_type = "azure"  # Necessary for using the OpenAI library with Azure OpenAI
# Read the API version from the same environment variable the embeddings
# client uses, so the two cannot drift apart; fall back to the version that
# was previously hard-coded here.
openai.api_version = os.getenv("OPENAI_API_VERSION", "2024-02-01")

In [30]:
from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

In [31]:
# Azure OpenAI embedding settings
model_deployment = "text-embedding-ada-002"
# Deployment name (the SDK calls this "engine"); presumably the name of the Azure deployment — verify

# Embedding model name
model_name = "text-embedding-ada-002"

In [32]:
# Embedding client for Azure OpenAI. The deployment and model are passed
# explicitly from the settings cell instead of relying on the class defaults.
# NOTE(review): constructing this emits a deprecation warning; AzureOpenAIEmbeddings
# (already imported) looks like the intended replacement — confirm against the
# installed langchain/openai versions before switching.
openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    deployment=model_deployment,
    model=model_name,
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    openai_api_key=os.getenv("API_KEY"),
    openai_api_base=os.getenv("ENDPOINT"),
    openai_api_type="azure",
)

  warn_deprecated(


## Add items to vector store

In [33]:
from langchain_chroma import Chroma

# Chroma collection that stores the document chunks, embedded with `openai_embeddings`
vector_store_ellipsis = Chroma(
    collection_name="Ellipsis-Care-Docs",
    embedding_function=openai_embeddings,
    persist_directory="../Ellipsis-Care-Chroma-Vector-DB",  # Where to save data locally; remove if not necessary
)

In [34]:
import zipfile, pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

def upsert_pdf_content(file: str) -> list:
    """
    Load a PDF and split its text into overlapping chunks.

    Despite its name, this function does not write to the vector store; it
    only returns the chunks so the caller can add them.

    Args:
        file: Path of the PDF file to load.

    Returns:
        List of chunk documents produced by the text splitter.
    """
    pages = PyPDFLoader(file).load()
    # r"(?<=\. )" is a regular expression (split right after a sentence-ending
    # ". "). The splitter only interprets separators as regexes when
    # is_separator_regex=True; otherwise the pattern is matched as literal
    # text and never applies. The raw string also avoids the invalid "\."
    # escape sequence.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=200,
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        is_separator_regex=True,
    )
    return text_splitter.split_documents(pages)

In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader

def load_and_process_txts(txt_folder_path):
    """
    Load every .txt file in a folder and split the text into overlapping chunks.

    Nothing is written to the vector store here; the chunks are returned so
    the caller can add them.

    Args:
        txt_folder_path: Directory that is scanned (non-recursively) for
            files whose name ends with ".txt".

    Returns:
        List of chunk documents produced by the text splitter
        (chunk_size=1000, chunk_overlap=300).
    """
    documents = []
    # os.listdir returns names in arbitrary order; sorting keeps the chunk
    # order (and therefore indices such as splits[30]) reproducible.
    for file_name in sorted(os.listdir(txt_folder_path)):
        if not file_name.endswith(".txt"):
            continue
        txt_path = os.path.join(txt_folder_path, file_name)
        # The extracted files are written as UTF-8, so read them back the same
        # way instead of relying on the platform default encoding.
        documents.extend(TextLoader(txt_path, encoding="utf-8").load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=300, separators=["\n", " ", "?", ".", "!"]
    )
    return text_splitter.split_documents(documents)

In [41]:
# Chunk every extracted .txt file; `splits` is what gets added to the vector store later
txt_folder_path =  "../extracted_data"
splits = load_and_process_txts(txt_folder_path)

In [42]:
len(splits)

2096

In [43]:
print(splits[30].page_content)

U/L for women, although local laboratory normal ranges should be
applied. Persistently abnormal or normal may be defined as three
ALT determinations above or below the upper limit of normal,
made at unspecified intervals during a 6-12-month period or
predefined intervals during a 12-month period.
ASSESSMENT OF LIVER FIBROSIS BY NON-INVASIVE TESTS
APRI
FIB-4
FibroTest (FibroSure)
Commercial biomarker test that uses the results of six blood
markers to estimate hepatic fibrosis
Transient elastography
(FibroScan)
Aspartate aminotransferase (AST)-to-platelet ratio index (APRI) is
a simple index for estimating hepatic fibrosis based on a formula
derived from AST and platelet concentrations.
A formula for calculating the APRI is given: APRI = * (AST/ULN) x
100) / platelet count (109/L). An online calculator can be found at:
http://www.hepatitisc.uw.edu/page/clinical-calculators/apri
A simple index for estimating hepatic fibrosis based on a calculation


In [44]:
splits[30]

Document(metadata={'source': '../extracted_data/GUIDELINES FOR THE PREVENTION, CARE AND TREATMENT OF PERSONS WITH CHRONIC HEPATITIS B INFECTION_extracted.txt'}, page_content='U/L for women, although local laboratory normal ranges should be\napplied. Persistently abnormal or normal may be defined as three\nALT determinations above or below the upper limit of normal,\nmade at unspecified intervals during a 6-12-month period or\npredefined intervals during a 12-month period.\nASSESSMENT OF LIVER FIBROSIS BY NON-INVASIVE TESTS\nAPRI\nFIB-4\nFibroTest (FibroSure)\nCommercial biomarker test that uses the results of six blood\nmarkers to estimate hepatic fibrosis\nTransient elastography\n(FibroScan)\nAspartate aminotransferase (AST)-to-platelet ratio index (APRI) is\na simple index for estimating hepatic fibrosis based on a formula\nderived from AST and platelet concentrations.\nA formula for calculating the APRI is given: APRI = * (AST/ULN) x\n100) / platelet count (109/L). An online calcula

In [11]:
# for i in range(100):
#     print(splits[i])
#     print(i)
#     print("\n\n")

In [56]:
# uuids = [str(uuid4()) for _ in range(len(splits))]

In [45]:
len(splits)

2096

In [11]:
# from uuid import uuid4
# import time

# delay = 70  # Delay in seconds between batches

# for file in os.listdir(pdf_folder_path):
#     if file.endswith(".pdf"):
#         pdf_path = os.path.join(pdf_folder_path, file)
#         docs = upsert_pdf_content(pdf_path)
#         for chunk in docs:
#             doc_id = str(uuid4())
#             vector_store_ellipsis.add_documents(documents=[chunk], ids=[doc_id])
#             time.sleep(delay)

In [46]:
import time
from uuid import uuid4

batch_size = 100  # Adjust this batch size based on your rate limit
delay = 70  # Delay in seconds between batches

# Add the chunks to the vector store in batches, pausing between batches to
# stay under the embedding rate limit.
# NOTE(review): ids are random, so re-running this cell presumably adds the same
# chunks again under new ids — confirm before running it twice against the
# persisted store.
for i in range(0, len(splits), batch_size):
    batch = splits[i:i+batch_size]
    uuids = [str(uuid4()) for _ in batch]
    print(f"Upserting documents {i} to {i + len(batch) - 1} of {len(splits)}")
    vector_store_ellipsis.add_documents(documents=batch, ids=uuids)
    # Only pause when another batch follows; sleeping after the last batch
    # just wastes `delay` seconds.
    if i + batch_size < len(splits):
        time.sleep(delay)


Upserting 0 documents
Upserting 100 documents
Upserting 200 documents
Upserting 300 documents
Upserting 400 documents
Upserting 500 documents
Upserting 600 documents
Upserting 700 documents
Upserting 800 documents
Upserting 900 documents
Upserting 1000 documents
Upserting 1100 documents
Upserting 1200 documents
Upserting 1300 documents
Upserting 1400 documents
Upserting 1500 documents
Upserting 1600 documents
Upserting 1700 documents
Upserting 1800 documents
Upserting 1900 documents
Upserting 2000 documents


## Testing RAG with Chroma DB

In [47]:
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory, ConversationBufferWindowMemory
from langchain import PromptTemplate

In [48]:
# Prompt text for the QA chain: {context} is wrapped in <ctx>, {history} in <hs>,
# followed by the {question}. The model is told to admit when it does not know.
template = """
Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
------
<ctx>
{context}
</ctx>
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""
# input_variables lists the three placeholders that appear in `template`
prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=template,
)

In [49]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.llms import OpenAI

In [50]:
# Chat model used by the QA chain.
# NOTE(review): langchain warns at construction that `engine` was transferred to
# model_kwargs — confirm that "Voicetask" is the intended deployment name.
llm = ChatOpenAI(temperature = 0.6, openai_api_key = os.getenv("API_KEY"), openai_api_base = os.getenv("ENDPOINT"), model_name="gpt-35-turbo", engine="Voicetask")

  warn_deprecated(
                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


In [51]:
# Retriever over the Chroma store; k=5 is passed as the search argument
# (presumably the number of chunks returned per query — verify).
retriever = vector_store_ellipsis.as_retriever(search_kwargs={'k': 5,})

In [52]:
# Retrieval QA chain of type "stuff", wired to the custom prompt and a
# windowed conversation memory (k=10) stored under the "history" key.
qa_stuff = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs=dict(
        prompt=prompt,
        memory=ConversationBufferWindowMemory(
            input_key="question",
            memory_key="history",
            k=10,
        ),
        verbose=True,
    ),
    verbose=True,
)

In [53]:
import langchain
langchain.verbose = True

In [54]:
# Ask a sample question. The query text is embedded for retrieval, so the
# spelling matters.
query = "What are the major causes of cervical cancer?"
# Chain.run is deprecated; invoke() takes the chain's input dict and returns a
# dict whose "result" key holds the answer text.
print(qa_stuff.invoke({"query": query})["result"])

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
------
<ctx>
signs of cervical cancer (Shapley et al., 2006; Sarkar et al., 2010;
Ikechebelu et al., 2010). These signs may be associated with early
stages of invasive cervical cancer, particularly in women above the
age of 30 years. However, abnormal vaginal bleeding in sexually active
women is more frequently caused by abortion (in pre-menopausal
women) and benign conditions such as cervical infections (including
gonorrhoea and chlamydiae) ulceration due to cervical inflammatory
disease, uterine polyps, and dysfunctional uterine bleeding due to
hormonal imbalance. Similarly, persistent,