In [11]:
# Importing necessary libraries
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [12]:
# Load key=value pairs from a local .env file into the process environment,
# so the API keys can be read with os.getenv() instead of being hard-coded.
load_dotenv()


True

In [10]:
# Scratch check: display the kernel's current working directory (IPython automagic).
pwd

'c:\\Users\\marre\\Desktop\\Kth\\ID1214\\AI-Healthcare-Assistant\\research'

In [13]:
# Verify that the API keys needed by Pinecone and OpenAI are available.
# load_dotenv() has already placed any .env values into os.environ, so there is
# nothing to copy; assigning os.getenv(...) back into os.environ would only
# raise an opaque "str expected, not NoneType" TypeError when a key is missing.
# Check explicitly and fail with a message that names the missing key(s).
REQUIRED_ENV_VARS = ("PINECONE_API_KEY", "OPENAI_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Add them to your .env file or export them before running this notebook."
    )


In [6]:
# Scratch check: display the kernel's current working directory (IPython automagic).
pwd

'c:\\Users\\marre\\Desktop\\Kth\\ID1214\\Medical-Chatbot\\research'

In [None]:
def extract_text_from_pdfs(directory_path):
    """Load the PDF files found in ``directory_path``.

    A ``DirectoryLoader`` is configured to match ``*.pdf`` files and to read
    each match with ``PyPDFLoader``.

    Args:
        directory_path: Directory that contains the PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files
        (the loaded documents).
    """
    loader = DirectoryLoader(directory_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return loader.load()

def split_documents_into_chunks(documents, chunk_size=700, chunk_overlap=30):
    """Break loaded documents into smaller, overlapping chunks.

    Chunking keeps each piece small enough to embed and retrieve on its own,
    which improves context retrieval and helps stay within the language
    model's context window.

    Args:
        documents: Documents to split (as returned by a document loader).
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(documents)
    return chunks

# Load PDF documents from the data directory (path is relative to the
# notebook's working directory).
raw_documents = extract_text_from_pdfs("../Data")
# Fail fast: a wrong path or an empty folder yields no documents, which would
# otherwise only surface later as an empty vector store.
assert raw_documents, "No PDF documents were loaded from ../Data - check the path and its contents."

# Split documents into smaller chunks for better processing
text_chunks = split_documents_into_chunks(raw_documents)
assert text_chunks, "The loaded documents produced no text chunks - do the PDFs contain extractable text?"
print(f"Total text chunks: {len(text_chunks)}")

Total text chunks: 28923


In [None]:
def initialize_embeddings():
    """Create the embedding model that turns text into dense numeric vectors.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` sentence-transformer model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

# Instantiate the embedding model once; it is reused when the vector store is built.
embedding_model = initialize_embeddings()

In [None]:
# Only the standard (REST) Pinecone client is used here. Importing
# `PineconeGRPC as Pinecone` first would be shadowed by this import anyway and
# would fail the cell when the optional gRPC extra is not installed.
from pinecone import Pinecone, ServerlessSpec

# Client authenticated with the API key taken from the environment.
pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "ai-healthcare-assistant"

# Create the Pinecone index (the vector database used for similarity search)
# only when it does not exist yet: creating an index whose name is already
# taken is rejected by Pinecone, which would break re-running this cell.
if index_name not in pinecone_client.list_indexes().names():
    pinecone_client.create_index(
        name=index_name,
        dimension=384,  # The embedding vector has a dimension of 384.
        metric="cosine",  # Cosine measures the similarity between two vectors.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [None]:
from langchain_pinecone import PineconeVectorStore

# Vector indexing: embed every text chunk with the embedding model and store
# the vectors in the Pinecone index so they can be searched by similarity.
# NOTE(review): re-running this cell presumably uploads all chunks again and
# may leave duplicate vectors in the index - confirm before re-executing.
vector_store = PineconeVectorStore.from_documents(
    embedding=embedding_model,  # model used to embed each chunk
    index_name=index_name,      # target Pinecone index
    documents=text_chunks,      # chunks to embed and store
)


In [None]:
# Information retrieval: wrap the vector store in a retriever that runs a
# similarity search and hands back the 3 closest documents for each query.
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=3),
    search_type="similarity",
)


In [None]:
# OpenAI LLM wrapper used to generate the answers.
language_model = OpenAI(temperature=0.4, max_tokens=500)

# Prompt sent to the model: the system message carries the instructions plus
# the {context} placeholder, and the human message carries the user's {input}.
# Adjacent string literals are concatenated with no separator, so every
# fragment must end with its own punctuation and a trailing space.
template = ChatPromptTemplate.from_messages([
    ("system", (
        "You are a knowledgable AI assistant. "
        "Your task is to answer the user's question using only the information provided in the context. "
        "If the answer is not found in the context, tell the user 'I can not make sure what your disease is exactly.' "
        "Do NOT make up information or provide guesses. "
        "Keep responses limited to 3 clear and informative sentences, using plain language.\n\n{context}"
    )),
    ("human", "{input}")
])

In [None]:
# Answer-generation step: combines the retrieved documents with the user
# query by placing the retrieved content into the prompt context.
qa_chain = create_stuff_documents_chain(llm=language_model, prompt=template)

# Retrieval-Augmented Generation pipeline: retrieval feeds generation, so the
# LLM answers from the domain-specific documents stored in the vector index.
rag_pipeline = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qa_chain,
)

In [None]:
# Demo 1: a definitional question; only the generated answer text is printed.
user_query = "What is AIDS?"
result = rag_pipeline.invoke(dict(input=user_query))
print(result["answer"])

In [None]:
# Demo 2: a symptom description; only the generated answer text is printed.
user_query = "I have the following symptoms, what is my disease? Shortness of breath, a high temprature, chest pain, an aching body, loss of apetite, a cough, making wheezing noises when I breathe,"
result = rag_pipeline.invoke(dict(input=user_query))
print(result["answer"])