In [None]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [2]:
load_dotenv()


True

In [3]:
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")


In [6]:
pwd

'c:\\Users\\marre\\Desktop\\Kth\\ID1214\\Medical-Chatbot\\research'

In [7]:
def extract_text_from_pdfs(directory_path):
    pdf_loader = DirectoryLoader(directory_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = pdf_loader.load()
    return documents

def split_documents_into_chunks(documents, chunk_size=500, chunk_overlap=20):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)

raw_documents = extract_text_from_pdfs("../Data")
text_chunks = split_documents_into_chunks(raw_documents)
print(f"Total text chunks: {len(text_chunks)}")

Total text chunks: 39994


In [8]:
def initialize_embeddings():
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

embedding_model = initialize_embeddings()

In [12]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import os

pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "medichatbot"
pinecone_client.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)


In [14]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embedding_model
)


In [15]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})


In [16]:
language_model = OpenAI(temperature=0.4, max_tokens=500)

template = ChatPromptTemplate.from_messages([
    ("system", (
        "You are a helpful assistant. Use the context provided to answer questions. "
        "If unsure, say so. Limit answers to 3 concise sentences.\n\n{context}"
    )),
    ("human", "{input}")
])

In [17]:
qa_chain = create_stuff_documents_chain(language_model, template)
rag_pipeline = create_retrieval_chain(retriever, qa_chain)

In [18]:
user_query = "Give me some symptoms for AIDS."
result = rag_pipeline.invoke({"input": user_query})
print(result["answer"])



Some common symptoms of AIDS include low-grade fevers, chronic fatigue, general weakness, loss of appetite, weight loss, diarrhea, mouth infections, and nerve damage. Late-stage AIDS is characterized by a low number of CD4+ lymphocytes and an increased risk of infections and other complications.


In [20]:
user_query = "I have the following symptoms, what is my disease? Shortness of breath, a high temprature, chest pain, an aching body, loss of apetite, a cough, making wheezing noises when I breathe."
result = rag_pipeline.invoke({"input": user_query})
print(result["answer"])



System: Based on your symptoms, it is possible that you have a lung infection, such as bronchitis or pneumonia. It is important to see a doctor for a proper diagnosis and treatment.
