In [None]:
!pip install openai langchain faiss-cpu unstructured pdfplumber tiktoken
!pip install ragas datasets evaluate
!pip install sentence-transformers


Collecting openai
  Using cached openai-1.93.1-py3-none-any.whl.metadata (29 kB)
Collecting langchain
  Using cached langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting unstructured
  Using cached unstructured-0.18.3-py3-none-any.whl.metadata (24 kB)
Collecting pdfplumber
  Using cached pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting tiktoken
  Using cached tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.10.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting langchain-core<1.0.0,>=0.3.


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pdfplumber
import os

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Each page's text is followed by a single newline. A page whose
    ``extract_text()`` call yields nothing (``None`` or ``""``) contributes
    only that newline, instead of raising ``TypeError`` on ``None + str``.

    Args:
        file_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The page texts concatenated in page order, one trailing newline
        per page (an empty string for a PDF with no pages).
    """
    with pdfplumber.open(file_path) as pdf:
        # join() builds the result in one pass; `or ""` guards text-less pages.
        # The generator is fully consumed here, while the PDF is still open.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

# Read the guideline PDF (path is relative to the notebook's working directory)
# and show the first 1000 characters as a quick sanity check of the extraction.
medical_text = extract_text_from_pdf(file_path="medical_guideline.pdf")
print(medical_text[0:1000])


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: chunk_size=500 with a chunk_overlap of 50, so that
# neighbouring chunks share some text across the split boundary.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)

# Break the extracted guideline text into chunks for embedding.
chunks = text_splitter.split_text(medical_text)

# Preview the first three chunks.
print(chunks[0:3])


In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to vectorise every chunk.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed all chunks and index the resulting vectors in an in-memory FAISS store.
vectorstore = FAISS.from_texts(chunks, embedding=embedding_model)

# The number of documents to return must be supplied through `search_kwargs`;
# a bare `k=4` keyword on as_retriever() is not forwarded to the similarity
# search, so the requested value would not actually be applied.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model that writes the final answer; temperature 0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# Retrieval-QA chain wired to `retriever` and `llm`. With
# return_source_documents=True the chain output also carries the retrieved
# documents, which the evaluation step needs as contexts.
qa_chain = RetrievalQA.from_chain_type(
    return_source_documents=True,
    retriever=retriever,
    llm=llm,
)

def ask_medical_question(query):
    """Answer a question with the notebook's retrieval-QA chain.

    Args:
        query: The natural-language question to send to ``qa_chain``.

    Returns:
        tuple: ``(answer, source_documents)``, read from the ``"result"`` and
        ``"source_documents"`` entries of the chain output.
    """
    # Calling the chain object directly (Chain.__call__) is deprecated in
    # LangChain; invoke() is the supported entry point. "query" is the input
    # key RetrievalQA expects.
    result = qa_chain.invoke({"query": query})
    return result["result"], result["source_documents"]


In [None]:
from ragas.metrics import (
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy
)
from ragas import evaluate
from datasets import Dataset

# Run one sample question through the RAG pipeline and keep the raw text of
# each retrieved document as its evaluation context.
query = "What is the recommended dosage for metformin in type 2 diabetes?"
answer, source_docs = ask_medical_question(query)
contexts = [document.page_content for document in source_docs]

# One-row evaluation table: every column is a list holding a single sample.
data = dict(
    question=[query],
    contexts=[contexts],
    answer=[answer],
    ground_truth=["500 mg twice daily, based on ADA guidelines"],
)
ds = Dataset.from_dict(data)

# Score the sample on the four imported RAGAS metrics.
ragas_results = evaluate(
    dataset=ds,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

# Last expression of the cell: render the scores as a DataFrame.
ragas_results.to_pandas()
