In [23]:
# Setup environment and imports
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
os.chdir('../')

print(f"Working directory: {os.getcwd()}")


Working directory: /home/macowen/Desktop/projects


In [None]:
# Import all required modules
import sys
from pathlib import Path

# Add src to path for imports
sys.path.insert(0, str(Path.cwd() / 'src'))

from src.enums import QuestionTheme, ModelType, ResponseSource
from src.models import MedicalAnswer, ThemeDetectionResponse, VectorSearchResult
from src.prompts import PromptTemplates
from src.logger import LoggerSetup
from src.vector_utils import (
    DocumentLoader, DocumentSplitter, EmbeddingManager, 
    VectorStore, VectorSearch
)
from src.model_utils import ModelManager, ThemeDetector, ResponseGenerator
from src.rag_pipeline import MedicalRAGPipeline

# Setup logging
logger = LoggerSetup.setup_logger(__name__)
logger.info("All imports successful")


In [None]:
# Display available models
print("Available Ollama Models for Medical RAG:")
print(f"- Theme Detector: {ModelType.THEME_DETECTOR.value}")
print(f"- Main Generator: {ModelType.MAIN_GENERATOR.value}")
print(f"- Embedding Model: {ModelType.EMBEDDING.value}")

print("\nAvailable Question Themes:")
for theme in QuestionTheme:
    print(f"- {theme.value}: {theme.name}")


In [2]:
%pwd

'/home/macowen/Desktop/projects/mediAi/research'

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
os.chdir('../')

In [2]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_pdf_docs(directory_path):
    loader = DirectoryLoader(directory_path, glob="**/*.pdf", loader_cls=PyPDFLoader)
    
    documents = loader.load()
    return documents

In [4]:
%pwd

'/home/macowen/Desktop/projects/mediAi'

In [5]:
extracted_data = load_pdf_docs('./data/')

In [6]:
len(extracted_data)


2510

In [7]:
from typing import List
from langchain_core.documents import Document

def split_documents(documents: List[Document], chunk_size: int = 1000, chunk_overlap:int =200) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)


def filter_short_documents(documents: List[Document]) -> List[Document]:
    filtered_docs : List[Document] = []
    for doc in documents:
        src = doc.metadata.get('source', 'Unknown')
        filtered_docs.append(
            Document(page_content=doc.page_content, metadata={'source': src, 'author':doc.metadata.get('author', 'Unknown')})
        )
    return filtered_docs

In [8]:
filtered_docs = filter_short_documents(extracted_data)
filtered_docs[0]

Document(metadata={'source': 'data/411skeletal.pdf', 'author': 'Kimberlee Hart'}, page_content='Introduction to Anatomy: The Skeletal System\nWelcome\nIntroduction\nThe Skeletal System\nShapes of Bones\nSkull Bones\nVertebrae (Spine) 1\nVertebrae (Spine) 2\nVertebrae (Spine) 3\nUpper Limb Bones\nHip (Coxal) Bones\nLower Limb Bones\nQuiz 1\nQuiz Answer Keys\nIndex: Bones by Shape\nAcknowledgements\nQuiz 2\nIntroduction to Anatomy:\nThe Skeletal System\nWelcome')

In [9]:
splitted_docs = split_documents(filtered_docs)
len(splitted_docs)


10666

In [10]:
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(
    model="nomic-embed-text:latest"
)

In [11]:
from pinecone import Pinecone, ServerlessSpec

# Ensure your key is loaded from .env
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "mediai-bot"

# Check if index exists
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,  # Adjust based on your embedding model's output dimension
        metric="cosine", # or "euclidean", "dotproduct" based on your needs
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"  # REQUIRED for Free Tier
        )
    )
    print(f"Index {index_name} created successfully.")
else:
    print(f"Index {index_name} already exists.")

Index mediai-bot already exists.


In [12]:
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore.from_documents(documents=splitted_docs, embedding=embeddings, index_name=index_name)


In [13]:
splitted_docs[0]

Document(metadata={'source': 'data/411skeletal.pdf', 'author': 'Kimberlee Hart', 'text': 'Introduction to Anatomy: The Skeletal System\nWelcome\nIntroduction\nThe Skeletal System\nShapes of Bones\nSkull Bones\nVertebrae (Spine) 1\nVertebrae (Spine) 2\nVertebrae (Spine) 3\nUpper Limb Bones\nHip (Coxal) Bones\nLower Limb Bones\nQuiz 1\nQuiz Answer Keys\nIndex: Bones by Shape\nAcknowledgements\nQuiz 2\nIntroduction to Anatomy:\nThe Skeletal System\nWelcome'}, page_content='Introduction to Anatomy: The Skeletal System\nWelcome\nIntroduction\nThe Skeletal System\nShapes of Bones\nSkull Bones\nVertebrae (Spine) 1\nVertebrae (Spine) 2\nVertebrae (Spine) 3\nUpper Limb Bones\nHip (Coxal) Bones\nLower Limb Bones\nQuiz 1\nQuiz Answer Keys\nIndex: Bones by Shape\nAcknowledgements\nQuiz 2\nIntroduction to Anatomy:\nThe Skeletal System\nWelcome')

In [16]:
# load exisring index
from langchain_pinecone import PineconeVectorStore
vectorstore= PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=index_name)


In [15]:
# # store more documents
# new_docs = load_pdf_docs('./data/additional/')
# new_filtered_docs = filter_short_documents(new_docs)
# new_splitted_docs = split_documents(new_filtered_docs)
# vectorstore.add_documents(new_splitted_docs)

['4da9302f-0965-4a54-9fca-f065a364cb97',
 '30bd315b-9353-4721-86d6-949e08bf5ec8',
 '0fd3c3b7-8576-49e3-b089-772d0b77dd13',
 'd32ed83d-8c16-46fb-98a7-1385ff80a2fa',
 '3e055437-97fd-4800-a99a-6ad7512024a9',
 'eef8cc34-072d-4467-b8b4-1cc869800bf7',
 '14614fe7-deda-4764-89b1-45fa4c618508',
 'f2e7bc4b-5366-4198-9f4c-0d706d05506c',
 '3b791e52-416a-46cb-b87e-9ceedfec4721',
 'c87084ff-e067-46cd-9a0c-5b4116776b1a',
 'c23b6dd4-fab1-4d86-93b0-2392ba236b9f',
 '2ebd3618-f93a-4465-93e4-5d87d04d65f9',
 'f2ed0507-9dbd-403d-8944-367236b18511',
 'a095fceb-59e0-4dc8-a9bf-7064503d4ce1',
 '8debb9f7-d8a9-4f4f-9ddf-e5a93a6fea79',
 '63bb577a-245c-477b-a0d6-ae5a614f5f7f',
 '8840a67f-9afa-4002-a6ef-b4c72d20c963',
 '33bb5ea9-fbe9-44bb-ab88-b13cfdba7865',
 '39c8a158-0282-4203-a0f4-db298ddf3f48',
 '586e992e-94e3-4e33-8fbf-d13661c51f7b',
 'cca7da98-b26d-4497-9d37-0c3371bc4286',
 'f6180944-b842-46fb-8543-8e7fe7dd049c',
 'e21eef1b-1067-4461-a0d3-4404c5828986',
 '3f514dc1-2319-4ba9-a8ce-53f2e7684115',
 'efe93fa8-ac14-

In [17]:
retreiver = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [18]:
retreived_docs = retreiver.invoke("What is CVA or stroke")

In [21]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List
import textwrap

model = ChatOllama(model="deepseek-v3.1:671b-cloud", temperature=0.3, format="json")

In [20]:
system_prompt ="""
You are a helpful medical assistant that answers health related questions and the body anatomy and physiology. Use the context provided to give accurate and concise answers. If the context does not contain the answer, respond with "I don't know".
"""

In [None]:
class QAResponse(BaseModel):
    answer: str = Field(..., description="The answer to the user's question based on the provided context.")
    sources: List[str] = Field(..., description="A list of sources used to generate the answer.")
    context: str = Field(..., description="The context used to generate the answer.")
    question: str = Field(..., description="The question asked by the user.")
    