In [None]:
import re
from pprint import pprint
import os
from dotenv import load_dotenv

# Load OPENAI_API_KEY (and any other settings) from the shared env file.
load_dotenv("../config.env")
# Show only whether the key is present. Leaving the raw lookup as the cell's
# last expression would write the secret into the notebook's saved output.
bool(os.environ.get("OPENAI_API_KEY"))

In [None]:
# Read the whole IDSR source file into `text`. The same string is later appended
# to the keyword-extraction prompt and parsed into per-disease records.
with open("IDSR.txt", encoding="utf-8") as f:
    text = f.read()

Extract Keywords

In [None]:
# Instruction preamble for keyword extraction. The document text is concatenated
# directly after the trailing "Text:" line when the chat request is built.
prompt = """
You are a helpful assistant. Extract a list of 30–50 key symptoms, signs, or diagnostic terms from the following disease descriptions.

Focus on words or phrases that are likely to appear in clinical case definitions or user queries — such as "fever", "skin lesions", "swollen lymph nodes", "positive blood smear", etc.

Only return the keywords or short phrases — one per line.

Text:
"""

In [None]:
from openai import OpenAI

# Send the instruction preamble plus the full document text to the chat model.
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt + text}
    ],
    temperature=0.0
)
# `message.content` is optional in the API response (e.g. on a refusal), so fall
# back to an empty string instead of failing on `.splitlines()`.
reply_text = response.choices[0].message.content or ""
# One keyword per non-blank line of the reply.
keywords = [line.strip() for line in reply_text.splitlines() if line.strip()]
print("Extracted Keywords:")
for keyword in keywords:
    print("-", keyword)

In [None]:
# remove dashes and normalize keywords
def normalize_kw(kw):
    """Return `kw` without leading list markers, trimmed and lower-cased.

    Leading dashes, bullet characters ("•") and spaces are removed first, then
    any remaining surrounding whitespace is stripped and the text lower-cased.
    """
    without_marker = kw.lstrip("-• ")
    trimmed = without_marker.strip()
    return trimmed.lower()
# Normalise every keyword, then drop blanks and duplicates (keeping first-seen
# order) so each keyword is matched and stored only once.
keywords = [kw for kw in dict.fromkeys(normalize_kw(raw_kw) for raw_kw in keywords) if kw]

In [None]:
# save keywords to file
with open("idsr_keywords.txt", "w", encoding="utf-8") as f:
    # One keyword per line so the file can be read back line by line.
    f.writelines(f"{keyword}\n" for keyword in keywords)

In [None]:
# load file
with open("idsr_keywords.txt", "r", encoding="utf-8") as f:
    # Trim each line and keep only the ones that are non-empty afterwards.
    keywords = [kw for kw in map(str.strip, f) if kw]

Prep each disease as a document

In [None]:
# We need to split the text into a list of dictionaries, one per disease.
# The text is structured as follows:
# - The section for each disease starts after an empty line.
# - The disease name itself takes up the first line of its section.
# - Following the disease name there are subsections, each one beginning with a "-", some text, and then a colon.
#   What is between the "-" and the colon is the name of the subsection. The name of each subsection takes up one line.
# - The next few lines contain the text for that subsection. However many lines it takes up,
#   this text becomes the value for the subsection key in the dictionary, condensed to a single string.
# - Some diseases have multiple subsections, while others have only one.
# - An empty line indicates the start of a new disease section.
# What we should produce is one dictionary per disease, with a key called disease_name whose value is the name of the disease.
# The other keys should be the subsection names, each with the text that follows the subsection name as its value.

def parse_disease_text(text):
    """Parse blank-line-separated disease sections into a list of dicts.

    Each run of consecutive non-blank lines is one disease. Its first line
    (stripped) is stored under "disease_name". A later line of the form
    "- <name>:" opens a subsection; the lines up to the next header or the end
    of the section are stripped, joined with single spaces and stored under
    <name>. Lines that appear before the first header, and headers that have no
    text lines after them, do not produce a key.

    Parameters:
        text (str): Raw source text.

    Returns:
        list: One dict per disease, in order of appearance.
    """
    header_re = re.compile(r"^-\s*(.+):\s*$")

    def build_record(block_lines):
        # The first line of a block is always the disease name, whatever it looks like.
        record = {"disease_name": block_lines[0].strip()}
        section_name = None
        section_lines = []

        def flush():
            # Store the pending subsection only if it has both a name and some text.
            if section_name and section_lines:
                joined = " ".join(part.strip() for part in section_lines)
                record[section_name] = joined.strip()

        for raw in block_lines[1:]:
            header = header_re.match(raw)
            if header is None:
                section_lines.append(raw.rstrip())
                continue
            flush()
            section_name = header.group(1).strip()
            section_lines = []
        flush()
        return record

    records = []
    block = []
    for raw in text.strip().splitlines():
        if raw.strip():
            block.append(raw)
        elif block:
            # A blank (or whitespace-only) line closes the current disease block.
            records.append(build_record(block))
            block = []
    if block:
        records.append(build_record(block))
    return records



# Parse the raw text loaded earlier into one dict per disease.
disease_dicts = parse_disease_text(text)
   

In [None]:
from langchain_core.documents import Document

def convert_disease_dicts_to_documents(disease_dicts):
    """Build one LangChain `Document` per parsed disease record.

    The page content starts with "Disease: <name>", followed by every other
    key of the record rendered as "<key>:" on one line and its value on the
    next, with blank lines between subsections. The disease name is also
    stored in the document metadata under "disease_name".

    Parameters:
        disease_dicts (list): Dicts with a "disease_name" key plus subsection keys.

    Returns:
        list: `Document` objects in the same order as the input.
    """
    def render(record):
        name = record.get("disease_name", "")
        body_parts = []
        for section, section_text in record.items():
            if section == "disease_name":
                continue
            body_parts.append(f"{section}:\n{section_text}")
        page = f"Disease: {name}\n\n" + "\n\n".join(body_parts)
        return Document(page_content=page, metadata={"disease_name": name})

    return [render(record) for record in disease_dicts]


In [None]:
# Step 2: Convert the parsed disease dicts to LangChain documents (one Document per disease)
documents = convert_disease_dicts_to_documents(disease_dicts)

Tag each document with keywords

In [None]:
from rapidfuzz import fuzz

def tag_documents_with_keywords(documents, keywords, threshold=85):
    """
    Attach a 'matched_keywords' metadata entry to every document.

    A keyword is recorded for a document when the RapidFuzz partial ratio
    between the lower-cased keyword and the lower-cased page content reaches
    `threshold`. Documents are updated in place; the returned list holds the
    same objects in the same order.

    Parameters:
        documents (list): List of langchain `Document` objects.
        keywords (list): List of predefined clinical keywords (e.g. from GPT).
        threshold (int): Similarity threshold (0–100) for fuzzy matching.

    Returns:
        List of tagged Document objects with updated metadata.
    """
    def keywords_found_in(haystack):
        # Keep the keyword's original spelling; only the comparison is case-folded.
        return [
            kw for kw in keywords
            if fuzz.partial_ratio(kw.lower(), haystack) >= threshold
        ]

    tagged = []
    for doc in documents:
        doc.metadata["matched_keywords"] = keywords_found_in(doc.page_content.lower())
        tagged.append(doc)
    return tagged

# Tag every document with the keywords it fuzzily contains (default threshold).
tagged_documents = tag_documents_with_keywords(documents, keywords)

In [None]:
import json

# Serialise each tagged Document to a plain dict so it can be written as JSON.
# NOTE(review): `.dict()` is the Pydantic-v1-style serializer — confirm it is still
# supported by the installed langchain_core version.
doc_dicts = []
for tagged_doc in tagged_documents:
    doc_dicts.append(tagged_doc.dict())

with open("tagged_documents.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(doc_dicts, f, ensure_ascii=False, indent=2)


In [None]:
# load tagged documents from file
import json
from langchain_core.documents import Document
with open("tagged_documents.json", "r", encoding="utf-8") as f:
    stored_docs = json.load(f)
# Rebuild a Document from each stored field mapping.
tagged_documents = [Document(**fields) for fields in stored_docs]

In [None]:
# Spot-check one tagged document; index 50 requires at least 51 documents to be loaded.
tagged_documents[50]

Fuzzy-match query to keywords

In [None]:
from rapidfuzz import fuzz

def find_keywords_in_prompt(prompt, keywords, threshold=80):
    """
    Collect the keywords that fuzzily occur in a user prompt.

    Both sides are lower-cased before comparison; keywords are returned with
    their original spelling and in their original order.

    Args:
        prompt (str): The user prompt.
        keywords (list): List of keywords to match.
        threshold (int): Fuzzy match threshold (0-100).

    Returns:
        list: Matched keywords.
    """
    haystack = prompt.lower()
    # partial_ratio gives substring-like matching of each keyword against the prompt.
    return [
        kw for kw in keywords
        if fuzz.partial_ratio(kw.lower(), haystack) >= threshold
    ]

# Example usage:
# keywords = ["fever", "skin lesions", "swollen lymph nodes"]
# prompt = "The patient presents with fever and swollen nodes."
# print(find_keywords_in_prompt(prompt, keywords))

GPT to match query to keywords

In [None]:
from typing import List
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Schema that the LLM reply is parsed into: a single "keywords" list of strings.
# NOTE(review): documented with a `#` comment on purpose — a class docstring on a
# Pydantic model would presumably surface in the generated schema / format
# instructions sent to the model; confirm before adding one.
class KeywordsOutput(BaseModel):
    keywords: List[str] = Field(description="List of relevant keywords extracted from the query")

def extract_keywords_with_gpt(query: str, known_keywords: List[str]) -> List[str]:
    """Ask GPT-4o which of the known keywords are relevant to a query.

    Args:
        query: Free-text user query.
        known_keywords: Candidate keywords the model is asked to choose from.

    Returns:
        The `keywords` list of the parsed `KeywordsOutput` reply, or an empty
        list when `known_keywords` is empty (no request is sent in that case).

    Raises:
        Any exception raised by the chat model call or by the output parser
        (for example when the reply is not valid against the schema).
    """
    # Nothing to choose from: skip the API round-trip entirely.
    if not known_keywords:
        return []

    parser = PydanticOutputParser(pydantic_object=KeywordsOutput)

    prompt = PromptTemplate(
        template="""
You are helping identify relevant medical concepts. 
Given this query: "{query}"

Select the most relevant keywords from this list:
{keyword_list}

Return the matching keywords as a JSON object with a single key "keywords" whose value is a list of strings.

{format_instructions}
""",
        input_variables=["query", "keyword_list"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # Compose prompt -> model -> parser as a runnable pipeline rather than going
    # through the deprecated LLMChain / chain.run API.
    chain = prompt | ChatOpenAI(temperature=0, model="gpt-4o") | parser

    output = chain.invoke({"query": query, "keyword_list": ", ".join(known_keywords)})

    # The parser yields a KeywordsOutput instance; callers only need its list.
    return output.keywords




In [None]:
# Run the extractor on a sample query. The assignment has to stay active because
# the last line reads `matched_keywords`; without it a fresh kernel raises NameError.
matched_keywords = extract_keywords_with_gpt(query="child presenting with lesions", known_keywords=keywords)
print("Matched Keywords:", matched_keywords)
type(matched_keywords)



Hybrid search using matched keywords

In [None]:
def hybrid_search_with_query_keywords(query, vectorstore, documents, keyword_list, top_k=5, verbose=True):
    """Combine vector similarity search with keyword-tag filtering.

    Args:
        query (str): The user query.
        vectorstore: Object exposing `similarity_search(query, k=...)`.
        documents (list): Documents whose metadata may hold a "matched_keywords" list.
        keyword_list (list): Known keywords offered to the LLM for selection.
        top_k (int): Number of semantic hits requested from the vector store.
            Keyword hits are not limited by this value.
        verbose (bool): When True (the default), print the matched keywords and
            the name, tags and content of every keyword hit.

    Returns:
        tuple: (merged documents, matched keywords). Documents are de-duplicated
        by page content, semantic hits first; when the same content appears in
        both groups the keyword-hit object is the one kept.
    """
    # Step 1: Semantic search
    semantic_hits = vectorstore.similarity_search(query, k=top_k)

    # Step 2: Use GPT to extract keywords from the query
    matched_keywords = extract_keywords_with_gpt(query, keyword_list)

    # Normalise the query keywords once so each document tag needs a single set
    # lookup instead of a comparison against every query keyword.
    wanted = {normalize_kw(kw) for kw in matched_keywords}

    # Step 3: Filter docs whose metadata has any of those keywords
    keyword_hits = [
        doc for doc in documents
        if any(
            normalize_kw(tag) in wanted
            for tag in doc.metadata.get("matched_keywords", [])
        )
    ]

    if verbose:
        for kw in matched_keywords:
            print(f"Matched keyword: {kw}")

        # print metadata of keyword_hits
        for doc in keyword_hits:
            print(doc.metadata.get("disease_name"))
            print(doc.metadata.get("matched_keywords"))
            print(doc.page_content)

    # Step 4: Merge by unique content
    merged = {doc.page_content: doc for doc in semantic_hits + keyword_hits}
    return list(merged.values()), matched_keywords


In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

embedding_model = OpenAIEmbeddings()

# Build the FAISS index from `tagged_documents`, the keyword-tagged Document list.
vectorstore = FAISS.from_documents(tagged_documents, embedding_model)

# Persist the index under "disease_vectorstore" so it can be reloaded with load_local.
vectorstore.save_local("disease_vectorstore")

In [None]:
# Startup:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
# NOTE(review): allow_dangerous_deserialization=True explicitly opts in to loading the
# saved index files; only use it for a store written by this notebook, never for
# files from an untrusted source.
vectorstore = FAISS.load_local("disease_vectorstore", OpenAIEmbeddings(),allow_dangerous_deserialization=True)

# Query time:
query = "child presenting with lesions"
# Returns the merged result documents and the keywords matched for the query.
results, matched = hybrid_search_with_query_keywords(query, vectorstore, tagged_documents, keywords)

# print("Matched keywords:", matched)
# for doc in results:
#     print("---")
#     print(doc.metadata.get("disease_name"))
#     print(doc.metadata.get("matched_keywords"))
#     print(doc.page_content)




In [None]:
# Inspect the keywords attached to the first tagged document. The assignment has
# to stay active: the bare `doc` below is what the cell displays, and without the
# assignment a fresh kernel raises NameError.
doc = tagged_documents[0].metadata.get("matched_keywords")
doc
# matched_keywords
# doc in matched_keywords

