# Retrieve Abnormal Template

Goal: Retrieve abnormal findings from abnormal template by each organs

In [8]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.runnables import RunnablePassthrough
# Local Package
from us_report_ext.findings import Findings

## Load Abnormal Findings Markdown Docs

In [9]:
# Load all markdown files from `abnormal/` directory
loader = DirectoryLoader("abnormal", glob="**/*.md", loader_cls=TextLoader)
docs_list = loader.load()

In [10]:
print(docs_list[0].page_content[0:250] )

# Kidney Findings


Order findings as:
1. Kidney size and echogenicity
2. (If any) Renal cyst(s)
3. (If any) Renal stone, hydronephrosis, or solid mass.

```markdown
**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.
```




In [11]:
from pathlib import Path

# Put into dictionary 
docs_names = [Path(doc.metadata["source"]).stem for doc in docs_list]
docs_dict = dict(zip(docs_names, docs_list))
docs_dict

{'abnormal_kidney': Document(page_content='# Kidney Findings\n\n\nOrder findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.\n\n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```\n\n\n### (Chronic) Parenchymatous Kidney Disease\n\nDefinition: \n\n"Parenchymatous kidney disease" := normal kidney size but increased echogenicity. \n"Chronic parenchymatous kidney disease" := small kidney size and increased echogenicity. \n\nIf one kidney is abnormal and the other is normal, report findings for each kidneys. \n\nHere is the format:\n\n```markdown\n**Kidneys:** `[Normal | Small]` size with (mildly) increased parenchymal echogenicity of the `[right | left | both]` kidney(s). No stone, hydronephrosis or solid mass.\n**IMPRESSION:**\n- (Chronic) parenchymatous disease of `[right | left | both]` kidney(s).\n```\n\nExamples:\n\n- Parenchymatous right kidney and normal left ki

## Split Markdown by headers

In [12]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=True
)
md_header_splits_dict = {name: markdown_splitter.split_text(doc.page_content) for name, doc in docs_dict.items()}
md_header_splits_dict

{'abnormal_kidney': [Document(page_content='Order findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.  \n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```', metadata={'Header 1': 'Kidney Findings'}),
  Document(page_content='Definition:  \n"Parenchymatous kidney disease" := normal kidney size but increased echogenicity.\n"Chronic parenchymatous kidney disease" := small kidney size and increased echogenicity.  \nIf one kidney is abnormal and the other is normal, report findings for each kidneys.  \nHere is the format:  \n```markdown\n**Kidneys:** `[Normal | Small]` size with (mildly) increased parenchymal echogenicity of the `[right | left | both]` kidney(s). No stone, hydronephrosis or solid mass.\n**IMPRESSION:**\n- (Chronic) parenchymatous disease of `[right | left | both]` kidney(s).\n```  \nExamples:  \n- Parenchymatous right kidney and normal left kidney:  \

## Function: `load_split_md_docs()`

In [1]:
from typing import Dict, List
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader, TextLoader

def load_split_md_docs(path: str) -> Dict[str, List[Document]]:
    """Load and Split Markdown Documents

    Args:
        path (str): path to folder containing markdown docs

    Returns:
        _dict_: Dictionary containing list of Documents
    """
    # Load all markdown files from `abnormal/` directory
    loader = DirectoryLoader(path = path, glob="**/*.md", loader_cls=TextLoader)
    docs_list = loader.load()
    
    ## Put into dictionary 
    docs_names = [Path(doc.metadata["source"]).stem for doc in docs_list]
    docs_dict = dict(zip(docs_names, docs_list))
    
    # Split
    ## Split Headings
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    ## MD splits
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )
    md_header_splits_dict = {name: markdown_splitter.split_text(doc.page_content) for name, doc in docs_dict.items()}
    return md_header_splits_dict

In [14]:
md_header_splits_dict = load_split_md_docs(path="abnormal/")
md_header_splits_dict

{'abnormal_kidney': [Document(page_content='Order findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.  \n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```', metadata={'Header 1': 'Kidney Findings'}),
  Document(page_content='Definition:  \n"Parenchymatous kidney disease" := normal kidney size but increased echogenicity.\n"Chronic parenchymatous kidney disease" := small kidney size and increased echogenicity.  \nIf one kidney is abnormal and the other is normal, report findings for each kidneys.  \nHere is the format:  \n```markdown\n**Kidneys:** `[Normal | Small]` size with (mildly) increased parenchymal echogenicity of the `[right | left | both]` kidney(s). No stone, hydronephrosis or solid mass.\n**IMPRESSION:**\n- (Chronic) parenchymatous disease of `[right | left | both]` kidney(s).\n```  \nExamples:  \n- Parenchymatous right kidney and normal left kidney:  \

## Create Vector Storage

In [15]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

chroma_dict = {name: Chroma.from_documents(md_header_splits, embedding=OpenAIEmbeddings()) 
               for name, md_header_splits in md_header_splits_dict.items()}

## Retriver

### Function: `get_chroma_retrievers()`

In [16]:
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings


def get_chroma_retrievers(docs_splits_dict: Dict[str, List[Document]],
                          search_type: str = "similarity",
                          search_kwargs: Dict[str, any] = {"k": 3},
                          embedding=OpenAIEmbeddings()
                          ) -> Dict[str, VectorStoreRetriever]:

    chroma_dict = {name: Chroma.from_documents(docs_splits, embedding=embedding)
                   for name, docs_splits in docs_splits_dict.items()}

    retriever_dict = {
        name: chroma.as_retriever(
            search_type=search_type,
            search_kwargs=search_kwargs)
        for name, chroma in chroma_dict.items()
    }

    return retriever_dict

In [17]:
retriever_dict = get_chroma_retrievers(md_header_splits_dict)
retriever_dict

{'abnormal_kidney': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1378b9150>, search_kwargs={'k': 3}),
 'abnormal_liver': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x117f0f6a0>, search_kwargs={'k': 3}),
 'abnormal_gallbladder': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1378bbc10>, search_kwargs={'k': 3})}

### HowTo: Retriever Single Doc

In [18]:
# Single
retriever_liver = chroma_dict["abnormal_liver"].as_retriever(
    search_type="similarity", search_kwargs={'k': 2}
    )
retriever_liver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x13740bc70>, search_kwargs={'k': 2})

In [19]:
retriever_liver.invoke("Search the following only in `metadata` field: Fatty")

[Document(page_content='#### Mild Fatty Liver  \n```markdown\n**Liver:** Normal size with mildly increased parenchymal echogenicity of the liver. No focal lesion.\n**IMPRESSION:**\n- Mild fatty liver without focal lesion.\n```  \n#### Moderate Fatty Liver  \n```markdown\n**Liver:** Normal size with diffusely increased parenchymal echogenicity of the liver, causing imparied visualization of intrahepatic vasculature. No focal lesion.\n**IMPRESSION:**\n- Moderate fatty liver without focal lesion.\n```  \n#### Severe Fatty Liver  \n```markdown\n**Liver:** Normal size with diffusely increased parenchymal echogenicity of the liver, causing imparied visualization of intrahepatic vasculature and right hemidiaphragm. No focal lesion.\n**IMPRESSION:**\n- Severe fatty liver without focal lesion.\n```  \n#### Focal Fat Sparing  \nIf focal fat sparing area is present, add the following line in the `liver` field after the fatty liver sentence.  \n```markdown\n**Liver:** <fatty_liver_findings>. Geogr

In [20]:
retriever_liver.invoke("Search the following only in `metadata` field: Paren")

[Document(page_content='```markdown\n**Liver:** Normal size and (mildly) `[increased | coarse]` parenchymal echogenicity. No focal lesion.\n**IMPRESSION:**\n- (Mild) parenchymatous disease of the liver without focal lesion.\n```', metadata={'Header 1': 'Liver Abnormal Findings', 'Header 3': 'Parenchymatous Liver Disease'}),
 Document(page_content='```markdown\n**Liver:** Normal size and (mildly) `[increased | coarse]` parenchymal echogenicity. No focal lesion.\n**IMPRESSION:**\n- (Mild) parenchymatous disease of the liver without focal lesion.\n```', metadata={'Header 1': 'Liver Abnormal Findings', 'Header 3': 'Parenchymatous Liver Disease'})]

### HowTo: Retriver Multi Docs

In [21]:
retriever_dict = {
    name: chroma.as_retriever(
    search_type="similarity",
    # For diversity
    search_kwargs={'k': 3},) 
    for name, chroma in chroma_dict.items()
                  }

retriever_dict

{'abnormal_kidney': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1372f2770>, search_kwargs={'k': 3}),
 'abnormal_liver': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x13740bc70>, search_kwargs={'k': 3}),
 'abnormal_gallbladder': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x14718c490>, search_kwargs={'k': 3})}

### Test Retrieve

In [22]:
retriever_dict["abnormal_gallbladder"].invoke("Search the following only in `metadata` field: Mild fatty liver")

[Document(page_content='#### Mild Fatty Liver  \n```markdown\n**Liver:** Normal size with mildly increased parenchymal echogenicity of the liver. No focal lesion.\n**IMPRESSION:**\n- Mild fatty liver without focal lesion.\n```  \n#### Moderate Fatty Liver  \n```markdown\n**Liver:** Normal size with diffusely increased parenchymal echogenicity of the liver, causing imparied visualization of intrahepatic vasculature. No focal lesion.\n**IMPRESSION:**\n- Moderate fatty liver without focal lesion.\n```  \n#### Severe Fatty Liver  \n```markdown\n**Liver:** Normal size with diffusely increased parenchymal echogenicity of the liver, causing imparied visualization of intrahepatic vasculature and right hemidiaphragm. No focal lesion.\n**IMPRESSION:**\n- Severe fatty liver without focal lesion.\n```  \n#### Focal Fat Sparing  \nIf focal fat sparing area is present, add the following line in the `liver` field after the fatty liver sentence.  \n```markdown\n**Liver:** <fatty_liver_findings>. Geogr

In [23]:
retriever_dict["abnormal_gallbladder"].invoke("Search the following only in `metadata` field: 2-mm left renal stone")

[Document(page_content='Order findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.  \n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```', metadata={'Header 1': 'Kidney Findings'}),
 Document(page_content='Order findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.  \n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```', metadata={'Header 1': 'Kidney Findings'}),
 Document(page_content='```markdown\n**Kidneys:** Normal size and parenchymal echogenicity of both kidneys. <quantifier> non-obstructing caliceal stone(s) at `[right | left | both]` kidney(s).  No hydronephrosis or solid mass.\n**IMPRESSION:**\n- <quantifier> non-obstructing caliceal stone(s) at `[right | left | both]` kidney(s)\n```  \nExamples:  \n```markdown\n**Kidneys:** Normal size 

## Retrive Abnormal Docs

### Function: `retrieve_abnormal_docs()`

In [49]:
from typing import Dict, List
import itertools
from langchain_core.documents import Document

def retrieve_abnormal_docs(retriever_dict: Dict[str, VectorStoreRetriever], findings: Findings) -> Dict[str, List[Document]]:
    
    out_dict = {}
    
    for key, retriever in retriever_dict.items():
        # Loop per organs
        query_list = findings.to_dict()[key]
        out_dict[key] = remove_duplicates(list(
            # Un-nest List
            itertools.chain(
                # Query for each item in findings
                *[retriever.invoke(f"Search only in the `metadata` field\n\nQuery: {query}") 
                for query in query_list]
            )
        ))
    
    return out_dict
    

# Helper
def remove_duplicates(objects):
    unique_objects = []
    for obj in objects:
        if obj not in unique_objects:
            unique_objects.append(obj)
    return unique_objects
    

In [48]:
abn_doc_dict1 = retrieve_abnormal_docs(retriever_dict, findings1)
abn_doc_dict1

{'abnormal_kidney': [Document(page_content='```markdown\n**Kidneys:** Normal size and parenchymal echogenicity of both kidneys. <quantifier> non-obstructing caliceal stone(s) at `[right | left | both]` kidney(s).  No hydronephrosis or solid mass.\n**IMPRESSION:**\n- <quantifier> non-obstructing caliceal stone(s) at `[right | left | both]` kidney(s)\n```  \nExamples:  \n```markdown\n**Kidneys:** Normal size and parenchymal echogenicity of both kidneys. A few non-obstructing caliceal stones at right kidney.  No hydronephrosis or solid mass.\n**IMPRESSION:**\n- A few non-obstructing caliceal stones at right kidney.\n```', metadata={'Header 1': 'Kidney Findings', 'Header 3': 'Renal Stone'}),
  Document(page_content='Order findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.  \n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```', metadata={'Header 1': 'Kidney Findings'}

### HowTo: Retrive Abnormal Docs

In [24]:
from us_report_ext.main import get_findings, load_split_md_docs
from us_report_ext.findings import Findings

In [25]:
# User Text to Extract
user_text1 = """Generate US report with these findings:
- Mild fatty liver
- 2-mm left renal stone, 5-mm right renal cyst
"""
findings1 = get_findings(input_text=user_text1)
findings1

Findings(abnormal_liver=[Liver(finding='Mild fatty liver')], abnormal_kidney=[Kidney(finding='2-mm left renal stone'), Kidney(finding='5-mm right renal cyst')], abnormal_gallbladder=[])

In [26]:
# Retriever
docs_dict = load_split_md_docs("abnormal")
retriever_dict = get_chroma_retrievers(docs_dict)
retriever_dict

{'abnormal_kidney': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1487e6f50>, search_kwargs={'k': 3}),
 'abnormal_liver': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1378fb220>, search_kwargs={'k': 3}),
 'abnormal_gallbladder': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1487e4e80>, search_kwargs={'k': 3})}

In [27]:
findings1.to_dict()

{'abnormal_liver': ['Mild fatty liver'],
 'abnormal_kidney': ['2-mm left renal stone', '5-mm right renal cyst'],
 'abnormal_gallbladder': []}

In [33]:
query = "Mild fatty liver"

pr_temp_retriever = f"Search only in the `metadata` field\n\nQuery: {query}"
retriever_dict["abnormal_liver"].invoke(pr_temp_retriever)

[Document(page_content='#### Mild Fatty Liver  \n```markdown\n**Liver:** Normal size with mildly increased parenchymal echogenicity of the liver. No focal lesion.\n**IMPRESSION:**\n- Mild fatty liver without focal lesion.\n```  \n#### Moderate Fatty Liver  \n```markdown\n**Liver:** Normal size with diffusely increased parenchymal echogenicity of the liver, causing imparied visualization of intrahepatic vasculature. No focal lesion.\n**IMPRESSION:**\n- Moderate fatty liver without focal lesion.\n```  \n#### Severe Fatty Liver  \n```markdown\n**Liver:** Normal size with diffusely increased parenchymal echogenicity of the liver, causing imparied visualization of intrahepatic vasculature and right hemidiaphragm. No focal lesion.\n**IMPRESSION:**\n- Severe fatty liver without focal lesion.\n```  \n#### Focal Fat Sparing  \nIf focal fat sparing area is present, add the following line in the `liver` field after the fatty liver sentence.  \n```markdown\n**Liver:** <fatty_liver_findings>. Geogr

In [44]:
import itertools

out_dict = {}

for key, retriever in retriever_dict.items():
    print(key,  retriever)
    query_list = findings1.to_dict()[key]
    out_dict[key] = remove_duplicates(list(
        # Un-nest List
        itertools.chain(
            *[retriever.invoke(f"Search only in the `metadata` field\n\nQuery: {query}") 
              for query in query_list]
        )
    ))
    
out_dict

abnormal_kidney tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1487e6f50> search_kwargs={'k': 3}
abnormal_liver tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1378fb220> search_kwargs={'k': 3}
abnormal_gallbladder tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1487e4e80> search_kwargs={'k': 3}


{'abnormal_kidney': [Document(page_content='```markdown\n**Kidneys:** Normal size and parenchymal echogenicity of both kidneys. <quantifier> non-obstructing caliceal stone(s) at `[right | left | both]` kidney(s).  No hydronephrosis or solid mass.\n**IMPRESSION:**\n- <quantifier> non-obstructing caliceal stone(s) at `[right | left | both]` kidney(s)\n```  \nExamples:  \n```markdown\n**Kidneys:** Normal size and parenchymal echogenicity of both kidneys. A few non-obstructing caliceal stones at right kidney.  No hydronephrosis or solid mass.\n**IMPRESSION:**\n- A few non-obstructing caliceal stones at right kidney.\n```', metadata={'Header 1': 'Kidney Findings', 'Header 3': 'Renal Stone'}),
  Document(page_content='Order findings as:\n1. Kidney size and echogenicity\n2. (If any) Renal cyst(s)\n3. (If any) Renal stone, hydronephrosis, or solid mass.  \n```markdown\n**Kidneys:** <kidney_size_echo>. <renal_cyst>. <renal_stone_hydro_solid_mass>.\n```', metadata={'Header 1': 'Kidney Findings'}