In [1]:
import csv

# Paths to the three CSV lookup tables, relative to the working directory.
condition_mapping_file = "../../resources/CCSCM.csv"
procedure_mapping_file = "../../resources/CCSPROC.csv"
drug_file = "../../resources/ATC.csv"

# 'code' -> lower-cased 'name' for every row of the condition table.
condition_dict = {}
with open(condition_mapping_file, newline='', encoding='utf-8', errors='replace') as condition_csv:
    condition_dict.update(
        (record['code'], record['name'].lower())
        for record in csv.DictReader(condition_csv)
    )

# 'code' -> lower-cased 'name' for every row of the procedure table.
procedure_dict = {}
with open(procedure_mapping_file, newline='', encoding='utf-8', errors='replace') as procedure_csv:
    procedure_dict.update(
        (record['code'], record['name'].lower())
        for record in csv.DictReader(procedure_csv)
    )

# 'code' -> lower-cased 'name', keeping only rows whose 'level' column
# is exactly the string '3.0'.
drug_dict = {}
with open(drug_file, newline='', encoding='utf-8', errors='replace') as drug_csv:
    drug_dict.update(
        (record['code'], record['name'].lower())
        for record in csv.DictReader(drug_csv)
        if record['level'] == '3.0'
    )


In [2]:
import re 
from ChatGPT import ChatGPT
from ChatGPT import ChatGPT
import json

def extract_data_in_brackets(input_string):
    """Return the text inside every innermost ``[...]`` pair of ``input_string``.

    For a nested list such as ``[[a, b, c], [d, e, f]]`` this yields
    ``['a, b, c', 'd, e, f']``. A lazy ``.*?`` pattern would start matching at
    the outer ``[`` and leak a stray leading bracket into the first item.

    Args:
        input_string: Text to scan.

    Returns:
        list[str]: Captured contents in order of appearance; empty when the
        text holds no complete bracket pair. Matches never span a newline.
    """
    # Brackets are excluded from the captured text so only innermost pairs
    # match; newline is excluded as well to keep matching single-line, which
    # is what the default behaviour of `.` gives.
    pattern = r"\[([^\[\]\n]*)\]"
    return re.findall(pattern, input_string)

def divide_text(long_text, max_len=800):
    """Split ``long_text`` into consecutive chunks of at most ``max_len`` items.

    Chunks are taken left to right without overlap; only the last one may be
    shorter than ``max_len``.

    Args:
        long_text: The string (or other sliceable sequence) to split.
        max_len: Maximum chunk length; must be positive.

    Returns:
        list: The chunks in order; an empty list for empty input.

    Raises:
        ValueError: If ``max_len`` is zero or negative. Such a value can never
            advance through the text, so it is rejected instead of looping
            forever.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")
    return [
        long_text[start:start + max_len]
        for start in range(0, len(long_text), max_len)
    ]

def filter_triples(triples):
    """Ask the chat model to prune or extend ``triples`` and parse its reply.

    ``triples`` is interpolated into a fixed prompt that asks for the most
    important entries. The reply is converted with ``str``, decoded as JSON,
    and the bracketed items in its ``'content'`` field are returned.

    Args:
        triples: The candidate triples; formatted into the prompt as-is.

    Returns:
        Whatever ``extract_data_in_brackets`` finds in the reply content.

    NOTE(review): assumes ``str(response)`` is a JSON object with a
    ``'content'`` key -- ``ChatGPT.chat`` is not defined in this notebook,
    confirm against its implementation.
    """
    client = ChatGPT()
    prompt = f"""
            I have a list of triples. I want to select 50 most important triples from the list.
            The importance of a triple is based on how you think it will help imrpove healthcare prediction tasks (e.g., drug recommendation, mortality prediction, readmission prediction …).
            If you think a triple is important, please keep it. Otherwise, please remove it.
            You can also add triples from your background knowledge.
            The total size of the updated list should be below 50.

            triples: {triples}
            updates:
        """
    reply = client.chat(prompt)
    payload = json.loads(str(reply))
    return extract_data_in_brackets(payload['content'])


In [3]:
from ChatGPT import ChatGPT
import json

def graph_gen(term: str, mode: str):
    """Ask the chat model for relationship triples about ``term``.

    A few-shot example matching ``mode`` is embedded in the prompt. The reply
    is converted with ``str``, decoded as JSON, and every bracketed item in its
    ``'content'`` field becomes one output line.

    Args:
        term: The medical condition, procedure or drug name to expand.
        mode: One of ``"condition"``, ``"procedure"`` or ``"drug"``; selects
            the example shown to the model.

    Returns:
        str: One line per extracted item, with square brackets removed and
        every ``", "`` replaced by a tab; each line ends with a newline. An
        empty string when no bracketed item is found.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. The check
            runs before any client is created or request is sent.

    NOTE(review): assumes ``str(response)`` is a JSON object with a
    ``'content'`` key -- ``ChatGPT.chat`` is not defined in this notebook.
    """
    examples = {
        "condition": """
        Example:
        prompt: systemic lupus erythematosus
        updates: [[systemic lupus erythematosus, is an, autoimmune condition], [systemic lupus erythematosus, may cause, nephritis], [anti-nuclear antigen, is a test for, systemic lupus erythematosus], [systemic lupus erythematosus, is treated with, steroids], [methylprednisolone, is a, steroid]]
        """,
        "procedure": """
        Example:
        prompt: endoscopy
        updates: [[endoscopy, is a, medical procedure], [endoscopy, used for, diagnosis], [endoscopic biopsy, is a type of, endoscopy], [endoscopic biopsy, can detect, ulcers]]
        """,
        "drug": """
        Example:
        prompt: iobenzamic acid
        updates: [[iobenzamic acid, is a, drug], [iobenzamic acid, may have, side effects], [side effects, can include, nausea], [iobenzamic acid, used as, X-ray contrast agent], [iobenzamic acid, formula, C16H13I3N2O3]]
        """,
    }
    # Reject an unknown mode up front; otherwise the prompt would be built
    # from an unbound variable and fail with an unrelated-looking error.
    if mode not in examples:
        raise ValueError(
            f"unsupported mode {mode!r}; expected one of {sorted(examples)}"
        )
    example = examples[mode]

    chatgpt = ChatGPT()
    response = chatgpt.chat(
        f"""
            Given a prompt (a medical condition/procedure/drug), extrapolate as many relationships as possible of it and provide a list of updates.
            The relationships should be helpful for healthcare prediction (e.g., drug recommendation, mortality prediction, readmission prediction …)
            Each update should be exactly in format of [ENTITY 1, RELATIONSHIP, ENTITY 2]. The relationship is directed, so the order matters.
            Both ENTITY 1 and ENTITY 2 should be noun.
            Any element in [ENTITY 1, RELATIONSHIP, ENTITY 2] should be conclusive, make it as short as possible.
            Do this in both breadth and depth. Expand [ENTITY 1, RELATIONSHIP, ENTITY 2] until the size reaches 100.

            {example}

            prompt: {term}
            updates:
        """
        )
    json_data = json.loads(str(response))

    triples = extract_data_in_brackets(json_data['content'])
    # NOTE(review): every ", " becomes a tab, so an element that itself
    # contains ", " produces more than three columns on its line.
    return "".join(
        triple.replace('[', '').replace(']', '').replace(', ', '\t') + '\n'
        for triple in triples
    )

In [4]:
## Future work - Including Clinical Notes
# import json

# with open('../../clinical_notes/subject_text_dict.json', 'r') as f:
#     subject_text_dict = json.load(f)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os

# Thread-pool size passed as max_workers to every ThreadPoolExecutor below.
# Adjust depending on how heavy graph_gen is and on the chat API's rate limits.
MAX_WORKERS = 30

In [None]:
def process_condition(key: str):
    """Create or top up the triple file for one condition code.

    The term for ``key`` is looked up in ``condition_dict``. A missing file is
    created from a fresh ``graph_gen`` result. An existing file with fewer than
    100 lines is rewritten as its old content followed by a fresh result. A
    file that already has 100 or more lines is left untouched.

    Args:
        key: A key of ``condition_dict``; also the output file's base name.

    Returns:
        ``key`` unchanged, so the caller has something to mark completion.
    """
    target = f'../../graphs/condition/CCSCM/{key}.txt'
    term = condition_dict[key]

    if not os.path.exists(target):
        # First visit: ensure the folder exists, then store the new triples.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        generated = graph_gen(term=term, mode="condition")
        with open(target, mode="w", encoding="utf-8") as handle:
            handle.write(generated)
        return key

    with open(target, mode="r", encoding="utf-8") as handle:
        existing = handle.read()

    if len(existing.splitlines()) >= 100:
        # Enough lines already; skip the model call.
        return key

    generated = graph_gen(term=term, mode="condition")
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(existing + generated)
    return key


keys = list(condition_dict.keys())

# key -> exception for every condition whose worker raised.
failed_conditions = {}

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_condition, key) for key in keys]
    # Remember which key each future belongs to so failures can be attributed.
    key_by_future = dict(zip(futures, keys))
    for future in tqdm(as_completed(futures), total=len(futures)):
        # as_completed only signals that a future finished; result() is what
        # re-raises an exception from the worker. Without this call a failed
        # graph_gen request would go unnoticed.
        try:
            future.result()
        except Exception as exc:
            failed_conditions[key_by_future[future]] = exc

# Report instead of raising so one bad key does not abort the whole batch.
if failed_conditions:
    print(f"{len(failed_conditions)} of {len(keys)} condition keys failed:")
    for failed_key, exc in failed_conditions.items():
        print(f"  {failed_key}: {exc!r}")

  0%|          | 0/285 [00:00<?, ?it/s]

In [None]:
def process_drug(key: str):
    """Create or top up the triple file for one drug code.

    The term for ``key`` is looked up in ``drug_dict``. A missing file is
    created from a fresh ``graph_gen`` result; an existing file with fewer than
    150 lines is rewritten as its old content followed by a fresh result; a
    longer file is left untouched.

    Args:
        key: A key of ``drug_dict``; also the output file's base name.

    Returns:
        ``key`` unchanged, for completion tracking.

    NOTE(review): ``drug_dict`` is filled only from rows whose level is
    ``'3.0'``, yet the output directory is ``ATC5`` -- confirm this pairing
    is intended.
    """
    path = f'../../graphs/drug/ATC5/{key}.txt'
    term = drug_dict[key]

    if os.path.exists(path):
        with open(path, mode="r", encoding="utf-8") as src:
            previous = src.read()
        needs_more = len(previous.splitlines()) < 150
    else:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        previous = ""
        needs_more = True

    if needs_more:
        addition = graph_gen(term=term, mode="drug")
        with open(path, mode="w", encoding="utf-8") as dst:
            dst.write(previous + addition)

    return key


drug_keys = list(drug_dict.keys())

# key -> exception for every drug whose worker raised.
failed_drugs = {}

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_drug, key) for key in drug_keys]
    # Remember which key each future belongs to so failures can be attributed.
    key_by_future = dict(zip(futures, drug_keys))
    for future in tqdm(as_completed(futures), total=len(futures)):
        # result() re-raises whatever the worker raised; iterating
        # as_completed alone would drop those errors silently.
        try:
            future.result()
        except Exception as exc:
            failed_drugs[key_by_future[future]] = exc

# Report instead of raising so one bad key does not abort the whole batch.
if failed_drugs:
    print(f"{len(failed_drugs)} of {len(drug_keys)} drug keys failed:")
    for failed_key, exc in failed_drugs.items():
        print(f"  {failed_key}: {exc!r}")

In [None]:
def process_drug(key: str):
    """Create or top up the triple file for one drug code.

    NOTE(review): this redefines ``process_drug`` from the previous cell with
    the same logic; one of the two definitions is probably redundant.

    The term for ``key`` is looked up in ``drug_dict``. A missing file is
    created from a fresh ``graph_gen`` result; an existing file with fewer than
    150 lines is rewritten as its old content followed by a fresh result; a
    longer file is left untouched.

    Args:
        key: A key of ``drug_dict``; also the output file's base name.

    Returns:
        ``key`` unchanged, for completion tracking.
    """
    target = f'../../graphs/drug/ATC5/{key}.txt'
    term = drug_dict[key]

    if not os.path.exists(target):
        # Nothing on disk yet: ensure the folder exists and write new content.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        generated = graph_gen(term=term, mode="drug")
        with open(target, mode="w", encoding="utf-8") as handle:
            handle.write(generated)
        return key

    with open(target, mode="r", encoding="utf-8") as handle:
        existing = handle.read()

    if len(existing.splitlines()) >= 150:
        # Already at or above the line threshold; skip the model call.
        return key

    generated = graph_gen(term=term, mode="drug")
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(existing + generated)
    return key


drug_keys = list(drug_dict.keys())

# key -> exception for every drug whose worker raised.
failed_drugs = {}

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_drug, key) for key in drug_keys]
    # Remember which key each future belongs to so failures can be attributed.
    key_by_future = dict(zip(futures, drug_keys))
    for future in tqdm(as_completed(futures), total=len(futures)):
        # result() re-raises whatever the worker raised; iterating
        # as_completed alone would drop those errors silently.
        try:
            future.result()
        except Exception as exc:
            failed_drugs[key_by_future[future]] = exc

# Report instead of raising so one bad key does not abort the whole batch.
if failed_drugs:
    print(f"{len(failed_drugs)} of {len(drug_keys)} drug keys failed:")
    for failed_key, exc in failed_drugs.items():
        print(f"  {failed_key}: {exc!r}")

In [None]:
def process_drug_atc3(key: str):
    """Create or top up the ATC3 triple file for one drug code.

    The term for ``key`` is looked up in ``drug_dict``. A missing file is
    created from a fresh ``graph_gen`` result; an existing file with fewer than
    150 lines is rewritten as its old content followed by a fresh result; a
    longer file is left untouched.

    Args:
        key: A key of ``drug_dict``; also the output file's base name.

    Returns:
        ``key`` unchanged, for completion tracking.
    """
    path = f'../../graphs/drug/ATC3/{key}.txt'
    term = drug_dict[key]

    if os.path.exists(path):
        # Load what is already stored and decide whether it needs extending.
        with open(path, mode="r", encoding="utf-8") as src:
            previous = src.read()
        needs_more = len(previous.splitlines()) < 150
    else:
        # No file yet: make sure its folder exists and start from nothing.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        previous = ""
        needs_more = True

    if needs_more:
        addition = graph_gen(term=term, mode="drug")
        with open(path, mode="w", encoding="utf-8") as dst:
            dst.write(previous + addition)

    return key


drug_keys = list(drug_dict.keys())

# key -> exception for every drug whose ATC3 worker raised.
failed_drugs_atc3 = {}

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_drug_atc3, key) for key in drug_keys]
    # Remember which key each future belongs to so failures can be attributed.
    key_by_future = dict(zip(futures, drug_keys))
    for future in tqdm(as_completed(futures), total=len(futures)):
        # result() re-raises whatever the worker raised; iterating
        # as_completed alone would drop those errors silently.
        try:
            future.result()
        except Exception as exc:
            failed_drugs_atc3[key_by_future[future]] = exc

# Report instead of raising so one bad key does not abort the whole batch.
if failed_drugs_atc3:
    print(f"{len(failed_drugs_atc3)} of {len(drug_keys)} ATC3 drug keys failed:")
    for failed_key, exc in failed_drugs_atc3.items():
        print(f"  {failed_key}: {exc!r}")