In [196]:
import pandas as pd
import ast
import numpy as np
import os
from pathlib import Path
from src.Snomed import Snomed
import json
import pandas as pd
import networkx as nx
from tqdm import tqdm

# SNOMED Hierarchy

In [203]:
release_id = '20240401'
SNOMED_PATH = './data/snomed/SnomedCT_InternationalRF2_PRODUCTION_20240401T120000Z' # you need to download your own SNOMED distribution
snomed = Snomed(SNOMED_PATH, release_id=release_id)
snomed.load_snomed()
file_path = f'{SNOMED_PATH}/Snapshot/Terminology/sct2_Relationship_Snapshot_INT_{release_id}.txt'

# typeId used in the relationship file for IS_A relationships.
IS_A_TYPE_ID = 116680003

# Flatten the loaded concepts into (description, concept id) pairs.
snomed_sf_id_pairs = []
for snomed_id in tqdm(snomed.graph.nodes):
    node_descs = snomed.index_definition[snomed_id]
    for d in node_descs:
        snomed_sf_id_pairs.append((d, snomed_id))

all_names = [p[0] for p in snomed_sf_id_pairs]
all_ids = [p[1] for p in snomed_sf_id_pairs]

data = pd.read_csv(file_path, delimiter='\t')
# Keep only active IS_A relationships.
filtered_data = data[(data['active'] == 1) & (data['typeId'] == IS_A_TYPE_ID)]

# Directed graph with one edge sourceId -> destinationId per IS_A relationship.
G = nx.DiGraph()

# Set of concept ids known to the loaded Snomed object, for fast membership tests.
# NOTE(review): the original comment said this restricts to disorder and substance
# nodes; that depends on what `Snomed.load_snomed()` loads -- confirm.
all_ids_set = set(map(int, all_ids))

# Vectorised replacement for a per-row `iterrows()` loop: keep only relationships
# whose two endpoints are both known, then add them in file order so the node
# insertion order of G is the same as with one `add_edge` call per row.
source_ids = filtered_data['sourceId'].astype('int64')
destination_ids = filtered_data['destinationId'].astype('int64')
both_known = source_ids.isin(all_ids_set) & destination_ids.isin(all_ids_set)
G.add_edges_from(zip(source_ids[both_known].tolist(), destination_ids[both_known].tolist()))

# For every node collect the nodes it can reach and the nodes that can reach it.
# Edges run sourceId -> destinationId, so the nodes reachable from a concept
# (nx.descendants) fill the 'Parents' column, and the nodes that can reach it
# (nx.ancestors) fill the 'Children' column.
node_list = []
for node in G.nodes:
    parent_ids = list(nx.descendants(G, node))  # Nodes reachable from 'node'
    child_ids = list(nx.ancestors(G, node))  # Nodes that can reach 'node'
    node_list.append([node, parent_ids, child_ids])

# Create DataFrame from the list
hierarchies_df = pd.DataFrame(node_list, columns=['Node', 'Parents', 'Children'])


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 366908/366908 [00:00<00:00, 2480330.75it/s]


In [204]:
# Preview the first rows of the hierarchy table (columns: Node, Parents, Children).
hierarchies_df.head()

Unnamed: 0,Node,Parents,Children
0,10000006,"[302292003, 404684003, 9972008, 298705000, 222...",[]
1,29857009,"[302292003, 404684003, 298705000, 22253000, 10...","[279019008, 83264000, 279038004, 1264062004, 3..."
2,9972008,"[404684003, 22253000, 102957003, 106147001, 13...","[10000006, 427653003, 427365005, 426469008, 12..."
3,134035007,"[421371008, 312779009, 69536005, 47173002, 281...",[]
4,84371003,"[421371008, 312779009, 69536005, 47173002, 281...",[134035007]


In [205]:
def get_description(snomed_obj, id):
    """Look up the description stored for a concept id.

    Parameters:
    - snomed_obj: object indexable by concept id; the indexed entry must in turn
      be indexable by the key 'desc'.
    - id: the concept id to look up.

    Returns:
    - The value found under snomed_obj[id]['desc'], or the string 'Unknown' when
      either lookup raises KeyError.
    """
    try:
        entry = snomed_obj[id]
        description = entry['desc']
    except KeyError:
        description = 'Unknown'
    return description

# Add a description column next to each id column (Node, Parents, Children).
hierarchies_df['Node_Desc'] = hierarchies_df['Node'].apply(
    lambda concept_id: get_description(snomed, concept_id)
)
# Parents and Children hold lists of ids, so each list is translated element by element.
for id_column in ('Parents', 'Children'):
    hierarchies_df[f'{id_column}_Desc'] = hierarchies_df[id_column].apply(
        lambda concept_ids: [get_description(snomed, concept_id) for concept_id in concept_ids]
    )


In [206]:
# Persist the hierarchy table as CSV. This cell precedes the rename of 'Node' to
# 'snomed_termid', so the saved file keeps the 'Node' column name.
hierarchies_df.to_csv("data/snomed/disorder_substance_snomed_node_hierarchies.csv")


In [207]:
# Rename the key column to 'snomed_termid', the name process_hierarchy merges on.
node_column_renames = {"Node": "snomed_termid"}
hierarchies_df.rename(columns=node_column_renames, inplace=True)
hierarchies_df.head()

Unnamed: 0,snomed_termid,Parents,Children,Node_Desc,Parents_Desc,Children_Desc
0,10000006,"[302292003, 404684003, 9972008, 298705000, 222...",[],Radiating chest pain,"[Finding of trunk structure, Clinical finding ...",[]
1,29857009,"[302292003, 404684003, 298705000, 22253000, 10...","[279019008, 83264000, 279038004, 1264062004, 3...",Chest pain,"[Finding of trunk structure, Clinical finding ...","[Central crushing chest pain, Epidemic pleurod..."
2,9972008,"[404684003, 22253000, 102957003, 106147001, 13...","[10000006, 427653003, 427365005, 426469008, 12...",Radiating pain,"[Clinical finding (finding), Pain, Neurologica...","[Radiating chest pain, Pain radiating to right..."
3,134035007,"[421371008, 312779009, 69536005, 47173002, 281...",[],Entire stylomastoid foramen,"[Structure of lateral half of head, Bone struc...",[]
4,84371003,"[421371008, 312779009, 69536005, 47173002, 281...",[134035007],Stylomastoid foramen,"[Structure of lateral half of head, Bone struc...",[Entire stylomastoid foramen]


In [65]:
dict_canoncial_json_file_path = './data/snomed/mapping_dictionaries/disorder_substance_canonical_dict.json'

# Read the canonical-form dictionary from the JSON file. The encoding is given
# explicitly so the result does not depend on the platform default.
# process_hierarchy looks entries up by the string form of a SNOMED term id.
with open(dict_canoncial_json_file_path, 'r', encoding='utf-8') as json_file:
    canonical_mapping_dict = json.load(json_file)

# Load and Map SNOMED Linked Data

In [None]:
# Base directory of the SNOMED-linking outputs; the annotation CSVs are read from
# here and the hierarchy-mapped results are written below it.
data_path = Path("data/annotated_aact/snomed_linking_outputs/")


In [None]:
from typing import List, Dict, Callable, Set
def convert_to_int_list(lst):
    """Normalise a cell holding a collection of ids into a list of ints.

    Parameters:
    - lst: one of
        * None, pd.NA or a float NaN (e.g. a missing value after a left merge),
        * a string holding a Python literal collection such as "[1, 2]"
          (e.g. a list column read back from CSV),
        * an iterable of values convertible with int() (list, tuple, array, ...).

    Returns:
    - list of int; an empty list for missing values and blank strings.

    Raises:
    - ValueError / SyntaxError if a non-blank string is not a valid Python literal.
    - TypeError if the (parsed) value is not iterable.
    """
    # Missing values: None, pandas' NA singleton, or any float NaN.
    if lst is None or lst is pd.NA:
        return []
    if isinstance(lst, (float, np.floating)) and np.isnan(lst):
        return []
    if isinstance(lst, str):
        text = lst.strip()
        if not text:
            return []
        # literal_eval only evaluates Python literals, never arbitrary code.
        lst = ast.literal_eval(text)
    return [int(item) for item in lst]
        
def process_hierarchy(
    df: pd.DataFrame, 
    hierarchies_df: pd.DataFrame, 
    entity_type: str, 
    model_prefix: str,
    col_name_prefix_with_original_annotations: str,
    canonical_mapping_dict: Dict[str, str], 
    excluded_nodes: Set[int]
) -> pd.DataFrame:
    """
    Explodes the '|'-separated per-trial annotations into one row per linked entity,
    merges each row with its entry in the hierarchy table and derives a top-level
    concept for it.

    Neither `df` nor `hierarchies_df` is modified.

    Parameters:
    - df (pd.DataFrame): Input frame with an 'nct_id' column and the four
      '|'-separated columns
      '{col_name_prefix_with_original_annotations}_{entity_type}',
      '{model_prefix}_snomed_termid_{entity_type}',
      '{model_prefix}_snomed_term_canonical_{entity_type}' and
      '{model_prefix}_cdist_{entity_type}'.
    - hierarchies_df (pd.DataFrame): Hierarchy table with a 'snomed_termid' column and a
      'Parents' column (a list of ids, or its string representation).
    - entity_type (str): The type of entity to process (e.g., "conditions").
    - model_prefix (str): The prefix used for model-specific column names.
    - col_name_prefix_with_original_annotations (str): The prefix of the column holding
      the original annotations.
    - canonical_mapping_dict (Dict[str, str]): Maps the string form of a SNOMED term id
      to its canonical form.
    - excluded_nodes (Set[int]): SNOMED term IDs that must never be used as parents.

    Returns:
    - pd.DataFrame: One row per linked entity with the merged hierarchy columns plus
      'Filtered_Parents', 'Top_Concept', 'Top_Concept_Canonical' and
      'Top_Concept_Canonical_First'.
    """
    termid_col = f'{model_prefix}_snomed_termid_{entity_type}'
    canonical_col = f'{model_prefix}_snomed_term_canonical_{entity_type}'
    columns_to_explode = [
        f'{col_name_prefix_with_original_annotations}_{entity_type}',
        termid_col,
        canonical_col,
        f'{model_prefix}_cdist_{entity_type}',
    ]

    # Work on a copy restricted to the needed columns so the caller's frame is untouched.
    df_to_map = df[['nct_id'] + columns_to_explode].copy()
    for col in columns_to_explode:
        # Split on '|'; missing cells become empty lists. str() keeps cells that were
        # parsed as numbers (e.g. a column of single distances) from crashing on .split.
        df_to_map[col] = df_to_map[col].apply(lambda x: str(x).split('|') if pd.notnull(x) else [])

    # One row per entity; rows that came from empty lists have no term id and are dropped.
    df_exploded = (
        df_to_map
        .explode(columns_to_explode)
        .reset_index(drop=True)
        .dropna(subset=[termid_col])
        .rename(columns={termid_col: 'snomed_termid'})
    )

    # Merge mapped annotations with the hierarchy table on a string key. The key is
    # cast on a derived frame (assign) instead of writing into the caller's
    # hierarchies_df, which is reused across calls.
    df_exploded['snomed_termid'] = df_exploded['snomed_termid'].astype(str)
    hierarchies_keyed = hierarchies_df.assign(snomed_termid=hierarchies_df['snomed_termid'].astype(str))
    df_flat_with_hierarchy = pd.merge(df_exploded, hierarchies_keyed, on='snomed_termid', how='left')

    # Make sure the ids are integers for faster processing (unmatched rows get []).
    df_flat_with_hierarchy['Parents'] = df_flat_with_hierarchy['Parents'].apply(convert_to_int_list)

    # All nodes present in the current frame, minus the generic entities that should
    # not be used as parents.
    node_set = set(map(int, df_flat_with_hierarchy['snomed_termid'].dropna())) - set(excluded_nodes)

    # Keep only the parents that themselves occur in the current frame.
    df_flat_with_hierarchy['Filtered_Parents'] = df_flat_with_hierarchy['Parents'].apply(
        lambda x: [item for item in x if item in node_set]
    )

    # Rows without any retained parent are at the highest level of their hierarchy tree.
    has_no_parent = df_flat_with_hierarchy['Filtered_Parents'].apply(lambda x: len(x) == 0)
    main_parent_nodes = df_flat_with_hierarchy[has_no_parent]['snomed_termid']
    main_parent_nodes_set = set(map(int, main_parent_nodes))

    df_flat_with_hierarchy['Top_Concept'] = df_flat_with_hierarchy['Filtered_Parents'].apply(
        lambda x: [node for node in x if node in main_parent_nodes_set] if len(x) > 1 else x # if there is already only one parent node, keep it
    )

    df_flat_with_hierarchy['Top_Concept_Canonical'] = df_flat_with_hierarchy['Top_Concept'].apply(
        lambda x: [canonical_mapping_dict.get(str(node), "Canonical form not found") for node in x]
    )
    # First element of Top_Concept_Canonical, or None when the list is empty.
    df_flat_with_hierarchy['Top_Concept_Canonical_First'] = df_flat_with_hierarchy['Top_Concept_Canonical'].apply(
        lambda x: x[0] if x else None
    )
    # Rows without a top concept fall back to the entity's own canonical term.
    df_flat_with_hierarchy['Top_Concept_Canonical_First'] = df_flat_with_hierarchy['Top_Concept_Canonical_First'].fillna(
        df_flat_with_hierarchy[canonical_col]
    )

    return df_flat_with_hierarchy

In [None]:
def save_mapped_relevant_cols_and_aggregated(df_flat_with_hierarchy, columns_to_save, model_prefix, entity_type, output_dir=None):
    """Write the flat, the multi-top-concept and the per-trial aggregated mappings as CSV.

    Three files are written into '<output_dir>/mapped_to_hierarchy/' (the directory is
    created if missing):
    - '..._flat_<n>.csv': `columns_to_save` for every row; <n> is the number of
      distinct nct_id values.
    - '..._multi_top_flat_<n>.csv': the same columns for rows whose 'Top_Concept' has
      more than one element; <n> is the number of distinct nct_id values among them.
    - '..._aggregated_<n>.csv': one row per nct_id with the unique canonical terms and
      unique 'Top_Concept_Canonical_First' values joined by '|'; <n> is the row count.

    Parameters:
    - df_flat_with_hierarchy (pd.DataFrame): needs the columns 'nct_id', 'Top_Concept',
      'Top_Concept_Canonical_First', '{model_prefix}_snomed_term_canonical_{entity_type}'
      and everything listed in `columns_to_save`.
    - columns_to_save (list): columns written to the two flat files.
    - model_prefix (str), entity_type (str): used in column and file names.
    - output_dir (path-like, optional): base output directory; defaults to the
      notebook-level `data_path`.

    Returns:
    - None
    """
    # Fall back to the notebook-level `data_path` only when no directory is passed.
    base_dir = Path(data_path if output_dir is None else output_dir)
    target_dir = base_dir / "mapped_to_hierarchy"
    # DataFrame.to_csv does not create missing parent directories.
    target_dir.mkdir(parents=True, exist_ok=True)

    hierarchies_multiple_top = df_flat_with_hierarchy[df_flat_with_hierarchy['Top_Concept'].apply(lambda x: len(x) > 1)]
    number_mapped_trials = len(set(df_flat_with_hierarchy['nct_id']))
    number_mapped_trials_multi = len(set(hierarchies_multiple_top['nct_id']))

    df_flat_with_hierarchy[columns_to_save].to_csv(target_dir / f"hierarchical_mapping_to_snomed_{model_prefix}_{entity_type}_flat_{number_mapped_trials}.csv")
    hierarchies_multiple_top[columns_to_save].to_csv(target_dir / f"hierarchical_mapping_to_snomed_{model_prefix}_{entity_type}_multi_top_flat_{number_mapped_trials_multi}.csv")

    # AGGREGATED VERSION
    hierarchies_to_use = df_flat_with_hierarchy[['nct_id', f'{model_prefix}_snomed_term_canonical_{entity_type}', 'Top_Concept_Canonical_First']]

    # Group by 'nct_id' and concatenate the unique values with "|". Missing values are
    # skipped and values are cast to str, because str.join raises on NaN / non-strings.
    df_cond_mapped = (
        hierarchies_to_use
        .groupby('nct_id')
        .agg(lambda x: '|'.join(x.dropna().astype(str).unique()))
        .reset_index()
    )
    df_cond_mapped.to_csv(target_dir / f"hierarchical_mapping_to_snomed_{model_prefix}_{entity_type}_aggregated_{len(df_cond_mapped)}.csv")


## BioLinkBERT

In [192]:
# Settings for the BioLinkBERT run: prefixes used to build column and file names,
# and the entity types to process.
model_prefix = 'linkbert'
col_name_prefix_with_original_annotations = 'canonical_BioLinkBERT-base'
entity_types = ['conditions', 'interventions']
entity_type = entity_types[0]

In [193]:
# Columns kept from each annotation file.
interventions_columns = [
    'nct_id',
    'canonical_BioLinkBERT-base_interventions',
    'linkbert_snomed_term_interventions',
    'linkbert_snomed_termid_interventions',
    'linkbert_snomed_term_canonical_interventions',
    'linkbert_cdist_interventions',
]
conditions_columns = [
    'nct_id',
    'canonical_BioLinkBERT-base_conditions',
    'linkbert_snomed_term_conditions',
    'linkbert_snomed_termid_conditions',
    'linkbert_snomed_term_canonical_conditions',
    'linkbert_cdist_conditions',
]

interventions_csv = data_path / "sapbert_normalized_annotations_linkbert_19632_interventions.csv"
conditions_csv = data_path / "sapbert_normalized_annotations_linkbert_19632_conditions.csv"

df_linkbert_interv = pd.read_csv(interventions_csv)[interventions_columns]
df_linkbert_cond = pd.read_csv(conditions_csv)[conditions_columns]

# Same order as `entity_types` (conditions first), since the two lists are zipped later.
dfs_to_process = [df_linkbert_cond, df_linkbert_interv]

In [190]:
# Show the (rows, columns) shape of the conditions and interventions frames.
df_linkbert_cond.shape, df_linkbert_interv.shape

((19632, 6), (19632, 6))

In [170]:
# Very generic SNOMED concepts that are excluded, so that process_hierarchy never
# uses them as a parent / top concept.
excluded_nodes = {
    118940003,  # Disorder of nervous system (disorder)
    49601007,   # Disorder of cardiovascular system (disorder)
    62914000,   # Cerebrovascular disease (disorder)
    127294003,  # Traumatic or nontraumatic brain injury (disorder)
    128139000,  # Inflammatory disorder (disorder)
    52448006,   # Dementia (disorder)
    64572001,   # Disease (disorder)
    362975008,  # Degenerative disorder (disorder)
    363171009,  # Inflammation of specific body systems (disorder)
    23853001,   # Disorder of the central nervous system (disorder)
    414029004,  # Disorder of immune function (disorder)
    6118003,    # Demyelinating disease of central nervous system
    54767005,   # Disorder of visual pathways (disorder)
    39367000,   # Inflammatory disease of the central nervous system
    386033004,  # Neuropathy (disorder)
    81308009,   # Disorder of brain (disorder)
    52522001,   # Degenerative brain disorder (disorder)
    74732009,   # Mental disorder (disorder)
    105590001,  # Substance (substance)
    410942007,  # Drug or medicament (substance)
    69322001,   # Psychotic disorder (disorder)
    302049001,  # Sequelae of disorders (disorder)
    70835005,   # Disorder of basal ganglia (disorder)
    60342002,   # Movement disorder (disorder)
    782964007,  # Genetic disease (disorder)
    115668003,  # Biological substance (substance)
    260786008,  # Natural material (substance)
    76349003,   # Extrapyramidal disease (disorder)
    301766008,  # Lesion of brain (disorder)
    417163006,  # Traumatic or non-traumatic injury (disorder)
    82271004,   # Injury of head (disorder)
    128239009,  # Injury of nervous system (disorder)
    128126004,  # Injury of central nervous system (disorder)
}


In [195]:
# Map every entity type to the SNOMED hierarchy and write the resulting CSV files.
for entity_type, df_to_process in zip(entity_types, dfs_to_process):
    print("Processing: ", entity_type)

    # Map to hierarchy
    df_flat_with_hierarchy = process_hierarchy(
        df=df_to_process,
        hierarchies_df=hierarchies_df,
        entity_type=entity_type,
        model_prefix=model_prefix,
        col_name_prefix_with_original_annotations=col_name_prefix_with_original_annotations,
        canonical_mapping_dict=canonical_mapping_dict,
        excluded_nodes=excluded_nodes,
    )

    # Columns written to the flat output files.
    columns_to_save = [
        'nct_id',
        f'{col_name_prefix_with_original_annotations}_{entity_type}',
        'snomed_termid',
        f'{model_prefix}_snomed_term_canonical_{entity_type}',
        f'{model_prefix}_cdist_{entity_type}',
        'Top_Concept_Canonical',
        'Top_Concept_Canonical_First',
    ]

    # Save results
    save_mapped_relevant_cols_and_aggregated(
        df_flat_with_hierarchy, columns_to_save, model_prefix, entity_type
    )

Processing:  conditions
Processing:  interventions


# Explore mapping dictionaries

In [662]:
# NOTE(review): `merged_biolinkbert` is not defined in the visible part of this
# notebook -- it must exist in the kernel before this cell runs.
inspect_columns = [
    'nct_id',
    'canonical_BioLinkBERT-base_interventions',
    'linkbert_snomed_term_norminterventions',
    'linkbert_cdist_interventions',
    'canonical_BioLinkBERT-base_conditions_condition',
    'linkbert_snomed_term_norm_condition',
    'linkbert_cdist_condition',
]
merged_linkbert_to_inspect = merged_biolinkbert[inspect_columns]

# Keep only rows where neither distance column holds the '{}' placeholder.
condition_not_placeholder = merged_linkbert_to_inspect['linkbert_cdist_condition'] != '{}'
intervention_not_placeholder = merged_linkbert_to_inspect['linkbert_cdist_interventions'] != '{}'
merged_linkbert_to_inspect = merged_linkbert_to_inspect[condition_not_placeholder & intervention_not_placeholder]

# Create a mask for filtering
def has_positive_value(value):
    """Tell whether a distance cell contains at least one value greater than zero.

    Parameters:
    - value: a '|'-separated string of numbers (e.g. "0.0|7.41"), a single number,
      or anything else.

    Returns:
    - bool: True if any number in the cell is > 0. Blank tokens and the '{}'
      placeholder are ignored; booleans and unsupported types return False.

    Raises:
    - ValueError if a string token is neither blank, '{}', nor parseable as a float.
    """
    # bool is a subclass of int; it is not a distance, so it is never "positive".
    if isinstance(value, bool):
        return False
    if isinstance(value, str):
        tokens = (token.strip() for token in value.split('|'))
        return any(float(token) > 0.0 for token in tokens if token not in ('', '{}'))
    # Cells parsed as numbers may arrive as Python or numpy ints/floats.
    # NaN compares False against 0.0, so missing values yield False.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return bool(value > 0.0)
    return False

# A row is kept when at least one of its intervention or condition distances is
# greater than zero.
intervention_has_positive = merged_linkbert_to_inspect['linkbert_cdist_interventions'].apply(has_positive_value)
condition_has_positive = merged_linkbert_to_inspect['linkbert_cdist_condition'].apply(has_positive_value)
mask = intervention_has_positive | condition_has_positive

# Apply the mask, save the selected rows and preview them.
filtered_df_linkbert_low_confidence = merged_linkbert_to_inspect[mask]
filtered_df_linkbert_low_confidence.to_csv("data/annotated_aact/sapbert_snomed_linkbert_low_confidence_cases.csv")
filtered_df_linkbert_low_confidence.head()

Unnamed: 0,nct_id,canonical_BioLinkBERT-base_interventions,linkbert_snomed_term_norminterventions,linkbert_cdist_interventions,canonical_BioLinkBERT-base_conditions_condition,linkbert_snomed_term_norm_condition,linkbert_cdist_condition
0,NCT00000117,intravenous immunoglobulin|ivig,Administration of immunoglobulin by intravenou...,6.7164|9.2843,multiple sclerosis|optic neuritis,Multiple sclerosis|Optic neuritis,0.0|0.0
1,NCT00000146,corticosteroid,Corticosteroid and/or corticosteroid derivativ...,7.4138,multiple sclerosis|optic neuritis,Multiple sclerosis|Optic neuritis,0.0|0.0
2,NCT00000147,corticosteroid,Corticosteroid and/or corticosteroid derivativ...,7.4138,multiple sclerosis|optic neuritis,Multiple sclerosis|Optic neuritis,0.0|0.0
3,NCT00000151,acetylsalicylic acid|aspirin,Aspirin (substance)|Aspirin (substance),0.0|0.0,blindness|diabetes mellitus|diabetic retinopat...,Legal blindness|Diabetes mellitus|Diabetic ret...,8.1673|0.0|0.0|7.5725|7.2826
4,NCT00000170,atropine,Atropine (substance),0.0,amblyopia|anisometropia|moderate amblyopia|str...,Amblyopia (disorder)|Anisometropia|Moderate vi...,0.0|0.0|10.2775|0.0


In [639]:
import json

# JSON dictionary whose keys become the 'Mapped to SNOMED Term' column and whose
# values (the mappings of that term) become the 'Entities' column.
file_path = 'data/snomed/mapping_dictionaries/linkbert_combined_norm_to_term_dict_19607.json'

# Read the JSON data from the file; the encoding is explicit so the result does not
# depend on the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# One table row per term, together with the number of mappings it has.
table_data = [
    {'Mapped to SNOMED Term': term, 'Entities': mappings, 'Number of Mappings': len(mappings)}
    for term, mappings in json_data.items()
]

# Fixing the columns keeps the expected schema even when the dictionary is empty.
df = pd.DataFrame(table_data, columns=['Mapped to SNOMED Term', 'Entities', 'Number of Mappings'])

In [61]:
# Order the terms by how many mappings they have, largest first, and display the result.
df_sorted = df.sort_values(by='Number of Mappings', ascending=False)
df_sorted

Unnamed: 0,Mapped to SNOMED Term,Entities,Number of Mappings
6119,Sensory disorder of smell and/or taste,"[disruptions in taste and smell function, disr...",4
4271,Myasthenia gravis with exacerbation (disorder),"[myasthenia gravis exacerbation, mg exacerbati...",4
1813,Abdominal migraine,"[fasting-induced migraine, fasting-induced mig...",4
203,Stiff-man syndrome,"[stiff-person syndrome, glycine receptor antib...",4
950,Impaired ability to learn new material,"[new learning and memory deficits, deficits in...",3
...,...,...,...
2236,Maternal infection,[intrauterine infection],1
2235,Psychomotor agitation,[extreme agitation state],1
2234,Kabuki make-up syndrome,[kabuki syndrome],1
2233,Osteosarcoma,[osteosarcoma],1


In [62]:
# Save the sorted per-term mapping statistics as CSV.
df_sorted.to_csv("data/annotated_aact/sapbert_snomed_linkbert_statistics.csv")