In [1]:
import pandas as pd
from functools import reduce
from snomed_graph import *

In [2]:
# National Extensions
# Each extension contributes two tab-separated release files: a language reference
# set (*_REFSET_PATH) and a description file (*_DESCRIPTION_PATH).
# Note: the Korean paths point at a Delta release, whereas the Swedish, Dutch and
# Estonian paths point at Snapshot releases.
KOREAN_REFSET_PATH = "./data/snomed_extensions/snomed-MAIN_SNOMEDCT-KR-20240611/SnomedCT_Export/Delta/Refset/Language/der2_cRefset_Language21000267104Delta_KR_20240611.txt"
KOREAN_DESCRIPTION_PATH = "./data/snomed_extensions/snomed-MAIN_SNOMEDCT-KR-20240611/SnomedCT_Export/Delta/Terminology/sct2_Description_Delta_KR_20240611.txt"
SWEDISH_REFSET_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceSE_PRODUCTION_SE1000052_20240531T120000Z/Snapshot/Refset/Language/der2_cRefset_LanguageSnapshot-sv_SE1000052_20240531.txt"
SWEDISH_DESCRIPTION_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceSE_PRODUCTION_SE1000052_20240531T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-sv_SE1000052_20240531.txt"
DUTCH_REFSET_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceNL_PRODUCTION_NL1000146_20240331T120000Z/Snapshot/Refset/Language/der2_cRefset_LanguageSnapshot-nl_NL1000146_20240331.txt"
DUTCH_DESCRIPTION_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceNL_PRODUCTION_NL1000146_20240331T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-nl_NL1000146_20240331.txt"
ESTONIAN_REFSET_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceEE_PRODUCTION_EE1000181_20240530T120000Z/Snapshot/Refset/Language/der2_cRefset_LanguageSnapshot-et_EE1000181_20240530.txt"
ESTONIAN_DESCRIPTION_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceEE_PRODUCTION_EE1000181_20240530T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-et_EE1000181_20240530.txt"

# Serialized graph containing international edition
# (lives outside ./data, one directory above the notebook)
SNOMED_GRAPH_PATH = "../snomed_graph/full_concept_graph.gml"

# Where we store a pivot table of summarised results
# (written relative to the notebook's working directory)
SUMMARY_PATH = "available_translated_concepts_summary.csv"

# Single file containing all translations
# (one row per translated concept and language)
ALL_TRANSLATIONS_PATH = "./data/prepared_translation_data/all_translations.csv"

# Sample of concepts to translate
# (written by the sampling cell and read back at the end of the notebook)
SAMPLE_PATH = "./data/prepared_translation_data/samples.csv"

In [2]:
# Deserialise the concept graph stored at SNOMED_GRAPH_PATH.  Every later cell reads
# `G` for membership tests, hierarchy traversal and concept look-ups.
G = SnomedGraph.from_serialized(SNOMED_GRAPH_PATH)

SNOMED graph has 361179 vertices and 1179749 edges


In [33]:
def load_translation(G, desc_path, lang_path):
    """Load the preferred, non-English terms of one extension, one row per concept.

    Parameters
    ----------
    G : container of concept ids
        Anything that supports ``sctid in G``; descriptions whose concept is not
        contained in it are dropped.
    desc_path : str or file-like
        Tab-separated description file.  Columns used: ``id``, ``active``,
        ``conceptId``, ``languageCode``, ``typeId`` and ``term``.
    lang_path : str or file-like
        Tab-separated language reference set file.  Columns used:
        ``referencedComponentId``, ``acceptabilityId`` and, when present, ``active``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sctid`` with a single ``translations`` column that holds the
        list of preferred terms found for that concept.
    """
    preferred_acceptability_id = 900000000000548007
    fsn_type_id = 900000000000003001

    desc_df = pd.read_csv(desc_path, delimiter="\t", encoding="utf-8")
    lang_df = pd.read_csv(lang_path, delimiter="\t", encoding="utf-8")

    # A description is a preferred term only while its refset membership is active;
    # a retired membership row still carries the old acceptabilityId and must be ignored.
    is_preferred = lang_df.acceptabilityId == preferred_acceptability_id
    if "active" in lang_df.columns:
        is_preferred = is_preferred & (lang_df.active == 1)
    preferred_description_ids = lang_df.loc[is_preferred, "referencedComponentId"].unique()

    keep = (
        (desc_df.active == 1)                           # active descriptions only
        & desc_df.id.isin(preferred_description_ids)    # preferred terms only
        & (desc_df.typeId != fsn_type_id)               # no fully specified names
        & (desc_df.languageCode != "en")                # some extensions ship English terms
    )
    desc_df = desc_df.loc[keep]

    # Keep only concepts that exist in the graph.  The mask is a boolean Series (not a
    # plain list) so that an empty frame is not misread as an empty column selection.
    in_graph = pd.Series(
        [sctid in G for sctid in desc_df.conceptId],
        index=desc_df.index,
        dtype=bool,
    )
    desc_df = desc_df.loc[in_graph].rename(columns={"conceptId": "sctid"})

    # One row per concept, with its terms aggregated into a list.
    return desc_df.groupby("sctid").term.apply(list).rename("translations").to_frame()

In [34]:
# Estonian extension: description file first, language refset second.
# (The config cell defines ESTONIAN_DESCRIPTION_PATH; there is no ESTONIAN_LANGUAGE_PATH.)
ee_df = load_translation(G, ESTONIAN_DESCRIPTION_PATH, ESTONIAN_REFSET_PATH)
print(ee_df.shape[0])
ee_df.sample(3)

16684


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
2472002,[Anuuria]
129252005,[Suprapuubiline tsüstostoomia]
65266007,"[Süva ingvinaalne lümfisõlm, struktuur]"


In [35]:
# Dutch extension: number of translated concepts, then three random rows as a sanity check.
nl_df = load_translation(
    G,
    desc_path=DUTCH_DESCRIPTION_PATH,
    lang_path=DUTCH_REFSET_PATH,
)
print(len(nl_df))
nl_df.sample(n=3)

264396


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
167719009,[lymfocytose van liquor cerebrospinalis]
372537003,[topo-isomeraseremmer]
387527005,[pralidoxime]


In [36]:
# Swedish extension: number of translated concepts, then three random rows as a sanity check.
se_df = load_translation(
    G,
    desc_path=SWEDISH_DESCRIPTION_PATH,
    lang_path=SWEDISH_REFSET_PATH,
)
print(len(se_df))
se_df.sample(n=3)

346723


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
123001009,[komplementfaktor C3 nefritfaktor]
723478003,[Oceanobacillus neutriphilus]
782598008,[allergi mot trifenylmetantriisocyanat]


In [37]:
# Korean extension: number of translated concepts, then three random rows as a sanity check.
kr_df = load_translation(
    G,
    desc_path=KOREAN_DESCRIPTION_PATH,
    lang_path=KOREAN_REFSET_PATH,
)
print(len(kr_df))
kr_df.sample(n=3)

23552


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
48724000,[승모판 폐쇄부전]
195168007,[뇌실내 출혈을 동반한 뇌내 출혈]
19138001,[사마귀 표피 형성 이상]


In [38]:
# Language name -> frame of preferred terms (indexed by sctid) for that language.
translations = dict(
    Korean=kr_df,
    Dutch=nl_df,
    Swedish=se_df,
    Estonian=ee_df,
)

In [None]:
def count_translations():
    """Yield translation coverage for every concept four levels below the root.

    Reads the notebook-level ``G`` and ``translations``.  Walks four nested levels of
    ``G.get_descendants(<id>, 1)`` starting at ``G.root_concept_id`` (presumably the
    direct children at each step -- confirm against ``snomed_graph``).  For each
    level-4 concept it collects the ids returned by ``G.get_descendants(<id>)`` that
    have not already been attributed to an earlier level-4 concept, and yields one
    record per language with:

    * ``great grandparent`` / ``grandparent`` / ``parent`` / ``concept``: the concept
      objects on the path,
    * ``num_children``: how many not-yet-seen descendants were attributed,
    * ``language``: the key from ``translations``,
    * ``translations``: how many of those descendants appear in that language's index.
    """
    translated_sets = {
        lang: set(frame.index.values) for lang, frame in translations.items()
    }
    seen = set()
    for c1 in tqdm(G.get_descendants(G.root_concept_id, 1)):
        for c2 in G.get_descendants(c1.sctid, 1):
            for c3 in G.get_descendants(c2.sctid, 1):
                for c4 in G.get_descendants(c3.sctid, 1):
                    # It's a multi-hierarchy: a descendant reachable through several
                    # paths is attributed to the first level-4 concept that reaches it.
                    descendants = {c.sctid for c in G.get_descendants(c4.sctid)} - seen
                    # In-place union: avoids copying the whole `seen` set every time.
                    seen |= descendants
                    for lang, translated in translated_sets.items():
                        yield {
                            "great grandparent": c1,
                            "grandparent": c2,
                            "parent": c3,
                            "concept": c4,
                            "num_children": len(descendants),
                            "language": lang,
                            "translations": len(descendants.intersection(translated))
                        }

# Materialise the generator: one row per (level-4 concept, language) pair.
# NOTE(review): the name `df` is reassigned to a different frame by a later cell.
df = pd.DataFrame(count_translations())

In [None]:
# Reshape to one row per concept path with one column per language, then export.
df.pivot(
    values="translations",
    columns="language",
    index=["great grandparent", "grandparent", "parent", "concept", "num_children"],
).to_csv("analysis.csv")

In [None]:
# Fully specified names of the attribute (relationship type) concepts treated as
# "important".  Entries that are commented out are left out of the set.
# NOTE(review): no other cell visible in this notebook references this set --
# calc_concept_tiers checks every inferred relationship rather than filtering on it,
# although its Tier 2 comment speaks of "important defining attributes".  Confirm
# whether the filter was meant to be applied there.
important_attributes = {
    # 'Access (attribute)',
    # 'After (attribute)',
    'Associated finding (attribute)',
    'Associated morphology (attribute)',
    'Associated procedure (attribute)',
    'Associated with (attribute)',
    'Before (attribute)',
    'Causative agent (attribute)',
    'Characterizes (attribute)',
    # 'Clinical course (attribute)',
    'Component (attribute)',
    'Direct device (attribute)',
    'Direct morphology (attribute)',
    'Direct site (attribute)',
    'Direct substance (attribute)',
    'Due to (attribute)',
    'During (attribute)',
    # 'Finding context (attribute)',
    'Finding informer (attribute)',
    'Finding method (attribute)',
    'Finding site (attribute)',
    'Has absorbability (attribute)',
    'Has active ingredient (attribute)',
    'Has basic dose form (attribute)',
    'Has basis of strength substance (attribute)',
    'Has coating material (attribute)',
    'Has compositional material (attribute)',
    'Has concentration strength denominator unit (attribute)',
    'Has concentration strength numerator unit (attribute)',
    'Has device intended site (attribute)',
    'Has disposition (attribute)',
    'Has dose form administration method (attribute)',
    'Has dose form intended site (attribute)',
    'Has dose form release characteristic (attribute)',
    'Has dose form transformation (attribute)',
    'Has filling (attribute)',
    'Has focus (attribute)',
    'Has ingredient qualitative strength (attribute)',
    'Has intent (attribute)',
    # 'Has interpretation (attribute)',
    'Has manufactured dose form (attribute)',
    'Has precise active ingredient (attribute)',
    'Has presentation strength denominator unit (attribute)',
    'Has presentation strength numerator unit (attribute)',
    'Has realization (attribute)',
    'Has specimen (attribute)',
    'Has state of matter (attribute)',
    'Has surface texture (attribute)',
    'Has target population (attribute)',
    'Has unit of presentation (attribute)',
    'Indirect device (attribute)',
    'Indirect morphology (attribute)',
    'Inherent location (attribute)',
    'Inheres in (attribute)',
    'Interprets (attribute)',
    # 'Is a (attribute)',
    'Is modification of (attribute)',
    'Is sterile (attribute)',
    'Laterality (attribute)',
    'Measurement method (attribute)',
    'Method (attribute)',
    'Occurrence (attribute)',
    'Pathological process (attribute)',
    'Plays role (attribute)',
    'Precondition (attribute)',
    'Priority (attribute)',
    'Procedure context (attribute)',
    'Procedure device (attribute)',
    'Procedure morphology (attribute)',
    'Procedure site (attribute)',
    'Procedure site - Direct (attribute)',
    'Procedure site - Indirect (attribute)',
    'Process acts on (attribute)',
    'Process duration (attribute)',
    'Process extends to (attribute)',
    'Process output (attribute)',
    'Property (attribute)',
    'Recipient category (attribute)',
    'Relative to (attribute)',
    'Relative to part of (attribute)',
    'Revision status (attribute)',
    'Route of administration (attribute)',
    # 'Scale type (attribute)',
    # 'Severity (attribute)',
    'Specimen procedure (attribute)',
    'Specimen source identity (attribute)',
    'Specimen source morphology (attribute)',
    'Specimen source topography (attribute)',
    'Specimen substance (attribute)',
    # 'Subject relationship context (attribute)',
    'Surgical approach (attribute)',
    'Technique (attribute)',
    # 'Temporal context (attribute)',
    # 'Temporally related to (attribute)',
    # 'Time aspect (attribute)',
    # 'Units (attribute)',
    'Using access device (attribute)',
    'Using device (attribute)',
    'Using energy (attribute)',
    'Using substance (attribute)'
}


In [None]:
def calc_concept_tiers(langcodes, translations, G):
    """Split the translated concepts into three nested "context" sets.

    Parameters
    ----------
    langcodes : list
        Keys of ``translations`` to consider together.
    translations : dict
        Maps each key to a frame whose index holds the translated concept ids.
    G : graph
        Iterable of concepts (each with ``.sctid``) offering ``get_full_concept``.

    Returns
    -------
    tuple of three sets of concept ids
        ``all_translations``: concepts of ``G`` translated in every requested language.
        ``tier_1_concepts``: members of the above whose parents are all in it as well.
        ``tier_2_concepts``: Tier 1 members that have at least one inferred
        relationship group and whose every relationship target is in
        ``all_translations``.

    A short summary of the three sizes is printed as a side effect.
    """
    all_concepts = {c.sctid for c in G}

    translated_concepts = [
        set(translations[lc].index.tolist())
        for lc in langcodes
    ]

    # Distinct set of translated concepts present in all langs we are checking
    all_translations = reduce(lambda a, b: a & b, translated_concepts, all_concepts)

    print("Calculating Tier 1 Concept Set")
    # Generator expressions let all() stop at the first untranslated parent.
    tier_1_concepts = {
        c for c in tqdm(all_translations)
        if all(
            p.sctid in all_translations
            for p in G.get_full_concept(c).parents
        )
    }

    print("Calculating Tier 2 Concept Set")
    # NOTE(review): every inferred relationship is checked here; the notebook-level
    # `important_attributes` set is not consulted -- confirm that this is intended.
    tier_2_concepts = set()
    for c in tqdm(tier_1_concepts):
        # Look the concept up once and reuse its relationship groups.
        groups = G.get_full_concept(c).inferred_relationship_groups
        if len(groups) > 0 and all(
            r.tgt.sctid in all_translations
            for g in groups
            for r in g.relationships
        ):
            tier_2_concepts.add(c)

    print(f"""
        Languages: {langcodes}
        All: {len(all_translations)}
        Tier 1: {len(tier_1_concepts)}
        Tier 2: {len(tier_2_concepts)}
    """)

    return all_translations, tier_1_concepts, tier_2_concepts

In [None]:
# Language name -> its three concept sets, each language evaluated on its own.
context_tiers = {}
for lang in ("Swedish", "Korean", "Dutch", "Estonian"):
    print(f"Calculating Concept Sets for {lang}")
    at, t1, t2 = calc_concept_tiers([lang], translations, G)
    context_tiers[lang] = dict(all_translations=at, tier1=t1, tier2=t2)

In [None]:
def calc_depth_tiers(G):
    """Bucket every concept of ``G`` by the length of its ``path_to_root``.

    Returns three sets of concept ids: path lengths 1-4 ("shallow"), 5-7 ("mid")
    and 8 or more ("deep").  A concept whose path length is 0, or for which taking
    the length raises ``TypeError``, is placed in none of them.
    """
    shallow_tier, mid_tier, deep_tier = set(), set(), set()
    for concept in tqdm(iter(G), total=len(G)):
        try:
            depth = len(G.path_to_root(concept.sctid))
        except TypeError:
            # No measurable path for this concept: leave it unclassified.
            continue
        if depth < 1:
            continue
        if depth <= 4:
            bucket = shallow_tier
        elif depth <= 7:
            bucket = mid_tier
        else:
            bucket = deep_tier
        bucket.add(concept.sctid)
    return shallow_tier, mid_tier, deep_tier

# ts / tm / td: ids of the shallow / mid / deep concepts, read later by gen_sampling_df.
ts, tm, td = calc_depth_tiers(G)

In [None]:
def gen_sampling_df():
    """Yield one record per (concept, language) pair that has a translation.

    Reads the notebook-level ``G``, ``translations``, ``context_tiers`` and the depth
    sets ``ts`` / ``tm`` / ``td``.  Each record carries the concept's ``sctid``,
    ``fsn`` and ``hierarchy``, its depth label, the language, its context-tier label
    and the translated terms.  A label is ``pd.NA`` when the concept is in none of
    the corresponding sets.
    """
    depth_labels = (("Shallow", ts), ("Medium", tm), ("Deep", td))
    # Most specific context tier first, so a concept gets its strongest label.
    context_labels = (
        ("tier2", "Tier 2"),
        ("tier1", "Tier 1"),
        ("all_translations", "All Translations"),
    )

    def get_depth(sctid):
        for label, members in depth_labels:
            if sctid in members:
                return label
        return pd.NA

    def get_cxt_tier(sctid, lang):
        for key, label in context_labels:
            if sctid in context_tiers[lang][key]:
                return label
        return pd.NA

    for concept in tqdm(iter(G), total=len(G)):
        for lang in ["Swedish", "Korean", "Dutch", "Estonian"]:
            try:
                translated_synonyms = translations[lang].loc[concept.sctid].translations
            except KeyError:
                # Nothing to emit for this concept in this language.
                continue
            yield dict(
                sctid=concept.sctid,
                fsn=concept.fsn,
                hierarchy=concept.hierarchy,
                depth_tier=get_depth(concept.sctid),
                language=lang,
                context_tier=get_cxt_tier(concept.sctid, lang),
                reference_translations=translated_synonyms,
            )

# One row per translated (concept, language) pair.
# NOTE(review): this rebinds `df`, which an earlier cell used for the coverage counts.
df = pd.DataFrame(list(gen_sampling_df()))

In [None]:
# Count concepts per hierarchy / depth tier / language / context tier and save the table.
summary_df = (
    df
    .groupby(by=["hierarchy", "depth_tier", "language", "context_tier"])
    .size()
    .reset_index(name="Number of Concepts")
    .sort_values(by=["language", "hierarchy", "depth_tier", "context_tier"])
)

summary_df.to_csv(SUMMARY_PATH, index=False)

In [None]:
# Persist every translated (concept, language) row, without the positional index.
df.to_csv(ALL_TRANSLATIONS_PATH, index=False)

In [None]:
# Hierarchy labels that the sampling cell keeps; rows whose `hierarchy` value is not
# listed here are filtered out before sampling.
hierarchies_to_sample = [
    "substance",
    "body structure",
    "finding",
    "disorder",
    "procedure",
    "morphologic abnormality"
]

In [None]:
def sample_group(grp, sample_size=100, random_state=None):
    """Draw up to ``sample_size`` rows from one group, without replacement.

    Parameters
    ----------
    grp : pandas.DataFrame
        Must contain the ``sctid``, ``fsn`` and ``reference_translations`` columns.
    sample_size : int, default 100
        Upper bound on the rows returned; smaller groups are returned whole
        (in shuffled order).
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for a reproducible draw.
        The default ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, restricted to ``sctid``, ``fsn`` and
        ``reference_translations``.
    """
    sample_size = min(len(grp), sample_size)
    sample = grp.sample(n=sample_size, replace=False, random_state=random_state)
    # The column is named "reference_translations" in the frame built by gen_sampling_df.
    return sample[["sctid", "fsn", "reference_translations"]]

# Sample within every hierarchy / depth tier / language / context tier combination,
# using only the selected hierarchies and rows without missing values, then save.
sample_df = (
    df
    .loc[df.hierarchy.isin(hierarchies_to_sample)]
    .dropna()
    .groupby(by=["hierarchy", "depth_tier", "language", "context_tier"])
    .apply(sample_group)
    .reset_index()
    # "level_4" is the unnamed index level left over from the per-group frames.
    .drop(columns="level_4")
    .sort_values(by=["language", "hierarchy", "depth_tier", "context_tier"])
)

sample_df.to_csv(SAMPLE_PATH, index=False)

In [3]:
# Reload the saved sample from disk.  After the CSV round-trip the
# `reference_translations` column holds the string form of each list, not a list.
sample_df = pd.read_csv(SAMPLE_PATH)

In [25]:
# Show the reloaded sample (the notebook renders a truncated view of the frame).
sample_df

Unnamed: 0,sctid,language,hierarchy,depth_tier,context_tier,fsn,reference_translations
0,1217298007,Dutch,body structure,Deep,All Translations,Structure of soft tissue over ischial tuberosi...,['structuur van weke delen van tuber ischiadic...
1,68440005,Dutch,body structure,Deep,All Translations,Third metacarpal bone structure (body structure),['botstructuur van os metacarpi III']
2,244726006,Dutch,body structure,Deep,All Translations,Entire orbicularis oculi (body structure),['gehele musculus orbicularis oculi']
3,879995006,Dutch,body structure,Deep,All Translations,Structure of subcutaneous tissue of right butt...,['structuur van subcutaan weefsel van rechter ...
4,50939004,Dutch,body structure,Deep,All Translations,Fifth metacarpal bone structure (body structure),['structuur van os metacarpi V']
...,...,...,...,...,...,...,...
12635,372486003,Swedish,substance,Shallow,Tier 2,Substance with 5-hydroxytryptamine-3-receptor ...,['5-HT3-receptorantagonist']
12636,422751003,Swedish,substance,Shallow,Tier 2,Fusafungine (substance),['fusafungin']
12637,423259008,Swedish,substance,Shallow,Tier 2,Panitumumab (substance),['panitumumab']
12638,391836008,Swedish,substance,Shallow,Tier 2,Bephenium (substance),['befenium']
