In [1]:
from snomed_graph import *

In [2]:
# tur_inputs = tokenizer.encode("Translate to English: Aya cok dilli bir dil modelidir.", return_tensors="pt")
# tur_outputs = aya_model.generate(tur_inputs, max_new_tokens=128)
# print(tokenizer.decode(tur_outputs[0]))

In [2]:
# Load the pre-serialized SNOMED CT concept graph. Later cells read this
# global `G` for membership tests, iteration and hierarchy lookups.
G = SnomedGraph.from_serialized("../snomed_graph/full_concept_graph.gml")

SNOMED graph has 361179 vertices and 1179749 edges


In [3]:
def load_translation(G, path):
    """Load the non-English synonyms of one description file, one row per concept.

    Parameters
    ----------
    G : container supporting ``sctid in G``
        Concept graph; rows whose ``conceptId`` is not contained in it are dropped.
    path : str or file-like
        Tab-separated description file providing at least the columns
        ``active``, ``typeId``, ``languageCode``, ``conceptId`` and ``term``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sctid`` with a single ``translations`` column that holds
        the list of retained terms for that concept.
    """
    import csv  # local import: keeps this cell independent of the import cell

    # typeId of Fully Specified Names (per the original comment in this cell).
    fsn_type_id = 900000000000003001

    df = pd.read_csv(
        path,
        delimiter="\t",
        encoding="utf-8",
        # Double quotes inside a term are data, not field delimiters. With the
        # default quoting an unbalanced quote swallows the following rows.
        quoting=csv.QUOTE_NONE,
        # Only an empty field counts as missing, so terms that spell "NA",
        # "None", "null", ... are kept as text instead of becoming NaN.
        keep_default_na=False,
        na_values=[""],
    )

    keep = (
        # Active descriptions only
        (df.active == 1)
        # Remove FSNs on the assumption that each concept will retain a preferred
        # term without the hierarchy qualifier appended
        & (df.typeId != fsn_type_id)
        # Some extensions include English terms.  We don't want these.
        & (df.languageCode != "en")
    )
    df = df[keep]

    # Remove concepts that are not present in the graph. A boolean Series (not a
    # plain list) keeps this a row filter even when no rows are left; an empty
    # list would be interpreted as "select zero columns".
    in_graph = pd.Series(
        [sctid in G for sctid in df.conceptId], index=df.index, dtype=bool
    )
    df = df[in_graph]

    df = df.rename(columns={"conceptId": "sctid"})
    # One row per concept, with the synonyms aggregated into a list
    return df.groupby("sctid").term.apply(list).rename("translations").to_frame()

In [20]:
# Estonian: per-concept synonym lists from the extension's Snapshot description file.
ee_df = load_translation(G, "./SnomedCT_ManagedServiceEE_PRODUCTION_EE1000181_20240530T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-et_EE1000181_20240530.txt")
# Number of concepts that kept at least one term, followed by a random peek.
print(ee_df.shape[0])
ee_df.sample(3)

16684


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
225076000,"[Nasogastraalsondi kinnituse asendamine, Nina-..."
700085004,[Paenibacillus woosongensis]
432913003,[Slackia sp]


In [21]:
# Dutch: per-concept synonym lists from the extension's Snapshot description file.
nl_df = load_translation(G, "./SnomedCT_ManagedServiceNL_PRODUCTION_NL1000146_20240331T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-nl_NL1000146_20240331.txt")
# Number of concepts that kept at least one term, followed by a random peek.
print(nl_df.shape[0])
nl_df.sample(3)

264396


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
865908007,[onderbreking van ademhaling door vastzittend ...
465991000,"[manchet voor femurverlengstuk, manchet voor v..."
34478009,[mislukte poging tot abortus met verbruikscoag...


In [22]:
# Swedish: per-concept synonym lists from the extension's Snapshot description file.
se_df = load_translation(G, "./SnomedCT_ManagedServiceSE_PRODUCTION_SE1000052_20240531T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-sv_SE1000052_20240531.txt")
# Number of concepts that kept at least one term, followed by a random peek.
print(se_df.shape[0])
se_df.sample(3)

346723


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
303848004,[specifik angiografiundersökning]
247021003,[Wernickes hemianopsipupill]
419997008,[MR-angiografi av koronarartärer]


In [23]:
# Korean: per-concept synonym lists.
# NOTE(review): this path points at a Delta export, whereas the other three
# languages are loaded from Snapshot files - confirm the Delta holds the full
# set of Korean descriptions that should be analysed.
kr_df = load_translation(G, "./snomed-MAIN_SNOMEDCT-KR-20240611/SnomedCT_Export/Delta/Terminology/sct2_Description_Delta_KR_20240611.txt")
# Number of concepts that kept at least one term, followed by a random peek.
print(kr_df.shape[0])
kr_df.sample(3)

23552


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
212216008,[T11 흉추 신경근 손상]
197084009,[공장 게실증]
1162449005,[신생아 독성 홍반]


In [24]:
# Language label -> frame returned by load_translation (index: sctid, column:
# `translations`). The labels are the keys other cells use to select a language.
translations = {
    "Korean": kr_df, 
    "Dutch": nl_df,
    "Swedish": se_df,
    "Estonian": ee_df
}

In [9]:
def count_translations():
    """Yield translation counts per language for concepts four lookups below the root.

    Reads the notebook globals ``G`` and ``translations``. Starting at
    ``G.root_concept_id`` it nests four ``G.get_descendants(..., 1)`` calls
    (presumably "direct children only" - confirm against snomed_graph) and, for
    every concept reached at the innermost level, collects the ids of all its
    descendants that no earlier concept has claimed yet.

    Yields
    ------
    dict
        One record per (innermost concept, language) with the four concept
        objects on the path, the number of newly claimed descendants
        (``num_children``) and how many of those have a translation.
    """
    ids_by_language = {
        language: set(frame.index.values) for language, frame in translations.items()
    }
    already_claimed = set()
    for great_grandparent in tqdm(G.get_descendants(G.root_concept_id, 1)):
        for grandparent in G.get_descendants(great_grandparent.sctid, 1):
            for parent in G.get_descendants(grandparent.sctid, 1):
                for concept in G.get_descendants(parent.sctid, 1):
                    below = {d.sctid for d in G.get_descendants(concept.sctid)}
                    # It's a multi-hierarchy: a descendant is credited only to
                    # the first concept under which it is encountered.
                    unclaimed = below - already_claimed
                    already_claimed |= unclaimed
                    for language, translated_ids in ids_by_language.items():
                        yield {
                            "great grandparent": great_grandparent,
                            "grandparent": grandparent,
                            "parent": parent,
                            "concept": concept,
                            "num_children": len(unclaimed),
                            "language": language,
                            "translations": len(unclaimed.intersection(translated_ids)),
                        }

# Materialise the generator: one row per (concept, language) record.
# Note that the name `df` is re-bound to a different frame by a later cell.
df = pd.DataFrame(count_translations())

  0%|          | 0/19 [00:00<?, ?it/s]

In [10]:
# Reshape to one row per concept path with one column per language holding the
# translated-descendant count, and write the table to analysis.csv.
(
    df
    .pivot(
        index=["great grandparent", "grandparent", "parent", "concept", "num_children"], 
        columns="language", 
        values="translations"
    )
    .to_csv("analysis.csv")
)

In [14]:
# Names of the attribute types regarded as "important"; the commented-out
# entries are deliberately left out of the set.
# NOTE(review): no cell visible in this notebook reads `important_attributes`.
# The Tier 2 check in calc_concept_tiers inspects every relationship regardless
# of its type - confirm whether it was meant to filter on this set.
important_attributes = {
    # 'Access (attribute)',
    # 'After (attribute)',
    'Associated finding (attribute)',
    'Associated morphology (attribute)',
    'Associated procedure (attribute)',
    'Associated with (attribute)',
    'Before (attribute)',
    'Causative agent (attribute)',
    'Characterizes (attribute)',
    # 'Clinical course (attribute)',
    'Component (attribute)',
    'Direct device (attribute)',
    'Direct morphology (attribute)',
    'Direct site (attribute)',
    'Direct substance (attribute)',
    'Due to (attribute)',
    'During (attribute)',
    # 'Finding context (attribute)',
    'Finding informer (attribute)',
    'Finding method (attribute)',
    'Finding site (attribute)',
    'Has absorbability (attribute)',
    'Has active ingredient (attribute)',
    'Has basic dose form (attribute)',
    'Has basis of strength substance (attribute)',
    'Has coating material (attribute)',
    'Has compositional material (attribute)',
    'Has concentration strength denominator unit (attribute)',
    'Has concentration strength numerator unit (attribute)',
    'Has device intended site (attribute)',
    'Has disposition (attribute)',
    'Has dose form administration method (attribute)',
    'Has dose form intended site (attribute)',
    'Has dose form release characteristic (attribute)',
    'Has dose form transformation (attribute)',
    'Has filling (attribute)',
    'Has focus (attribute)',
    'Has ingredient qualitative strength (attribute)',
    'Has intent (attribute)',
    # 'Has interpretation (attribute)',
    'Has manufactured dose form (attribute)',
    'Has precise active ingredient (attribute)',
    'Has presentation strength denominator unit (attribute)',
    'Has presentation strength numerator unit (attribute)',
    'Has realization (attribute)',
    'Has specimen (attribute)',
    'Has state of matter (attribute)',
    'Has surface texture (attribute)',
    'Has target population (attribute)',
    'Has unit of presentation (attribute)',
    'Indirect device (attribute)',
    'Indirect morphology (attribute)',
    'Inherent location (attribute)',
    'Inheres in (attribute)',
    'Interprets (attribute)',
    # 'Is a (attribute)',
    'Is modification of (attribute)',
    'Is sterile (attribute)',
    'Laterality (attribute)',
    'Measurement method (attribute)',
    'Method (attribute)',
    'Occurrence (attribute)',
    'Pathological process (attribute)',
    'Plays role (attribute)',
    'Precondition (attribute)',
    'Priority (attribute)',
    'Procedure context (attribute)',
    'Procedure device (attribute)',
    'Procedure morphology (attribute)',
    'Procedure site (attribute)',
    'Procedure site - Direct (attribute)',
    'Procedure site - Indirect (attribute)',
    'Process acts on (attribute)',
    'Process duration (attribute)',
    'Process extends to (attribute)',
    'Process output (attribute)',
    'Property (attribute)',
    'Recipient category (attribute)',
    'Relative to (attribute)',
    'Relative to part of (attribute)',
    'Revision status (attribute)',
    'Route of administration (attribute)',
    # 'Scale type (attribute)',
    # 'Severity (attribute)',
    'Specimen procedure (attribute)',
    'Specimen source identity (attribute)',
    'Specimen source morphology (attribute)',
    'Specimen source topography (attribute)',
    'Specimen substance (attribute)',
    # 'Subject relationship context (attribute)',
    'Surgical approach (attribute)',
    'Technique (attribute)',
    # 'Temporal context (attribute)',
    # 'Temporally related to (attribute)',
    # 'Time aspect (attribute)',
    # 'Units (attribute)',
    'Using access device (attribute)',
    'Using device (attribute)',
    'Using energy (attribute)',
    'Using substance (attribute)'
}


In [15]:
from functools import reduce

def calc_concept_tiers(langcodes, translations, G):
    """Split the translated concepts into nested "context tiers".

    Parameters
    ----------
    langcodes : list
        Keys into ``translations`` (the notebook passes language labels such
        as ``"Swedish"``). A concept must be translated in every listed
        language to be considered.
    translations : dict
        Maps each key to a frame whose index holds the translated sctids.
    G : graph
        Iterable over concepts exposing ``.sctid``; ``G.get_full_concept(id)``
        must return an object with ``.parents`` and
        ``.inferred_relationship_groups``.

    Returns
    -------
    tuple of set
        ``(all_translations, tier_1_concepts, tier_2_concepts)`` where each
        later set is a subset of the previous one.
    """
    all_concepts = {c.sctid for c in G}

    translated_concepts = [
        set(translations[lc].index.tolist())
        for lc in langcodes
    ]

    # Distinct set of translated concepts present in all langs we are checking
    # (seeded with the graph's own ids, so unknown ids are dropped too).
    all_translations = reduce(lambda a, b: a & b, translated_concepts, all_concepts)

    print("Calculating Tier 1 Concept Set")
    # Tier 1 concepts are concepts where all parents have also been translated.
    # Generator expressions let all() stop at the first untranslated parent.
    tier_1_concepts = {
        c for c in tqdm(all_translations)
        if all(
            p.sctid in all_translations
            for p in G.get_full_concept(c).parents
        )
    }

    print("Calculating Tier 2 Concept Set")
    # Tier 2 concepts are Tier 1 concepts that have at least one inferred
    # relationship group and whose relationship targets are all translated.
    # NOTE(review): every relationship is checked, whatever its attribute type;
    # confirm whether only "important" attributes were meant to count.
    tier_2_concepts = set()
    for c in tqdm(tier_1_concepts):
        # Fetch the full concept once and test the cheap condition first.
        groups = G.get_full_concept(c).inferred_relationship_groups
        if len(groups) > 0 and all(
            r.tgt.sctid in all_translations
            for g in groups
            for r in g.relationships
        ):
            tier_2_concepts.add(c)

    print(f"""
        Languages: {langcodes}
        All: {len(all_translations)}
        Tier 1: {len(tier_1_concepts)}
        Tier 2: {len(tier_2_concepts)}
    """)

    return all_translations, tier_1_concepts, tier_2_concepts

In [16]:
# Per-language context tiers: language label -> the three sets returned by
# calc_concept_tiers, computed for that single language.
context_tiers = dict()
for lang in ["Swedish", "Korean", "Dutch", "Estonian"]:
    print(f"Calculating Concept Sets for {lang}")
    at, t1, t2 = calc_concept_tiers([lang], translations, G)
    context_tiers[lang] = {'all_translations': at, 'tier1': t1, 'tier2': t2}

  0%|          | 0/346723 [00:00<?, ?it/s]

  0%|          | 0/341510 [00:00<?, ?it/s]


        Languages: ['Swedish']
        All: 346723
        Tier 1: 341510
        Tier 2: 231762
    


  0%|          | 0/23552 [00:00<?, ?it/s]

  0%|          | 0/8295 [00:00<?, ?it/s]


        Languages: ['Korean']
        All: 23552
        Tier 1: 8295
        Tier 2: 639
    


  0%|          | 0/264396 [00:00<?, ?it/s]

  0%|          | 0/255745 [00:00<?, ?it/s]


        Languages: ['Dutch']
        All: 264396
        Tier 1: 255745
        Tier 2: 181119
    


  0%|          | 0/16684 [00:00<?, ?it/s]

  0%|          | 0/8740 [00:00<?, ?it/s]


        Languages: ['Estonian']
        All: 16684
        Tier 1: 8740
        Tier 2: 395
    


In [17]:
import seaborn as sns
import numpy as np

def calc_depth_tiers(G):
    """Bucket every concept of ``G`` by the length of its path to the root.

    The depth of a concept is ``len(G.path_to_root(sctid))``. Depths 1-4 are
    "shallow", 5-7 "mid" and 8 or more "deep". A concept with depth 0, or one
    for which computing the depth raises ``TypeError``, lands in no bucket.

    Returns
    -------
    tuple of set
        ``(shallow_tier, mid_tier, deep_tier)``, each a set of sctids.
    """
    shallow_tier, mid_tier, deep_tier = set(), set(), set()
    for concept in tqdm(iter(G), total=len(G)):
        try:
            depth = len(G.path_to_root(concept.sctid))
        except TypeError:
            # No measurable path for this concept: leave it untiered.
            continue
        if depth >= 8:
            deep_tier.add(concept.sctid)
        elif depth >= 5:
            mid_tier.add(concept.sctid)
        elif depth >= 1:
            shallow_tier.add(concept.sctid)
    return shallow_tier, mid_tier, deep_tier

# Shallow / mid / deep sctid sets for the whole graph; read as globals by gen_sampling_df.
ts, tm, td = calc_depth_tiers(G)

  0%|          | 0/361179 [00:00<?, ?it/s]

In [25]:
def gen_sampling_df():
    """Yield one record per (concept, language) pair that has a translation.

    Reads the notebook globals ``G``, ``translations``, ``context_tiers`` and
    the depth sets ``ts`` / ``tm`` / ``td``. Pairs for which the language's
    frame has no row for the concept are skipped.

    Yields
    ------
    dict
        Keys: ``sctid``, ``fsn``, ``hierarchy``, ``depth_tier``, ``language``,
        ``context_tier`` and ``reference_translations``.
    """
    def get_depth(sctid):
        # First matching depth set wins; a concept in none of them gets pd.NA.
        for label, members in (("Shallow", ts), ("Medium", tm), ("Deep", td)):
            if sctid in members:
                return label
        return pd.NA

    def get_cxt_tier(sctid, lang):
        # Checked from the most to the least restrictive tier.
        tiers = context_tiers[lang]
        for label, key in (
            ("Tier 2", "tier2"),
            ("Tier 1", "tier1"),
            ("All Translations", "all_translations"),
        ):
            if sctid in tiers[key]:
                return label
        return pd.NA

    for concept in tqdm(iter(G), total=len(G)):
        for lang in ["Swedish", "Korean", "Dutch", "Estonian"]:
            try:
                translated_synonyms = translations[lang].loc[concept.sctid].translations
            except KeyError:
                # No translation for this concept in this language.
                continue
            yield {
                'sctid': concept.sctid,
                'fsn': concept.fsn,
                'hierarchy': concept.hierarchy,
                'depth_tier': get_depth(concept.sctid),
                'language': lang,
                'context_tier': get_cxt_tier(concept.sctid, lang),
                'reference_translations': translated_synonyms,
            }

# Sampling frame: one row per translated (concept, language) pair.
# This re-binds `df`, which an earlier cell used for the count_translations table.
df = pd.DataFrame(list(gen_sampling_df()))

  0%|          | 0/361179 [00:00<?, ?it/s]

In [28]:
# Count the available concepts per hierarchy / depth tier / language / context
# tier and save the table.
# NOTE(review): with pandas' default groupby settings, rows whose depth_tier or
# context_tier is missing are presumably excluded from these counts - confirm.
summary_df = (
    df
    .groupby(["hierarchy", "depth_tier", "language", "context_tier"])
    .size()
    .rename("Number of Concepts")
    .reset_index()
    .sort_values(["language", "hierarchy", "depth_tier", "context_tier"])
)

summary_df.to_csv("available_translated_concepts_summary.csv", index=False)

In [29]:
# Persist the sampling frame so it can be reloaded without recomputing it.
df.to_csv("sampling_df.csv", index=False)

In [13]:
# Reload the sampling frame from disk, replacing the in-memory `df`.
# After the CSV round trip the list-valued column is read back as plain strings.
# NOTE(review): this cell's execution count is lower than the cell that writes
# the file, so it may have read a CSV produced by an older version of the notebook.
df = pd.read_csv("sampling_df.csv")

In [32]:
# Values of the `hierarchy` column that are kept when drawing samples.
hierarchies_to_sample = [
    "substance",
    "body structure",
    "finding",
    "disorder",
    "procedure",
    "morphologic abnormality"
]

In [45]:
def sample_group(grp, sample_size=100, random_state=None):
    """Draw up to ``sample_size`` rows from one group, without replacement.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single group. Must contain ``sctid``, ``fsn`` and a column
        of reference terms named ``translations`` or ``reference_translations``.
    sample_size : int, default 100
        Upper bound on the number of rows returned; smaller groups are
        returned in full (in random order).
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the draw can be made
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows restricted to ``sctid``, ``fsn`` and the terms column.
    """
    sample_size = min(grp.shape[0], sample_size)
    sample = grp.sample(sample_size, replace=False, random_state=random_state)
    # gen_sampling_df names the column "reference_translations"; selecting a
    # hard-coded "translations" raised KeyError on frames built by it. Keep
    # "translations" when it exists so older frames behave as before.
    terms_column = (
        "translations" if "translations" in sample.columns else "reference_translations"
    )
    return sample[["sctid", "fsn", terms_column]]

# Draw a sample from every hierarchy / depth tier / language / context tier
# combination of the selected hierarchies and save it.
# .dropna() first removes rows with any missing value, so concepts without a
# depth or context tier are never sampled.
# "level_4" is presumably the unnamed original row index that ends up as the
# fifth index level after groupby(...).apply(...) - confirm for the pandas
# version in use.
sample_df = (
    df
    [df.hierarchy.isin(hierarchies_to_sample)]
    .dropna()
    .groupby(["hierarchy", "depth_tier", "language", "context_tier"])
    .apply(sample_group)
    .reset_index()
    .drop("level_4", axis="columns")
    .sort_values(["language", "hierarchy", "depth_tier", "context_tier"])
)

sample_df.to_csv("samples.csv", index=False)