# Overview

Prepare a subset of concepts for the translation PoC.

1. Load refsets for four languages: Korean, Dutch, Swedish and Estonian.
2. Calculate some stratification variables which we think will be important to translation: context tier, similarity tier, depth tier, length bucket.  These will be used to define "cells" of similar concepts.
3. Output a pivot table which we can use to browse the translated concepts in each extension.
4. Sample 25 (or as many as exist) concepts from each cell for the translation exercise.

# 1. Preparation

In [1]:
import pandas as pd
from functools import reduce
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from more_itertools import chunked
from scipy.sparse import lil_array

from snomed_graph.snomed_graph import *

In [5]:
########
# Inputs
########

# National Extensions
KOREAN_REFSET_PATH = "./data/snomed_extensions/snomed-MAIN_SNOMEDCT-KR-20240611/SnomedCT_Export/Delta/Refset/Language/der2_cRefset_Language21000267104Delta_KR_20240611.txt"
KOREAN_DESCRIPTION_PATH = "./data/snomed_extensions/snomed-MAIN_SNOMEDCT-KR-20240611/SnomedCT_Export/Delta/Terminology/sct2_Description_Delta_KR_20240611.txt"
SWEDISH_REFSET_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceSE_PRODUCTION_SE1000052_20240531T120000Z/Snapshot/Refset/Language/der2_cRefset_LanguageSnapshot-sv_SE1000052_20240531.txt"
SWEDISH_DESCRIPTION_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceSE_PRODUCTION_SE1000052_20240531T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-sv_SE1000052_20240531.txt"
DUTCH_REFSET_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceNL_PRODUCTION_NL1000146_20240331T120000Z/Snapshot/Refset/Language/der2_cRefset_LanguageSnapshot-nl_NL1000146_20240331.txt"
DUTCH_DESCRIPTION_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceNL_PRODUCTION_NL1000146_20240331T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-nl_NL1000146_20240331.txt"
ESTONIAN_REFSET_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceEE_PRODUCTION_EE1000181_20240530T120000Z/Snapshot/Refset/Language/der2_cRefset_LanguageSnapshot-et_EE1000181_20240530.txt"    
ESTONIAN_DESCRIPTION_PATH = "./data/snomed_extensions/SnomedCT_ManagedServiceEE_PRODUCTION_EE1000181_20240530T120000Z/Snapshot/Terminology/sct2_Description_Snapshot-et_EE1000181_20240530.txt"

# Serialized graph containing international edition
# Create this using snomed_graph/.
# See: https://github.com/VerataiLtd/snomed_graph
SNOMED_GRAPH_PATH = "./data/snomed_graph/full_concept_graph.gml"

#########
# Outputs
#########

# Where we store a pivot table of summarised results
SUMMARY_PATH = "data/pivot_table/available_translated_concepts_summary.csv"

# Single file containing all concepts and their available translations
ALL_TRANSLATIONS_PATH = "./data/prepared_translation_data/all_translations.csv"

# Sample of concepts to translate
SAMPLE_PATH = "./data/prepared_translation_data/samples.csv"

In [None]:
# We'll pull concepts to translate from the following hierarchies.

hierarchies_to_sample = [
    "substance",
    "body structure",
    "finding",
    "disorder",
    "procedure",
    "morphologic abnormality"
]

In [6]:
# We use the following list of attributes to build contextual "hints" for the generative translation model.

important_attributes = {
    # 'Access (attribute)',
    # 'After (attribute)',
    'Associated finding (attribute)',
    'Associated morphology (attribute)',
    'Associated procedure (attribute)',
    'Associated with (attribute)',
    'Before (attribute)',
    'Causative agent (attribute)',
    'Characterizes (attribute)',
    # 'Clinical course (attribute)',
    'Component (attribute)',
    'Direct device (attribute)',
    'Direct morphology (attribute)',
    'Direct site (attribute)',
    'Direct substance (attribute)',
    'Due to (attribute)',
    'During (attribute)',
    # 'Finding context (attribute)',
    'Finding informer (attribute)',
    'Finding method (attribute)',
    'Finding site (attribute)',
    'Has absorbability (attribute)',
    'Has active ingredient (attribute)',
    'Has basic dose form (attribute)',
    'Has basis of strength substance (attribute)',
    'Has coating material (attribute)',
    'Has compositional material (attribute)',
    'Has concentration strength denominator unit (attribute)',
    'Has concentration strength numerator unit (attribute)',
    'Has device intended site (attribute)',
    'Has disposition (attribute)',
    'Has dose form administration method (attribute)',
    'Has dose form intended site (attribute)',
    'Has dose form release characteristic (attribute)',
    'Has dose form transformation (attribute)',
    'Has filling (attribute)',
    'Has focus (attribute)',
    'Has ingredient qualitative strength (attribute)',
    'Has intent (attribute)',
    # 'Has interpretation (attribute)',
    'Has manufactured dose form (attribute)',
    'Has precise active ingredient (attribute)',
    'Has presentation strength denominator unit (attribute)',
    'Has presentation strength numerator unit (attribute)',
    'Has realization (attribute)',
    'Has specimen (attribute)',
    'Has state of matter (attribute)',
    'Has surface texture (attribute)',
    'Has target population (attribute)',
    'Has unit of presentation (attribute)',
    'Indirect device (attribute)',
    'Indirect morphology (attribute)',
    'Inherent location (attribute)',
    'Inheres in (attribute)',
    'Interprets (attribute)',
    # 'Is a (attribute)',
    'Is modification of (attribute)',
    'Is sterile (attribute)',
    'Laterality (attribute)',
    'Measurement method (attribute)',
    'Method (attribute)',
    'Occurrence (attribute)',
    'Pathological process (attribute)',
    'Plays role (attribute)',
    'Precondition (attribute)',
    'Priority (attribute)',
    'Procedure context (attribute)',
    'Procedure device (attribute)',
    'Procedure morphology (attribute)',
    'Procedure site (attribute)',
    'Procedure site - Direct (attribute)',
    'Procedure site - Indirect (attribute)',
    'Process acts on (attribute)',
    'Process duration (attribute)',
    'Process extends to (attribute)',
    'Process output (attribute)',
    'Property (attribute)',
    'Recipient category (attribute)',
    'Relative to (attribute)',
    'Relative to part of (attribute)',
    'Revision status (attribute)',
    'Route of administration (attribute)',
    # 'Scale type (attribute)',
    # 'Severity (attribute)',
    'Specimen procedure (attribute)',
    'Specimen source identity (attribute)',
    'Specimen source morphology (attribute)',
    'Specimen source topography (attribute)',
    'Specimen substance (attribute)',
    # 'Subject relationship context (attribute)',
    'Surgical approach (attribute)',
    'Technique (attribute)',
    # 'Temporal context (attribute)',
    # 'Temporally related to (attribute)',
    # 'Time aspect (attribute)',
    # 'Units (attribute)',
    'Using access device (attribute)',
    'Using device (attribute)',
    'Using energy (attribute)',
    'Using substance (attribute)'
}


# 2. Load data

In [7]:
G = SnomedGraph.from_serialized(SNOMED_GRAPH_PATH)

SNOMED graph has 361179 vertices and 1179749 edges


In [8]:
def load_translation(G, desc_path, lang_path):
    # Load the concept descriptions
    desc_df = pd.read_csv(desc_path, delimiter="\t", encoding='utf-8')
    # Load the language refset
    lang_df = pd.read_csv(lang_path, delimiter="\t", encoding='utf-8')
    # Filter the refset to Preferred Terms only
    lang_df = lang_df[lang_df.acceptabilityId == 900000000000548007]
    # IDs of all descriptors which are preferred terms
    preferred_term_descriptor_ids = lang_df.referencedComponentId.unique()
    # Filter descriptions to active concepts only
    desc_df = desc_df[desc_df.active == 1]
    # Filter to preferred terms
    desc_df = desc_df[desc_df.id.isin(preferred_term_descriptor_ids)]
    # Remove FSNs
    desc_df = desc_df[desc_df.typeId != 900000000000003001]
    # Some extensions include English terms.  We don't want these.
    desc_df = desc_df[desc_df.languageCode != "en"]
    # Remove concepts that don't exist in the International Edition
    desc_df = desc_df[[sctid in G for sctid in desc_df.conceptId]]
    desc_df = desc_df.rename(axis="columns", mapper={"conceptId": "sctid"})
    # One row per concept, with the synonyms aggregated into a list
    desc_df = desc_df.groupby("sctid").term.apply(list).rename("translations").to_frame()
    return desc_df

In [9]:
ee_df = load_translation(G, ESTONIAN_DESCRIPTION_PATH, ESTONIAN_REFSET_PATH)
print(ee_df.shape[0])
ee_df.sample(3)

16684


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
77126004,[Karunkul]
39877005,[Salmonella Lexington]
3381000181101,[Suurenenud geeniprodukti funktsioon]


In [10]:
nl_df = load_translation(G, DUTCH_DESCRIPTION_PATH, DUTCH_REFSET_PATH)
print(nl_df.shape[0])
nl_df.sample(3)

264396


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
227128004,[gerookte makreel]
1255250000,[restaureren van gebitselement met keramische ...
368734004,[geheel dorsum van apex linguae]


In [11]:
se_df = load_translation(G, SWEDISH_DESCRIPTION_PATH, SWEDISH_REFSET_PATH)
print(se_df.shape[0])
se_df.sample(3)

346723


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
715723008,"[syndaktyli, typ 1]"
50489001,[operation av telekantus]
708271002,[DNA från Gardnerella vaginalis]


In [12]:
kr_df = load_translation(G, KOREAN_DESCRIPTION_PATH, KOREAN_REFSET_PATH)
print(kr_df.shape[0])
kr_df.sample(3)

23552


Unnamed: 0_level_0,translations
sctid,Unnamed: 1_level_1
93201009,[골반내 림프절의 악성 비만 세포 종양]
1090141000119106,[치주 질환에 의한 부분 무치악 4급]
9287005,[콜로이드 낭]


In [13]:
translations = {
    "Korean": kr_df, 
    "Dutch": nl_df,
    "Swedish": se_df,
    "Estonian": ee_df
}

In [14]:
languages = list(translations.keys())

# 3. Derive stratification variables

## 3.1 Calculate Context Tiers

We have three tiers:
- "Tier 0" contains all concepts not in another tier
- "Tier 1" concepts have translations present for all their parent concepts.
- "Tier 2" concepts have translations present for all parent concepts and for all inferred attribute relationships.

In [None]:
def calc_context_tiers(langcode, translations, G):

    tier_0_concepts = set([c.sctid for c in G])
    all_translations = set(translations[langcode].index.tolist())

    print("Calculating Context Tier 1 Concept Set")
    # Tier 1 concepts are concepts where all parents have also been translated
    tier_1_concepts = set([
        c for c in tqdm(tier_0_concepts)
        if all([
            p.sctid in all_translations
            for p in G.get_full_concept(c).parents
        ])
    ])

    print("Calculating Context Tier 2 Concept Set")
    # Tier 2 concepts are Tier 1 concepts where important defining attributes have also been translated
    tier_2_concepts = set([
        c for c in tqdm(tier_1_concepts)
        if all([
            r.tgt.sctid in all_translations
            for g in G.get_full_concept(c).inferred_relationship_groups
            for r in g.relationships            
        ]) 
        and len(G.get_full_concept(c).inferred_relationship_groups) > 0
    ])

    tier_0_concepts = tier_0_concepts - tier_1_concepts - tier_2_concepts
    tier_1_concepts = tier_1_concepts - tier_2_concepts

    print(f"""
        Language: {langcode}
        Tier 0: {len(tier_0_concepts)}
        Tier 1: {len(tier_1_concepts)}
        Tier 2: {len(tier_2_concepts)}
    """)

    return tier_0_concepts, tier_1_concepts, tier_2_concepts

In [None]:
context_tiers = dict()

for lang in languages:
    print(f"Calculating Concept Sets for {lang}")
    at, t1, t2 = calc_context_tiers(lang, translations, G)
    context_tiers[lang] = {'tier0': at, 'tier1': t1, 'tier2': t2}

## 3.2 Calculate depth tiers

- Shallow is <= 4 to the root
- Medium is between 5 and 7 (inclusive)
- Deep is >= 8

In [None]:
def calc_depth_tiers(G):
    shallow_tier = set()
    mid_tier = set()
    deep_tier = set()
    for concept in tqdm(iter(G), total=len(G)):
        try:
            depth = len(G.path_to_root(concept.sctid))
        except TypeError:
            pass
        else:
            if 1 <= depth <= 4:
                shallow_tier.add(concept.sctid)
            elif 5 <= depth <= 7:
                mid_tier.add(concept.sctid)
            elif depth >= 8:
                deep_tier.add(concept.sctid)
    return shallow_tier, mid_tier, deep_tier

ts, tm, td = calc_depth_tiers(G)

## 3.3 Calculate Similarity tier

- Tier 0 concepts have no similar terms with translations
- Tier 1 concepts have a translation for at least one similar term

In [None]:
def calc_similarity_tiers(translations, G, min_score=2, chunksize=1000):
    similarity_tiers = dict()
    tier_0_concepts = [c.sctid for c in G]
    candidate_tier_1_concepts = dict()
    preferred_terms = [c.fsn.replace(f"({c.hierarchy})", "").strip() for c in G]
    vectorizer = CountVectorizer(lowercase=True, stop_words=None, ngram_range=(2,10), binary=True)
    key_matrix = vectorizer.fit_transform(preferred_terms)
    # We chunk the dense matmul operations to avoid blowing the memory out    
    print("Finding similar terms")    
    N_iter = int(len(tier_0_concepts) / chunksize)
    it = zip(chunked(tier_0_concepts, chunksize), chunked(preferred_terms, chunksize))
    for sctids, pt_chunk in tqdm(it, total=N_iter):
        queries = vectorizer.transform(pt_chunk)
        search = key_matrix.dot(queries.T).T.A
        similar = lil_array(search >= min_score)
        src_idx, tgt_idx = similar.nonzero()
        # Since lil entries are sorted (row, col) we can use a grouper to enable us to
        # perform a single dict update (more efficient)
        it2 = groupby(zip(src_idx, tgt_idx), key=lambda x: x[0])
        for src, grp in it2:
            src_sctid = sctids[src]
            tgt_sctids = [tier_0_concepts[tgt] for _, tgt in grp]
            candidate_tier_1_concepts[src_sctid] = tgt_sctids
    # Remove decendants and parents
    print("Filtering similar terms")
    for sctid in tqdm(candidate_tier_1_concepts.keys()):
        descendants = {c.sctid for c in G.get_descendants(sctid)}
        parents = {c.sctid for c in G.get_parents(sctid)}
        filtered = set(candidate_tier_1_concepts[sctid]) - {sctid} - descendants - parents
        candidate_tier_1_concepts[sctid] = filtered
    print("Filtering by language")
    for langcode, translations_df in tqdm(translations.items()):
        all_translations = set(translations_df.index.tolist())
        tier_1_concepts = {
            sctid 
            for sctid, others in candidate_tier_1_concepts.items()
            if others & all_translations != set()
        }
        tier_0_concepts = set(tier_0_concepts) - tier_1_concepts
        similarity_tiers[langcode] = {"tier0": tier_0_concepts, "tier1": tier_1_concepts}
        print(f"""
            Language: {langcode}
            Tier 0: {len(tier_0_concepts)}
            Tier 1: {len(tier_1_concepts)}
        """)    
    return similarity_tiers

In [None]:
similarity_tiers = calc_similarity_tiers(translations, G)

## 3.4 Build a dataframe containing all concepts and their stratification variables

Note each concept will appear once per language.

In [None]:
def generate_all_concepts_df():
    
    def get_concept_len_bucket(concept):
        preferred_term = concept.fsn.replace(f"({concept.hierarchy})", "").strip()
        if len(preferred_term) <= 20:
            return "Short"
        elif len(preferred_term) <= 30:
            return "Medium"
        else:
            return "Long"
            
    def get_depth(sctid):
        if sctid in ts:
            return "Shallow"
        elif sctid in tm:
            return "Medium"
        elif sctid in td:
            return "Deep"
        else:
            return pd.NA
            
    def get_cxt_tier(sctid, lang):
        if sctid in context_tiers[lang]["tier2"]:
            return "Tier 2"
        elif sctid in context_tiers[lang]["tier1"]:
            return "Tier 1"
        elif sctid in context_tiers[lang]["tier0"]:
            return "Tier 0"
        else:
            return pd.NA
            
    def get_sim_tier(sctid, lang):
        if sctid in similarity_tiers[lang]["tier1"]:
            return "Tier 1"
        else:
            return "Tier 0"
            
    for concept in tqdm(iter(G), total=len(G)):
        for lang in languages:
            try:
                translated_synonyms = translations[lang].loc[concept.sctid].translations
            except KeyError:
                translated_synonyms = pd.NA
            yield {
                'sctid': concept.sctid,
                'fsn': concept.fsn,
                'hierarchy': concept.hierarchy,
                'depth_tier': get_depth(concept.sctid),
                'language': lang,
                'context_tier': get_cxt_tier(concept.sctid, lang),
                'similarity_tier': get_sim_tier(concept.sctid, lang),
                'concept_length_bucket': get_concept_len_bucket(concept),
                'reference_translations': translated_synonyms,
            }

df = pd.DataFrame(list(generate_all_concepts_df()))

df["has_translation"] = df.reference_translations.apply(lambda x: True if isinstance(x, list) else False)

In [None]:
# Checkpoint the data at this point.
# We can re-load from here and avoid having to recompute all of the stratification variables.
df.to_csv(ALL_TRANSLATIONS_PATH, index=False)

In [None]:
# Save the pivot table
summary_df = (
    df
    .groupby(["hierarchy", "depth_tier", "language", "context_tier", "similarity_tier", "concept_length_bucket", "has_translation"])
    .size()
    .rename("Number of Concepts")
    .reset_index()
    .sort_values(["language", "hierarchy", "depth_tier", "context_tier", "similarity_tier", "concept_length_bucket", "has_translation"])
)

summary_df.to_csv(SUMMARY_PATH, index=False)

# 4. Sample the concepts to translate

We are looking for 25 in each cell.

In [None]:
def sample_group(grp, sample_size=25):
    sample_size = min(grp.shape[0], sample_size)
    sample = grp.sample(sample_size, replace=False)
    return sample[["sctid", "fsn", "reference_translations"]]

sample_df = (
    df
    [
        (df.hierarchy.isin(hierarchies_to_sample)) &
        (df.has_translation)
    ]
    .dropna()
    .groupby(["hierarchy", "depth_tier", "language", "context_tier", "similarity_tier", "concept_length_bucket"])
    .apply(sample_group)
    .reset_index()
    .drop("level_6", axis="columns")
    .sort_values(["language", "hierarchy", "depth_tier", "context_tier", "similarity_tier", "concept_length_bucket"])
)

sample_df.shape[0]

In [None]:
sample_df

In [None]:
sample_df.to_csv(SAMPLE_PATH, index=False)