In [14]:
import json
import pandas as pd

In [15]:
def flatten_ctakes(ctakes_extract):
    concepts = []
    for sublist in ctakes_extract["domains"]:
        for cc in sublist['concepts']:
            for kk,vv in cc.items():
                if vv=='na':
                    cc[kk] = None
                elif vv=='f':
                    cc[kk] = False
                elif vv=='t':
                    cc[kk] = True
            cc['domain'] = sublist['name']

        concepts.extend(sublist['concepts'])
    return concepts

In [16]:
from collections import Counter
from functools import reduce

def generate_ids(keys):
    raw_prefixes = {kk: kk[0].upper() for kk in keys}
    prefix_counter = Counter(raw_prefixes.values())
    prefixes = {kk:vv for kk,vv in raw_prefixes.items() if prefix_counter[vv]==1}
    if len(prefixes) == len(keys):
        return prefixes
    
    nonuniq_keys = [kk for kk,vv in raw_prefixes.items() if prefix_counter[vv]>1]
#     for ii in range(min([len(kk) for kk in nonuniq_keys])):
#         Counter([kk[ii] for kk in nonuniq_keys])
    letter_sets = [set(kk[1:].upper()) for kk in nonuniq_keys]
    nonuniq_letters = reduce(lambda x,y: x|y, letter_sets)
    common_letters = reduce(lambda x,y: x&y, letter_sets)
    uniq_letters = nonuniq_letters - common_letters
    
    all_letters = set((chr(65+ii) for ii in range(26)))
    for kk in nonuniq_keys:
        second_letter = list((set(kk[1:].upper()) & uniq_letters))
        if len(second_letter):
            second_letter = second_letter[0]
        else:
            second_letter = list(all_letters - common_letters)[0]
            common_letters.append(second_letter)
        pr = (kk[0] + second_letter).upper()

        prefixes[kk] = pr
    return prefixes

In [17]:
fn_jsonl = "./samples/extracts/texts/abdominal-pain-unc-part1_combined_output.json"
with open(fn_jsonl) as fh:
    ctakes_extract = json.load(fh)
    annotations = flatten_ctakes(ctakes_extract)

In [18]:
fn_txt = "./samples/texts/abdominal-pain-unc-part1.txt"
with open(fn_txt) as fh:
    text = fh.read()

In [19]:
# print(text)
# annotations

In [20]:
df_annotations = pd.DataFrame(annotations)

In [21]:
null_cols = df_annotations.isnull().all()
df_annotations = df_annotations.loc[:,~null_cols]

## Create a set of SNOMED CT IDs or CUIs

### List most frequent terms

note that some CUIs have several SNOMED_CT IDs ("vocab_term") linked with them

In [22]:
term_counts = df_annotations.groupby(["domain", "canon_text",  "vocab_term","cui"])["negated"].count().sort_values(ascending=False)
term_counts[(term_counts>1)]#.loc["diseases"]

domain              canon_text            vocab_term  cui     
diseases            Crohn Disease         34000006    C0010346    28
signs and symptoms  Obstruction           26036001    C0028778    14
                    Ischemia              52674009    C0022116    13
                    Abdominal Pain        21522001    C0000737    11
                    Pain                  22253000    C0030193    10
                                                                  ..
procedures          Complete Blood Count  26604007    C0009555     2
labs                Palpation             113011001   C0030247     2
                    Diagnostic Imaging    363679005   C0011923     2
                    colonoscopy           73761001    C0009378     2
medications         mercaptopurine        103         C0000618     2
Name: negated, Length: 89, dtype: int64

In [23]:
term_counts = df_annotations.groupby(["domain", "canon_text","cui"])["negated"].count().sort_values(ascending=False)
term_counts[(term_counts>2)].loc["diseases"]

canon_text                         cui     
Crohn Disease                      C0010346    28
Small bowel obstruction            C0235329     9
Hypertensive disease               C0020538     8
Mesenteric vascular insufficiency  C1412000     7
Deep Vein Thrombosis               C0149871     4
Obesity                            C0028754     3
Nicotine Dependence                C0028043     3
Posterior pituitary disease        C0751438     3
Name: negated, dtype: int64

In [24]:
snomeds = set(term_counts[(term_counts>1)].loc["diseases"].reset_index()["cui"].tolist())
snomeds

{'C0000768',
 'C0000833',
 'C0001363',
 'C0005586',
 'C0009319',
 'C0010346',
 'C0013146',
 'C0016169',
 'C0017168',
 'C0020538',
 'C0021843',
 'C0024117',
 'C0028043',
 'C0028754',
 'C0038436',
 'C0149871',
 'C0235329',
 'C0240111',
 'C0274281',
 'C0338908',
 'C0409959',
 'C0582430',
 'C0751438',
 'C1290884',
 'C1412000'}

##  Define classification options

In [25]:
df_annotations[df_annotations.cui.map(lambda x: x in snomeds)]

Unnamed: 0,location_snomed_id,vocab_term,negated,hof,range_text,offset_start,offset_end,vocab,canon_text,note_id,conditional,cui,location,domain
375,,34000006,False,False,Crohn's Disease,195,210,SNOMEDCT_US,Crohn Disease,abdominal-pain-unc-part1,False,C0010346,,diseases
376,,38341003,False,False,HTN,220,223,SNOMEDCT_US,Hypertensive disease,abdominal-pain-unc-part1,False,C0020538,,diseases
378,,34000006,False,False,Crohn's,1719,1726,SNOMEDCT_US,Crohn Disease,abdominal-pain-unc-part1,False,C0010346,,diseases
379,,34000006,False,False,Crohn's disease,1787,1802,SNOMEDCT_US,Crohn Disease,abdominal-pain-unc-part1,False,C0010346,,diseases
380,,34000006,False,False,Crohn's,2008,2015,SNOMEDCT_US,Crohn Disease,abdominal-pain-unc-part1,False,C0010346,,diseases
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,,38341003,False,False,hypertension,19083,19095,SNOMEDCT_US,Hypertensive disease,abdominal-pain-unc-part1,False,C0020538,,diseases
539,,38341003,False,False,hypertension,19160,19172,SNOMEDCT_US,Hypertensive disease,abdominal-pain-unc-part1,False,C0020538,,diseases
541,,56294008,False,False,Nicotine dependence,19475,19494,SNOMEDCT_US,Nicotine Dependence,abdominal-pain-unc-part1,False,C0028043,,diseases
542,,34000006,False,False,Crohn?s,19604,19611,SNOMEDCT_US,Crohn Disease,abdominal-pain-unc-part1,False,C0010346,,diseases


In [26]:
# df_annotations[df_annotations.label=="disease"].groupby(["text", "cui"]).count()['label'].sort_values()[::-1][:25]

In [27]:
# df_annotations.drop_duplicates().set_index('label').sort_index()['text'].groupby(level=0).agg(list).loc["disease"]

In [28]:
# set([ann['label'] for ann in annotations])

In [29]:
cui_to_text = term_counts.reset_index().set_index('cui')['canon_text']

In [30]:
options_cui = {'gi_issues': ['C0010346', 'C0235329', 'C1412000', None],
 'cardiovascular':["C0149871", "C0022116", "C0000737", "C0020538", None],
}

In [31]:
options_terms = {kk:[cui_to_text.to_dict().get(x, "other") for x in vv] for kk,vv in options_cui.items()}

In [32]:
options_terms

{'gi_issues': ['Crohn Disease',
  'Small bowel obstruction',
  'Mesenteric vascular insufficiency',
  'other'],
 'cardiovascular': ['Deep Vein Thrombosis',
  'Ischemia',
  'Abdominal Pain',
  'Hypertensive disease',
  'other']}

In [33]:
def generate_dtd(task, options, prefixes=None):
    if prefixes is None:
        prefixes = generate_ids(options.keys())
        
    elements = f"""<!ENTITY name "{task}">

    <!--
      ~ MAE - Multi-purpose Annotation Environment
      ~ For feedback, reporting bugs, use the project on Github
      ~ @see <a href="https://github.com/keighrim/mae-annotation">https://github.com/keighrim/mae-annotation</a>.
      -->
      
"""

    for label, opts in options.items():
        element = ("<!-- #PCDATA makes a extent tag-->\n"
            f"<!ELEMENT {label} ( #PCDATA ) >\n"
            "<!-- this can be non-consuming-->\n"
            f"<!ATTLIST {label} spans #IMPLIED >\n"
        )
        element += f'<!ATTLIST {label} id ID prefix="{prefixes[label]}" #REQUIRED >\n'
        if len(opts):
            implied = "other" if ("other" in opts) else opts[0]
            # sub-term / leafs of the ontology tree
            opts_flat = " | ".join(opts)
            element += f'<!ATTLIST {label} type ( {opts_flat } ) #IMPLIED "{implied}" >\n'

        element += '\n'*2
        elements += element
    return elements.strip()

In [34]:
task = "gi_and_cardiovascular"
prefixes = generate_ids(options_terms.keys())
dtd_text = generate_dtd(task, options_terms, prefixes = prefixes )

In [22]:
print(dtd_text)

<!ENTITY name "extracts_molecular_surgical_ctakes">

    <!--
      ~ MAE - Multi-purpose Annotation Environment
      ~ For feedback, reporting bugs, use the project on Github
      ~ @see <a href="https://github.com/keighrim/mae-annotation">https://github.com/keighrim/mae-annotation</a>.
      -->
      
<!-- #PCDATA makes a extent tag-->
<!ELEMENT gi_issues ( #PCDATA ) >
<!-- this can be non-consuming-->
<!ATTLIST gi_issues spans #IMPLIED >
<!ATTLIST gi_issues id ID prefix="G" #REQUIRED >
<!ATTLIST gi_issues type ( Crohn Disease | Small bowel obstruction | Mesenteric vascular insufficiency | other ) #IMPLIED "other" >


<!-- #PCDATA makes a extent tag-->
<!ELEMENT cardiovascular ( #PCDATA ) >
<!-- this can be non-consuming-->
<!ATTLIST cardiovascular spans #IMPLIED >
<!ATTLIST cardiovascular id ID prefix="C" #REQUIRED >
<!ATTLIST cardiovascular type ( Deep Vein Thrombosis | Ischemia | Abdominal Pain | Hypertensive disease | other ) #IMPLIED "other" >


In [35]:
output_dir = "data/xml_test"
os.makedirs(output_dir, exist_ok=True)

fn_dtd = os.path.join(output_dir, task + '.dtd')
with open(fn_dtd, 'w') as fh:
    fh.write(dtd_text)

In [36]:
def generate_xml(report, annotations, prefixes,
                 label = "label",
                 start = "offset_start",
                 end = "offset_end",
                 task = "extracts_molecular_surgical_ctakes"):
    annotations_xml = ""
    counts = {}
    for ann in annotations:
        if ann[label] not in counts:
            counts[ann[label]] = 0
        else:
            counts[ann[label]] += 1
        text_ = report[ann[start]:ann[end]]
        id_ = prefixes[ann[label]] + str(counts[ann[label]])
        annotations_xml+= (f'<{ann[label]} id="{id_}" spans="{ann[start]+1}~{ann[end]+1}" text="{text_}" />\n')
    result = (f"""<?xml version="1.0" encoding="UTF-8" ?>\n<{task}>\n<TEXT><![CDATA["""
              + '\ufeff'
              + report + "]]></TEXT>\n" +
              f"""<TAGS>\n{annotations_xml}</TAGS>\n</{task}>""")
    return result

In [37]:
# prefixes
options_terms_flat = set(reduce(lambda x,y: list(x) + list(y), options_terms.values())) - {'other'}
# c
options_terms_flat

{'Abdominal Pain',
 'Crohn Disease',
 'Deep Vein Thrombosis',
 'Hypertensive disease',
 'Ischemia',
 'Mesenteric vascular insufficiency',
 'Small bowel obstruction'}

In [38]:
df_annotations_subset = df_annotations[df_annotations.canon_text.map(lambda x: x in options_terms_flat)]

In [38]:
df_annotations_subset

Unnamed: 0,location_snomed_id,vocab_term,negated,hof,range_text,offset_start,offset_end,vocab,canon_text,note_id,conditional,cui,location,domain
1,113345001,21522001,False,False,abdominal pain,96,110,SNOMEDCT_US,Abdominal Pain,abdominal-pain-unc-part1,False,C0000737,Abdomen,signs and symptoms
10,113345001,21522001,False,False,abdominal pain,750,764,SNOMEDCT_US,Abdominal Pain,abdominal-pain-unc-part1,False,C0000737,Abdomen,signs and symptoms
13,113345001,21522001,False,False,abdominal pain,889,903,SNOMEDCT_US,Abdominal Pain,abdominal-pain-unc-part1,False,C0000737,Abdomen,signs and symptoms
16,113345001,21522001,False,False,abdominal pain,1018,1032,SNOMEDCT_US,Abdominal Pain,abdominal-pain-unc-part1,False,C0000737,Abdomen,signs and symptoms
159,113345001,21522001,False,False,abdominal pain,10139,10153,SNOMEDCT_US,Abdominal Pain,abdominal-pain-unc-part1,False,C0000737,Abdomen,signs and symptoms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,,38341003,False,False,HTN,19063,19066,SNOMEDCT_US,Hypertensive disease,abdominal-pain-unc-part1,False,C0020538,,diseases
538,,38341003,False,False,hypertension,19083,19095,SNOMEDCT_US,Hypertensive disease,abdominal-pain-unc-part1,False,C0020538,,diseases
539,,38341003,False,False,hypertension,19160,19172,SNOMEDCT_US,Hypertensive disease,abdominal-pain-unc-part1,False,C0020538,,diseases
542,,34000006,False,False,Crohn?s,19604,19611,SNOMEDCT_US,Crohn Disease,abdominal-pain-unc-part1,False,C0010346,,diseases


In [40]:
prefixes

{'gi_issues': 'G', 'cardiovascular': 'C'}

In [40]:
text_w_annotation_xml = generate_xml(text,
                                     df_annotations_subset.to_dict(orient='records'), 
                                     prefixes,
            label = "canon_text")

KeyError: 'Abdominal Pain'

In [41]:
print(text_w_annotation_xml)

NameError: name 'text_w_annotation_xml' is not defined