<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Dataset processing
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Leaf Clinical Trials
  </div>


  <div style=" float:left; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  Jean-baptiste AUJOGUE -  Hybrid Intelligence
  </div> 


<a id="TOC"></a>

##### Table Of Content

1. [LCT Texts](#texts) <br>
2. [LCT Graph](#graph) <br>
3. [LCT Entities](#ents) <br>


#### Useful links

- The Leaf Clinical Trials Corpus: a new resource for query generation from clinical trial eligibility criteria, [paper](https://www.nature.com/articles/s41597-022-01521-0), [dataset](https://figshare.com/articles/dataset/lct_corpus_tar_gz/17209610), [github](https://github.com/uw-bionlp/clinical-trials-gov-data)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import re
import copy
import json
import tarfile

# data
import numpy as np
import pandas as pd
import networkx as nx

# text
from spacy.lang.en import English

# viz
import pyvis.network as nt

  from .autonotebook import tqdm as notebook_tqdm


#### Custom variables

In [3]:
path_to_repo = os.path.dirname(os.getcwd())
path_to_data = os.path.join(path_to_repo, 'datasets', 'lct')
path_to_src  = os.path.join(path_to_repo, 'src')
path_to_src

'C:\\Users\\jb\\Desktop\\NLP\\perso - Transformers for NLP\\src'

In [4]:
base_dataset_name  = 'lct_corpus.tar.gz'
final_dataset_name = 'lct-ner'

#### Custom imports

In [5]:
sys.path.insert(0, path_to_src)

In [6]:
# from nlptools.dataset.chia.io import load_texts_from_zipfile, load_entities_from_zipfile
from nlptools.dataset.chia.preprocessing import get_ner_entities, convert_to_bio

<a id="texts"></a>

# 1. LCT Texts

[Table of content](#TOC)

In [7]:
def load_texts_from_targzfile(targz_file):
    with tarfile.open(targz_file, "r:gz") as archive:
        text_files = [(f.name, f) for f in archive.getmembers() if f.name.endswith('.txt')]
        texts = []
        for name, member in text_files:
            f = archive.extractfile(member)
            if f is not None:
                _id = name.split('/')[-1][:-4]
                text = f.read().decode('utf-8').replace('⁄', '/')
                texts.append([_id, text])
            
    df_texts = pd.DataFrame(texts, columns = ['Id', 'Text'])
    return df_texts

In [8]:
folder = os.path.join(path_to_data, final_dataset_name)
if not os.path.isdir(folder):
    os.makedirs(folder)

In [9]:
df_texts = load_texts_from_targzfile(os.path.join(path_to_data, base_dataset_name))

In [10]:
df_texts.head()

Unnamed: 0,Id,Text
0,NCT03862404,"Inclusion Criteria:\n - Male and Female, bet..."
1,NCT03861858,Inclusion Criteria:\n - Age 13-17 years or 1...
2,NCT03869021,Inclusion Criteria:\n 1. Patients free from a...
3,NCT03864263,Inclusion Criteria:\n - The father or (and) ...
4,NCT03862612,Inclusion Criteria:\n - Male and Female part...


In [11]:
df_texts.shape

(1006, 2)

In [12]:
df_texts.to_csv(os.path.join(path_to_data, final_dataset_name, 'lct_texts.tsv'), sep = "\t", index = False)

In [133]:
# sanity check : no overlap with chia
df_chia = pd.read_csv(os.path.join(path_to_repo, 'datasets', 'chia', 'chia-ner', 'chia_texts.tsv'), sep = "\t")

chia_ids = set([t.split('_')[0] for t in df_chia.Id.tolist()])
lct_ids  = set(df_texts.Id.tolist())
print(
    len(chia_ids & lct_ids),
    len(lct_ids - chia_ids),
    len(chia_ids - lct_ids),
)

0 1006 1000


<a id="graph"></a>

# 2. LCT Graph

[Table of content](#TOC)

## 2.1 Load LCT dataset into graph

[Table of content](#TOC)

#### Remark

Entities are multi-spanned, and are here replaced by their full spans

In [13]:
def load_entity_line(l):
    ent = l.split('\t')
    ent_id = ent[0]
    ent_txt = ent[2]
    ent_cat = ent[1].replace(';', ' ').split(' ')[0]
    spans = tuple([int(v) for v in ent[1].replace(';', ' ').split(' ')[1:]])
    return [ent_id, ent_txt, ent_cat, min(spans), max(spans), spans]



def load_graph_from_targzfile(targz_file):
    with tarfile.open(targz_file, "r:gz") as archive:
        ann_files = [(f.name, f) for f in archive.getmembers() if f.name.endswith('.ann')]
        all_nodes = []
        all_edges = []
        for name, member in ann_files:
            f = archive.extractfile(member)
            if f is not None:
                _id = name.split('/')[-1][:-4]
                lines = [l.decode('utf-8').replace('⁄', '/').strip() for l in f.readlines()]

                # load node attributes
                a_attrs = [l.replace('\t', ' ').split(' ') for l in lines if l.startswith('A')]
                a_attrs = {'-'.join((_id, n[-2])) : n[-1] for n in a_attrs}

                # load nodes corresponding to textual entities
                t_nodes = [load_entity_line(l) for l in lines if l.startswith('T')]
                t_nodes = [{
                    'Node_id': '-'.join((_id, n[0])),
                    'Study_id': _id,
                    'Entity_id': n[0],
                    'Mention': n[1],
                    'Category': n[2],
                    'Start_char': n[3],
                    'End_char': n[4],
                    'Char_spans': n[5],
                    'Normalization': (a_attrs['-'.join((_id, n[0]))] if '-'.join((_id, n[0])) in a_attrs else ''),
                    } 
                    for n in t_nodes
                ]

                # load nodes corresponding to metadata 
                e_nodes = [l.replace('\t', ' ').split(' ') for l in lines if l.startswith('E')]
                e_nodes = [{
                    'Node_id': '-'.join((_id, n[0])),
                    'Study_id': _id,
                    'Entity_id': n[0],
                    'Mention': '',
                    'Category': '',
                    'Start_char': '',
                    'End_char': '',
                    'Char_spans': tuple(),
                    'Normalization': '',
                    } 
                    for n in e_nodes
                ]
                nodes = t_nodes + e_nodes
                nodes = [(n['Node_id'], n) for n in nodes]
                
                # load edges carrying relationships
                e_edges = [l.replace('\t', ' ').split(' ') for l in lines if l.startswith('E')]
                e_edges = [{
                    's': '-'.join((_id, es[0])),
                    't': '-'.join((_id, e.split(':')[1])),
                    'Edge_id': es[0],
                    'label': e.split(':')[0],
                    } 
                    for es in e_edges for e in es[1:] if (':' in e)
                ]
                
                # load edges carrying meta-relations
                r_edges = [l.replace('\t', ' ').split(' ')[:4] for l in lines if l.startswith('R')]
                r_edges = [{
                    's': '-'.join((_id, s.split(':')[1])),
                    't': '-'.join((_id, t.split(':')[1])),
                    'Edge_id': e_id,
                    'label': label,
                    } 
                    for e_id, label, s, t in r_edges
                ]
                edges = e_edges + r_edges
                edges = [(e['s'], e['t'], e) for e in edges]
                
                all_nodes += nodes
                all_edges += edges
            
    nx_graph = nx.MultiDiGraph()
    nx_graph.add_nodes_from(all_nodes)
    nx_graph.add_edges_from(all_edges)
    return nx_graph

In [14]:
nx_graph = load_graph_from_targzfile(os.path.join(path_to_data, base_dataset_name))

## 2.2 Visualize graph

[Table of content](#TOC)


In [15]:
def filter_by_study(nx_graph, study):
    nodes = [n for n in nx_graph.nodes() if nx_graph.nodes[n]['Study_id'] == study]
    return nx_graph.subgraph(nodes)


def convert_nx_to_nt_graph(nx_graph, node_colors = {}, edge_weights = {}, height = 1200, width = 1980, notebook = False):
    nodes = list(nx_graph.nodes(data = True))
    edges = [(s, t, 1 + (edge_weights[e['label']] if e['label'] in edge_weights else 0)) for s, t, e in nx_graph.edges(data = True)]
    color = [(node_colors[k] if k in node_colors else '#edc939') for k, n in nodes]
    
    nt_graph = nt.Network(directed = True, height = height, width = width, notebook = notebook)
    nt_graph.add_nodes(
        nodes = [k for k, n in nodes],
        size  = [10]*len(nodes),
        label = [n['Mention'] for k, n in nodes],
        title = [n['Category'] for k, n in nodes],
        color = color,
    )
    nt_graph.add_edges(edges)
    return nt_graph

In [77]:
study = 'NCT03931941'

In [78]:
print(df_texts[df_texts.Id == study].Text.tolist()[0])

Inclusion Criteria:
  1. ≥ 18 years old.
  2. Medical record documentation of either: a) recurrent CDI per the study definition, that includes at least one recurrence after a primary episode and has completed at least one round of standard-of-care oral antibiotic therapy, b) or has had at least two episodes of severe CDI resulting in hospitalization.
  3. A positive stool test for the presence of toxigenic C. difficile or C. difficile toxin within 30 days prior to or on the date of enrollment.
  4. Is currently taking or was just prescribed antibiotics to control CDI related diarrhea at the time of enrollment.
[Note: Subject's CDI diarrhea must be controlled (<3 unformed/loose stools/day) while
taking antibiotics during screening.]
Exclusion Criteria:
  1. Has continued CDI diarrhea despite being on a course of antibiotics prescribed for CDI treatment.
  2. Requires systemic antibiotic therapy for a condition other than CDI.
  3. Fecal microbiota transplant (FMT) within the past 6 mont

In [79]:
nx_subgraph = filter_by_study(nx_graph, study = study)
nt_subgraph = convert_nx_to_nt_graph(
    nx_subgraph, 
    node_colors = {}, 
    edge_weights = {'Temporality': 5, },
    height = 600, 
    width = 900, 
    notebook = True,
)
nt_subgraph.save_graph(os.path.join('lct', 'misc', 'lct.html'))

nt_subgraph.show(os.path.join('lct', 'misc', 'lct.html'))

In [82]:
[e for s, t, e in nx_subgraph.edges(data = True) if e['s'] == 'NCT03931941-E20']

[{'s': 'NCT03931941-E20',
  't': 'NCT03931941-T74',
  'Edge_id': 'E20',
  'label': 'Observation'},
 {'s': 'NCT03931941-E20',
  't': 'NCT03931941-T78',
  'Edge_id': 'E20',
  'label': 'Name'},
 {'s': 'NCT03931941-E20',
  't': 'NCT03931941-E42',
  'Edge_id': 'E20',
  'label': 'Numeric-Filter'},
 {'s': 'NCT03931941-E20',
  't': 'NCT03931941-E72',
  'Edge_id': 'R21',
  'label': 'During'}]

<a id="ents"></a>

# 3. LCT Entities

[Table of content](#TOC)

## 3.1 Load entities as single spans

[Table of content](#TOC)


#### Remark

Entities are multi-spanned, and are here replaced by their full spans

In [19]:
def load_entities_from_targzfile(targz_file):
    with tarfile.open(targz_file, "r:gz") as archive:
        ann_files = [(f.name, f) for f in archive.getmembers() if f.name.endswith('.ann')]
        ent_list = []
        for name, member in ann_files:
            f = archive.extractfile(member)
            if f is not None:
                _id = name.split('/')[-1][:-4]
                lines = [l.decode('utf-8').replace('⁄', '/').strip() for l in f.readlines()]
                ents = [[_id] + load_entity_line(l) for l in lines if l.startswith('T')]
                ent_list += ents
            
    df_ents = pd.DataFrame(ent_list, columns = [
        'Id', 'Entity_id', 'Mention', 'Category', 'Start_char', 'End_char', 'Char_spans',
    ])
    return df_ents

In [70]:
df_ents = load_entities_from_targzfile(os.path.join(path_to_data, base_dataset_name))
df_ents.shape

(56093, 7)

In [21]:
df_ents.head()

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans
0,NCT03865342,T1,diabetes,Condition,387,395,"(387, 395)"
1,NCT03865342,T2,diabetes,Condition-Name,387,395,"(387, 395)"
2,NCT03865342,T3,English,Language,25,32,"(25, 32)"
3,NCT03865342,T4,given birth,Condition,661,672,"(661, 672)"
4,NCT03865342,T5,given birth,Condition-Name,661,672,"(661, 672)"


In [22]:
# A sanity check:
check = []
for _id, text in df_texts.values.tolist():
    df_tmp = df_ents[df_ents.Id == _id]
    check_tmp = df_tmp.apply(
        func = lambda r: [_id, r.Mention, text[r.Start_char: r.End_char]],
        axis = 1,
    ).tolist()
    check += check_tmp
    
df_check = pd.DataFrame(check, columns = ['Id', 'Mention', 'Span'])

# Mention and Span are indeed the same for all entities:
df_check[df_check.Mention != df_check.Span]

Unnamed: 0,Id,Mention,Span


## 3.2 Map "Eq-Comparison" to  Value / Temporal

[Table of content](#TOC)

In [101]:
def get_ancestor_path(nx_graph, node, return_ancestor_level = 2):
    ancestors = nx.ancestors(nx_graph, node)
    if ancestors:
        paths = [nx.shortest_path(nx_graph, source = a, target = node) for a in ancestors]
        path = tuple(sorted(paths, key = len)[-1])
        root = path[-return_ancestor_level:][0]
        path = [
            nx_graph.get_edge_data(m, n)[0]['label']
            for m, n in zip(path[:-1], path[1:])
        ]
        path = [root, path]
    else:
        path = [node, tuple([])]
    return path



def get_descendant_paths(nx_graph, node, from_ancestor_level = 1):
    if from_ancestor_level == 0:
        root = node
    else:
        ancestors = nx.ancestors(nx_graph, node)
        if ancestors:
            paths = [nx.shortest_path(nx_graph, source = a, target = node) for a in ancestors]
            path = tuple(sorted(paths, key = len)[-1])
            root = path[-from_ancestor_level:][0]
        else:
            root = node
    return list(set([nx_graph.nodes[n]['Category'] for n in nx.descendants(nx_graph, root)]))



def get_children_edges(nx_graph, node):
    return [e['label'] for s, t, e in nx_graph.out_edges(node, data = True)]
    


def get_overlaping_entities(row, df, output_column = 'index'):
    # get entities on same Id
    df = df[df.Id == row.Id]
    df = df[df.Entity_id != row.Entity_id]
    
    lengths = df.End_char - df.Start_char
    length = row.End_char - row.Start_char
    
    idx_short = []
    idx_equal = []
    idx_large = []

    # get shorter overlaping entities
    df_short = df[lengths <= length]
    if not df_short.empty:
        short = df_short.apply(
            func = lambda r: len(set(range(r.Start_char, r.End_char)) & set(range(row.Start_char, row.End_char)))>0, 
            axis = 1,
        )
        idx_short = df_short[short][output_column].tolist()
    
    # get identical overlaping entities
    df_equal = df[lengths == length]
    if not df_equal.empty:
        equal = df_equal.apply(
            func = lambda r: (r.Start_char == row.Start_char) & (r.End_char == row.End_char), 
            axis = 1,
        )
        idx_equal = df_equal[equal][output_column].tolist()

    # get larger overlaping entities
    df_large = df[lengths > length]
    if not df_large.empty:
        large = df_large.apply(
            func = lambda r: len(set(range(r.Start_char, r.End_char)) & set(range(row.Start_char, row.End_char)))>0, 
            axis = 1,
        )
        idx_large = df_large[large][output_column].tolist()
    return (idx_short, idx_equal, idx_large)

In [24]:
df_comps = df_ents[df_ents.Category == 'Eq-Comparison']
df_comps

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans
26,NCT03865342,T29,>5 lbs,Eq-Comparison,292,298,"(292, 298)"
27,NCT03865342,T30,within the past 3 months,Eq-Comparison,673,697,"(673, 697)"
29,NCT03865342,T32,18 years of age and older,Eq-Comparison,47,72,"(47, 72)"
35,NCT03865342,T41,5.7 - 6.4,Eq-Comparison,137,146,"(137, 146)"
36,NCT03865342,T42,within the 3 months,Eq-Comparison,147,166,"(147, 166)"
...,...,...,...,...,...,...,...
56058,NCT03931772,T29,≥ 5 out of 10,Eq-Comparison,150,163,"(150, 163)"
56062,NCT03931772,T73,current,Eq-Comparison,671,678,"(671, 678)"
56066,NCT03931772,T77,Currently taking,Eq-Comparison,912,928,"(912, 928)"
56079,NCT03931772,T87,daily,Eq-Comparison,302,307,"(302, 307)"


In [25]:
root_ancestor = df_comps.apply(
    func = lambda r: get_ancestor_path(nx_graph, '-'.join([r.Id, r.Entity_id])),
    axis = 1,
).tolist()

df_comps['Root'] = [r for r, a in root_ancestor]
df_comps['Ancestors'] = [a for r, a in root_ancestor]
df_comps['Root_edges'] = df_comps.Root.apply(lambda r: get_children_edges(nx_graph, r))
df_comps['Overlaping'] = df_comps.apply(
    func = lambda r: get_overlaping_entities(r, df_ents, output_column = 'Category'),
    axis = 1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comps['Root'] = [r for r, a in root_ancestor]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comps['Ancestors'] = [a for r, a in root_ancestor]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_comps['Root_edges'] = df_comps.Root.apply(lambda r: get_children_edges(nx_graph, r))
A value is try

In [26]:
df_comps.head(2)

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans,Root,Ancestors,Root_edges,Overlaping
26,NCT03865342,T29,>5 lbs,Eq-Comparison,292,298,"(292, 298)",NCT03865342-E6,"[Numeric-Filter, Eq-Comparison]","[Eq-Comparison, Operator, Unit, Value]","([Eq-Operator, Eq-Unit, Eq-Value], [], [])"
27,NCT03865342,T30,within the past 3 months,Eq-Comparison,673,697,"(673, 697)",NCT03865342-E7,"[Temporality, Eq-Comparison]","[Eq-Comparison, Operator, Temporal-Unit, Value...","([Eq-Operator, Eq-Temporal-Unit, Eq-Value, Eq-..."


In [31]:
other_filter = df_comps.Overlaping.apply(lambda os: len(set(os[1]) & set([
    'Eq-Temporal-Recency', 'Eq-Temporal-Unit', 'Severity', 'Eq-Operator',
])) > 0)

print(sum(other_filter))


temporal_cats = ['Temporality', 'Temporality2', 'Temporality3', 'During', 'Duration', 'Duration2'] # Before, After

temporal_filter = ~other_filter & (
    df_comps.Overlaping.apply(lambda os: (os[1] == ['Eq-Temporal-Period'])) |
    df_comps.Ancestors.apply(lambda a: (len(a)>1 and a[-2] in temporal_cats)) 
)

print(sum(temporal_filter))

value_cats = ['Numeric-Filter', 'Stage', 'Dose', 'Dose2']

value_filter = ~(other_filter | temporal_filter) & (
    df_comps.Ancestors.apply(lambda a: (len(a)>1 and a[-2] in value_cats))
)

print(sum(value_filter))


temporal_filter_2 = ~(other_filter | temporal_filter | value_filter) & (
    df_comps.Overlaping.apply(lambda os: ('Eq-Temporal-Unit' in os[0]))
)

print(sum(temporal_filter_2))


value_filter_2 = ~(other_filter | temporal_filter | value_filter | temporal_filter_2) & (
    df_comps.Overlaping.apply(lambda os: ('Eq-Value' in os[0]))
)

print(sum(value_filter_2))

other_filter_2 = ~(other_filter | temporal_filter | value_filter | temporal_filter_2 | value_filter_2)

print(sum(other_filter_2))

179
2627
2001
142
310
38


In [33]:
temporal_idx = df_comps[temporal_filter | temporal_filter_2].index.tolist()
len(temporal_idx)

2769

In [34]:
value_idx = df_comps[value_filter | value_filter_2].index.tolist()
len(value_idx)

2311

In [35]:
# sanity check: these sets of indices are disjoint
len(set(value_idx) & set(temporal_idx))

0

In [108]:
df_ents.loc[temporal_idx, 'Category'] = 'Temporal'
df_ents.loc[value_idx, 'Category'] = 'Value'

## 2.3 Map "Observation" to Observation / Laboratory / Value

[Table of content](#TOC)

In [109]:
df_obs = df_ents[df_ents.Category == 'Observation']
df_obs

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans
32,NCT03865342,T38,Hemoglobin A1C,Observation,121,135,"(121, 135)"
68,NCT03865342,T48,weight loss,Observation,260,271,"(260, 271)"
112,NCT03861130,T44,fever,Observation,337,342,"(337, 342)"
129,NCT03861130,T62,fever,Observation,959,964,"(959, 964)"
133,NCT03861130,T66,echocardiographic evidence,Observation,1028,1054,"(1028, 1054)"
...,...,...,...,...,...,...,...
55967,NCT03931941,T124,adverse event,Observation,1040,1053,"(1040, 1053)"
56026,NCT03931772,T25,smoking,Observation,338,345,"(338, 345)"
56046,NCT03931772,T40,smoker,Observation,308,314,"(308, 314)"
56057,NCT03931772,T61,quit smoking,Observation,99,111,"(99, 111)"


In [110]:
root_ancestor = df_obs.apply(
    func = lambda r: get_ancestor_path(nx_graph, '-'.join([r.Id, r.Entity_id])),
    axis = 1,
).tolist()

df_obs['Root'] = [r for r, a in root_ancestor]
df_obs['Ancestors'] = [a for r, a in root_ancestor]
df_obs['Root_edges'] = df_obs.Root.apply(lambda r: get_children_edges(nx_graph, r))
df_obs['Overlaping'] = df_obs.apply(
    func = lambda r: get_overlaping_entities(r, df_ents, output_column = 'Category'),
    axis = 1,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_obs['Root'] = [r for r, a in root_ancestor]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_obs['Ancestors'] = [a for r, a in root_ancestor]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_obs['Root_edges'] = df_obs.Root.apply(lambda r: get_children_edges(nx_graph, r))
A value is trying to b

In [111]:
df_obs['Descendants'] = df_obs.Root.apply(lambda r: get_descendant_paths(nx_graph, r))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_obs['Descendants'] = df_obs.Root.apply(lambda r: get_descendant_paths(nx_graph, r))


In [112]:
df_obs.head(2)

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans,Root,Ancestors,Root_edges,Overlaping,Descendants
32,NCT03865342,T38,Hemoglobin A1C,Observation,121,135,"(121, 135)",NCT03865342-E10,[Observation],"[Observation, Name, Numeric-Filter, Temporality]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Tempo..."
68,NCT03865342,T48,weight loss,Observation,260,271,"(260, 271)",NCT03865342-E16,[Observation],"[Observation, Numeric-Filter, Temporality, Nam...","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Temporal-Period, Eq-Compari..."


In [113]:
df_obs[df_obs.Root_edges.apply(lambda l: 'Numeric-Filter' in l)]

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans,Root,Ancestors,Root_edges,Overlaping,Descendants
32,NCT03865342,T38,Hemoglobin A1C,Observation,121,135,"(121, 135)",NCT03865342-E10,[Observation],"[Observation, Name, Numeric-Filter, Temporality]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Tempo..."
68,NCT03865342,T48,weight loss,Observation,260,271,"(260, 271)",NCT03865342-E16,[Observation],"[Observation, Numeric-Filter, Temporality, Nam...","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Temporal-Period, Eq-Compari..."
155,NCT03861130,T85,diameter,Observation,816,824,"(816, 824)",NCT03861130-E50,"[Found-By, Observation]","[Observation, Name, Numeric-Filter]","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Comparison, Eq-Unit, Observ..."
177,NCT03867227,T18,CHA2DS2VASc,Observation,95,106,"(95, 106)",NCT03867227-E9,[Observation],"[Observation, Name, Numeric-Filter, Numeric-Fi...","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Comparison, Observation-Nam..."
292,NCT03865381,T26,A1c,Observation,157,160,"(157, 160)",NCT03865381-E7,[Observation],"[Observation, Name, Numeric-Filter, Numeric-Fi...","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Comparison, Eq-Unit, Observ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
55853,NCT03930134,T34,weight,Observation,243,249,"(243, 249)",NCT03930134-E20,"[And, Observation]","[Observation, Name, Numeric-Filter]","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Comparison, Eq-Unit, Observ..."
55937,NCT03931941,T74,absolute neutrophil count,Observation,1225,1250,"(1225, 1250)",NCT03931941-E20,[Observation],"[Observation, Name, Numeric-Filter, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Unit,..."
55940,NCT03931941,T80,CD4 count,Observation,1180,1189,"(1180, 1189)",NCT03931941-E26,[Observation],"[Observation, Name, Numeric-Filter, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Unit,..."
56026,NCT03931772,T25,smoking,Observation,338,345,"(338, 345)",NCT03931772-E7,[Observation],"[Observation, Numeric-Filter, Name]","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Comparison, Eq-Temporal-Uni..."


In [118]:
df_obs[df_obs.Descendants.apply(lambda ds: 'Study' in ds)]

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans,Root,Ancestors,Root_edges,Overlaping,Descendants
32,NCT03865342,T38,Hemoglobin A1C,Observation,121,135,"(121, 135)",NCT03865342-E10,[Observation],"[Observation, Name, Numeric-Filter, Temporality]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Tempo..."
68,NCT03865342,T48,weight loss,Observation,260,271,"(260, 271)",NCT03865342-E16,[Observation],"[Observation, Numeric-Filter, Temporality, Nam...","([Observation-Name], [Observation-Name], [])","[, Observation, Eq-Temporal-Period, Eq-Compari..."
482,NCT03867019,T64,pain,Observation,535,539,"(535, 539)",NCT03867019-E10,[Observation],"[Observation, Name, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Observation-Name]"
4098,NCT03865953,T42,pain,Observation,295,299,"(295, 299)",NCT03865953-E19,"[Or, Caused-By, And, Observation]","[Observation, Name, Duration, And]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Tempo..."
4109,NCT03865953,T56,glycated haemoglobin level,Observation,448,474,"(448, 474)",NCT03865953-E29,"[Abbrev-Of, Observation]","[Observation, Name, Equivalent-To]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Tempo..."
...,...,...,...,...,...,...,...,...,...,...,...,...
54401,NCT03866720,T72,diet,Observation,344,348,"(344, 348)",NCT03866720-E42,"[Asserted, Observation]","[Observation, Name, Example-Of]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Stability, Observation-..."
55476,NCT03930121,T62,computerized naming task,Observation,300,324,"(300, 324)",NCT03930121-E17,[Observation],"[Observation, Name, Numeric-Filter, Equivalent...","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Conditio..."
55937,NCT03931941,T74,absolute neutrophil count,Observation,1225,1250,"(1225, 1250)",NCT03931941-E20,[Observation],"[Observation, Name, Numeric-Filter, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Unit,..."
55940,NCT03931941,T80,CD4 count,Observation,1180,1189,"(1180, 1189)",NCT03931941-E26,[Observation],"[Observation, Name, Numeric-Filter, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Unit,..."


In [105]:
df_obs[df_obs.Id == 'NCT03931941']

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans,Root,Ancestors,Root_edges,Overlaping,Descendants
55937,NCT03931941,T74,absolute neutrophil count,Observation,1225,1250,"(1225, 1250)",NCT03931941-E20,[Observation],"[Observation, Name, Numeric-Filter, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Unit,..."
55940,NCT03931941,T80,CD4 count,Observation,1180,1189,"(1180, 1189)",NCT03931941-E26,[Observation],"[Observation, Name, Numeric-Filter, During]","([Observation-Name], [Observation-Name], [])","[, Observation, Study, Eq-Comparison, Eq-Unit,..."
55945,NCT03931941,T84,stool test,Observation,369,379,"(369, 379)",NCT03931941-E30,"[Found-By, Observation]","[Observation, Specimen, Temporality, Polarity,...","([Specimen, Observation-Name], [], [])","[, Eq-Value, Study, Eq-Comparison, Polarity, E..."
55967,NCT03931941,T124,adverse event,Observation,1040,1053,"(1040, 1053)",NCT03931941-E58,"[And, Observation]","[Observation, Name, Severity, Caused-By]","([Observation-Name], [Observation-Name], [])","[, Observation, Procedure-Name, Severity, Obse..."


In [106]:
df_ents[df_ents.Mention == '<200/mm3']

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans
55923,NCT03931941,T59,<200/mm3,Eq-Comparison,1190,1198,"(1190, 1198)"


In [66]:
print(df_texts[df_texts.Id == 'NCT03931941'].Text.tolist()[0])

Inclusion Criteria:
  1. ≥ 18 years old.
  2. Medical record documentation of either: a) recurrent CDI per the study definition, that includes at least one recurrence after a primary episode and has completed at least one round of standard-of-care oral antibiotic therapy, b) or has had at least two episodes of severe CDI resulting in hospitalization.
  3. A positive stool test for the presence of toxigenic C. difficile or C. difficile toxin within 30 days prior to or on the date of enrollment.
  4. Is currently taking or was just prescribed antibiotics to control CDI related diarrhea at the time of enrollment.
[Note: Subject's CDI diarrhea must be controlled (<3 unformed/loose stools/day) while
taking antibiotics during screening.]
Exclusion Criteria:
  1. Has continued CDI diarrhea despite being on a course of antibiotics prescribed for CDI treatment.
  2. Requires systemic antibiotic therapy for a condition other than CDI.
  3. Fecal microbiota transplant (FMT) within the past 6 mont

In [67]:
df_ents[df_ents.Mention == 'CD4 count']

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans
51437,NCT03869073,T74,CD4 count,Observation,1088,1097,"(1088, 1097)"
51517,NCT03869073,T161,CD4 count,Observation-Name,1088,1097,"(1088, 1097)"
55940,NCT03931941,T80,CD4 count,Observation,1180,1189,"(1180, 1189)"
55941,NCT03931941,T81,CD4 count,Observation-Name,1180,1189,"(1180, 1189)"


In [61]:
list(set([e for es in df_obs.Root_edges for e in es]))

['Name',
 'Or',
 'Numeric-Filter',
 'Found-By',
 'Duration',
 'Location',
 'Caused-By',
 'Location2',
 'Duration2',
 'Acuteness',
 'Temporality2',
 'Minimum-Count',
 'Stage',
 'Temporality',
 'Stage2',
 'Severity2',
 'Provider',
 'Severity',
 'Polarity2',
 'Example-Of',
 'Abbrev-Of',
 'Polarity',
 'Temporality3',
 'Stability',
 'Specimen2',
 'And',
 'Specimen',
 'Observation',
 'After',
 'Code',
 'Name2',
 'If-Then',
 'Treatment-For',
 'Equivalent-To',
 'Before',
 'Using',
 'During']

## 2.3 Get flattened non-overlapping entities

[Table of content](#TOC)

In [40]:

df_ents.loc[(df_ents.Category == 'Life-Stage-And-Gender'), 'Category'] = 'Person'
df_ents.loc[(df_ents.Category == 'Age'), 'Category'] = 'Person'
df_ents.loc[(df_ents.Category.isin(['Modifier', 'Severity', 'Assertion', 'Stability', 'Acuteness', 'Polarity'])), 'Category'] = 'Qualifier'

In [51]:
df_ents[df_ents.Category == 'Observation']

Unnamed: 0,Id,Entity_id,Mention,Category,Start_char,End_char,Char_spans
32,NCT03865342,T38,Hemoglobin A1C,Observation,121,135,"(121, 135)"
68,NCT03865342,T48,weight loss,Observation,260,271,"(260, 271)"
112,NCT03861130,T44,fever,Observation,337,342,"(337, 342)"
129,NCT03861130,T62,fever,Observation,959,964,"(959, 964)"
133,NCT03861130,T66,echocardiographic evidence,Observation,1028,1054,"(1028, 1054)"
...,...,...,...,...,...,...,...
55967,NCT03931941,T124,adverse event,Observation,1040,1053,"(1040, 1053)"
56026,NCT03931772,T25,smoking,Observation,338,345,"(338, 345)"
56046,NCT03931772,T40,smoker,Observation,308,314,"(308, 314)"
56057,NCT03931772,T61,quit smoking,Observation,99,111,"(99, 111)"


In [36]:
# df_ents.Category.unique().tolist()

categories = [
    # domain
    'Condition',
    'Procedure',
    'Drug',
    'Person', # replaces 'Life-Stage-And-Gender' and 'Age'
    'Laboratory',
    'Observation', # TODO - split into: lab, vital, clinical-score, survey, social-habit
    # 'Device' missing
    
    # field
    'Value',     # replaces part of 'Eq-Comparison'
    'Temporal',  # replaces part of 'Eq-Comparison'
    'Qualifier', # replaces 'Modifier', 'Severity', 'Assertion', 'Stability', 'Acuteness', 'Polarity'
    

    
    # unnecessary duplicate
    # 'Condition-Name',
    # 'Observation-Name',
    # 'Procedure-Name',
    # 'Organism-Name',
    # 'Allergy-Name',
    # 'Immunization-Name',
    # 'Condition-Type',
    # 'Drug-Name',
    
    # fine-grained over 'Eq-Comparison'
    # 'Eq-Value',
    # 'Eq-Operator',
    # 'Eq-Unit',
    # 'Eq-Temporal-Unit',
    # 'Eq-Temporal-Period',
    # 'Eq-Temporal-Recency',
    
    # useless
    # 'Language',
    # 'Provider',
    # 'Study',
    # 'Family-Member',
    # 'Location',
    # 'Birth',
    # 'Death',
    # 'Insurance',
    # 'Ethnicity',
    # 'Risk',
    
    
    # TODO
    # 'Contraindication',
    # 'Criteria-Count',
    
    # other
    # 'Immunization'
    # 'Indication',
    # 'Encounter',
    # 'Negation',
    # 'Exception',
    # 'Coreference',
    # 'Organism',
    # 'Specimen',
    # 'Allergy',
    # 'Other',
    # 'Code',

]

In [38]:
df_ents = get_ner_entities(df_texts, df_ents, categories)

In [40]:
df_ents.shape

(30691, 6)

In [41]:
df_ents.head()

Unnamed: 0,Id,Mention,Start_char,End_char,Entity_id,Category
0,NCT03860012,flammatory bowel disease\n2,25,51,"(T13, T14)",Condition
1,NCT03860012,et,57,59,"(T31, T32)",Eq-Comparison
2,NCT03860012,otrexate at,60,72,"(T15, T16)",Drug
3,NCT03860012,folat,100,106,"(T19,)",Polarity
4,NCT03860012,levels at on,107,120,"(T17, T18)",Observation


In [32]:
df_ents.to_csv(os.path.join(path_to_data, final_dataset_name, 'lct_ents.tsv'), sep = "\t", index = False)

<a id="bio"></a>

## 2.4 Convert entities to BIO format

[Table of content](#TOC)


In [33]:
df_spans = convert_to_bio(df_texts, df_ents)
df_spans.shape

(77302, 4)

In [34]:
df_spans.head()

Unnamed: 0,Id,Sequence_id,Mention,Category
0,NCT00050349_exc,NCT00050349_exc_0,Patients with,O
1,NCT00050349_exc,NCT00050349_exc_0,symptomatic,Qualifier
2,NCT00050349_exc,NCT00050349_exc_0,,O
3,NCT00050349_exc,NCT00050349_exc_0,CNS metastases,Condition
4,NCT00050349_exc,NCT00050349_exc_0,or,O


In [35]:
df_spans.to_csv(os.path.join(path_to_data, final_dataset_name, 'lct_spans.tsv'), sep = "\t", index = False)

In [36]:
tokenizer = English()
df_bio = convert_to_bio(df_texts, df_ents, tokenizer = lambda s: [t.text for t in tokenizer(s)])

df_bio.shape

(205982, 4)

In [37]:
df_bio.head(10)

Unnamed: 0,Id,Sequence_id,Mention,Category
0,NCT00050349_exc,NCT00050349_exc_0,Patients,O
1,NCT00050349_exc,NCT00050349_exc_0,with,O
2,NCT00050349_exc,NCT00050349_exc_0,symptomatic,B-Qualifier
3,NCT00050349_exc,NCT00050349_exc_0,,O
4,NCT00050349_exc,NCT00050349_exc_0,CNS,B-Condition
5,NCT00050349_exc,NCT00050349_exc_0,metastases,I-Condition
6,NCT00050349_exc,NCT00050349_exc_0,,O
7,NCT00050349_exc,NCT00050349_exc_0,or,O
8,NCT00050349_exc,NCT00050349_exc_0,leptomeningeal,B-Condition
9,NCT00050349_exc,NCT00050349_exc_0,involvement,I-Condition


In [38]:
df_bio.to_csv(os.path.join(path_to_data, final_dataset_name, 'lct_bio.tsv'), sep = "\t", index = False)

[Table of content](#TOC)