# Preprocess into spaCy's Doc/DocBin format

In [1]:
import gzip
import re
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens import DocBin

In [2]:
# Presumably the directory the serialized corpus is meant to go to; it is not
# referenced by any of the cells visible here — TODO confirm.
output_dir = Path("corpus")

In [3]:
# Blank English pipeline; below it is only called on the note text to produce
# tokenized Doc objects.
nlp = spacy.blank("en")

## Load data and convert spans

In [4]:
# Read the gzip-compressed training split into a DataFrame.
train_path = "../input/nbme-score-clinical-patient-notes/train_split.json.gz"
with gzip.open(train_path, "r") as fin:
    df = pd.read_json(fin)

In [5]:
def process_spans(span_str: list) -> list:
    """
    Parse raw location strings into lists of ``(start, end)`` integer tuples.

    ``span_str`` is a list of strings, one per annotation. Every run of digits
    in a string is read as an integer and consecutive integers are paired up,
    so any non-digit delimiter (space, comma, semicolon) is accepted. An
    annotation made of several segments therefore yields several tuples.
    Examples:
    ['696 724'] -> [[(696, 724)]]
    ['501 517', '482 488;522 530'] -> [[(501, 517)], [(482, 488), (522, 530)]]

    Returns:
        A list with one entry per input string; each entry is a list of
        ``(start, end)`` tuples.

    Raises:
        ValueError: if a string contains an odd number of integers and so
            cannot be split into (start, end) pairs.
    """
    spans = []
    for s in span_str:
        ints = [int(tok) for tok in re.findall(r"\d+", s)]
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(ints) % 2 != 0:
            raise ValueError(f"Odd number of offsets in location string: {s!r}")
        # Pair even-indexed starts with odd-indexed ends.
        spans.append(list(zip(ints[::2], ints[1::2])))
    return spans


# Replace each row's raw location strings with parsed lists of (start, end) tuples.
df.location = df.location.apply(process_spans)

In [6]:
df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold
0,16000,0,16,0,[dad with recent heart attcak],"[[(696, 724)]]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,4
1,16001,0,16,1,"[mom with ""thyroid disease]","[[(668, 693)]]",Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,4
2,16002,0,16,2,[chest pressure],"[[(203, 217)]]",Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,4
3,16003,0,16,3,"[intermittent episodes, episode]","[[(70, 91)], [(176, 183)]]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,4
4,16004,0,16,4,[felt as if he were going to pass out],"[[(222, 258)]]",Lightheaded,HPI: 17yo M presents with palpitations. Patien...,4


### Validating spans

In [7]:
# Check each annotation text against the note text selected by its character
# spans. The segments of a multi-part span are joined with a single space.
# Nothing is printed for rows that pass.
for i, r in df.iterrows():
    # zip() below stops at the shorter sequence, so report count mismatches
    # explicitly instead of letting them pass unnoticed.
    if len(r.annotation) != len(r.location):
        print(f"Row: {i}")
        print(f"Expected {len(r.annotation)} spans, got {len(r.location)}")
    for expected, span in zip(r.annotation, r.location):
        actual = " ".join(r.pn_history[start:end] for start, end in span)
        if expected != actual:
            print(f"Row: {i}")
            print(f"Expected: {expected}")
            print(f"Actual: {actual}")

## Select one example case and feature number

In [8]:
# Restrict the data to a single (case, feature) pair used as the running example.
case_num = 0
feature_num = 0
is_example = (df.case_num == case_num) & (df.feature_num == feature_num)
sub_df = df.loc[is_example, :]

In [9]:
sub_df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold
0,16000,0,16,0,[dad with recent heart attcak],"[[(696, 724)]]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,4
13,41000,0,41,0,[],[],Family-history-of-MI-OR-Family-history-of-myoc...,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,2
26,46000,0,46,0,[father: heart attack],"[[(824, 844)]]",Family-history-of-MI-OR-Family-history-of-myoc...,Mr. Cleveland is a 17yo M who was consented by...,1
39,82000,0,82,0,[Father MI],"[[(622, 631)]]",Family-history-of-MI-OR-Family-history-of-myoc...,17 yo M w/ no cardiac or arrhythmia PMH presen...,0
52,100000,0,100,0,[Dad-MI],"[[(735, 741)]]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: Dillon Cleveland is an otherwise healthy ...,4


In [10]:
sub_df.fold.value_counts()

4    20
2    20
1    20
0    20
3    20
Name: fold, dtype: int64

In [11]:
# The selection is filtered on one feature_num, so exactly one distinct
# feature text is expected; the assert guards that before it is displayed.
feature_text = sub_df.feature_text.unique()
assert len(feature_text) == 1
feature_text[0]

'Family-history-of-MI-OR-Family-history-of-myocardial-infarction'

### Visualizing spans

In [12]:
sub_df.location

0                     [[(696, 724)]]
13                                []
26                    [[(824, 844)]]
39                    [[(622, 631)]]
52                    [[(735, 741)]]
                    ...             
1235      [[(750, 756), (761, 763)]]
1248                              []
1261      [[(589, 595), (606, 618)]]
1274      [[(845, 847), (855, 861)]]
1287    [[(915, 924)], [(545, 570)]]
Name: location, Length: 100, dtype: object

In [13]:
def vis_row(row: pd.Series):
    """
    Render one row's note text with its annotated character spans highlighted.

    Every (start, end) pair found in ``row.location`` is passed to displaCy as
    an entity with an empty label over ``row.pn_history``. The data is handed
    over in displaCy's manual format and rendered with ``jupyter=True``.
    """
    highlights = [
        {"start": sub[0], "end": sub[1], "label": ""}
        for span in row.location
        for sub in span
    ]
    payload = {"text": row.pn_history, "ents": highlights}
    # The empty-string key matches the empty label given to every highlight.
    render_options = {"colors": {"": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}}
    spacy.displacy.render(
        payload, style="ent", options=render_options, manual=True, jupyter=True
    )

In [14]:
vis_row(sub_df.loc[533])

## Convert to Doc

Add NER annotation and metadata.

In [15]:
sub_df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold
0,16000,0,16,0,[dad with recent heart attcak],"[[(696, 724)]]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,4
13,41000,0,41,0,[],[],Family-history-of-MI-OR-Family-history-of-myoc...,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,2
26,46000,0,46,0,[father: heart attack],"[[(824, 844)]]",Family-history-of-MI-OR-Family-history-of-myoc...,Mr. Cleveland is a 17yo M who was consented by...,1
39,82000,0,82,0,[Father MI],"[[(622, 631)]]",Family-history-of-MI-OR-Family-history-of-myoc...,17 yo M w/ no cardiac or arrhythmia PMH presen...,0
52,100000,0,100,0,[Dad-MI],"[[(735, 741)]]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: Dillon Cleveland is an otherwise healthy ...,4


In [16]:
# Build one Doc per selected note, attach the row's identifiers as metadata and
# its annotated spans as entities, and collect the docs in a DocBin.
#
# store_user_data=True is needed for the metadata to survive serialization:
# DocBin does not store Doc.user_data by default.
db = DocBin(store_user_data=True)
for _, row in sub_df.iterrows():
    doc = nlp(row.pn_history)
    doc.user_data = {
        "id": row.id,
        "case_num": row.case_num,
        "pn_num": row.pn_num,
        "feature_num": row.feature_num,
    }
    ents = []
    for span in row.location:
        for start, end in span:
            # NOTE: expand to spacy default tokenization
            # NOTE(review): spans are created without a label — confirm this is
            # what the downstream consumer of doc.ents expects.
            char_span = doc.char_span(start, end, alignment_mode="expand")
            if char_span is None:
                raise ValueError(
                    f"Could not create a span for characters ({start}, {end}) "
                    f"in note id={row.id}"
                )
            ents.append(char_span)
    doc.ents = ents
    db.add(doc)

## Save train and dev sets as DocBin

In [17]:
# db.to_disk("./train.spacy")