# Preprocess into spaCy's Doc/DocBin format

In [None]:
import gzip
import re

import pandas as pd
import spacy

## Load data and convert spans

In [None]:
# Open the gzip-compressed JSON training split and let pandas parse the
# decompressed stream straight into a DataFrame.
with gzip.open(
    "../input/nbme-score-clinical-patient-notes/train_split.json.gz", mode="r"
) as handle:
    df = pd.read_json(handle)

In [None]:
def process_spans(span_str: "list[str]") -> "list[list[tuple[int, int]]]":
    """
    Convert raw location strings into lists of (start, end) integer tuples.

    Each input string describes one annotation and holds one or more
    "start end" pairs. Only the runs of digits are extracted, so it does not
    matter whether the pairs are delimited by commas or semicolons. Examples:
    ['696 724'] -> [[(696, 724)]]
    ['501 517', '482 488;522 530'] -> [[(501, 517)], [(482, 488), (522, 530)]]

    Args:
        span_str: The location strings to convert, one string per annotation.
            Despite the parameter name, this is a collection of strings, not
            a single string.

    Returns:
        One list per input string, holding that string's (start, end) pairs
        in the order they appear.

    Raises:
        AssertionError: If a string contains an odd number of integers and
            therefore cannot be split into (start, end) pairs.
    """
    spans = []
    for s in span_str:
        ints = [int(token) for token in re.findall(r"\d+", s)]
        assert len(ints) % 2 == 0, f"Odd number of offsets in span string: {s!r}"
        # Even positions are starts, odd positions are the matching ends.
        spans.append(list(zip(ints[::2], ints[1::2])))
    return spans

# Replace the raw location strings of every row with their parsed
# (start, end) tuples.
df["location"] = df["location"].apply(process_spans)

In [None]:
# Preview the first rows to eyeball the converted location column.
df.head()

### Validating spans

In [None]:
# Sanity-check the spans: for every annotation, the text sliced out of the
# patient note at the span offsets should reproduce the annotation text.
# The pieces of a multi-part span are joined with a single space before
# comparing. Every discrepancy is printed; nothing is raised or modified.
for i, r in df.iterrows():
    # zip() stops at the shorter sequence, so report a count mismatch
    # explicitly instead of silently skipping the unpaired entries.
    if len(r.annotation) != len(r.location):
        print(f"Row: {i}")
        print(f"Annotation count: {len(r.annotation)}")
        print(f"Span count: {len(r.location)}")
    for expected, span in zip(r.annotation, r.location):
        actual = " ".join(r.pn_history[sub[0]:sub[1]] for sub in span)
        if expected != actual:
            print(f"Row: {i}")
            print(f"Expected: {expected}")
            print(f"Actual: {actual}")

## Select one example case and feature number

In [None]:
# Narrow the data down to a single (case, feature) combination to inspect;
# change the two numbers below to look at a different one.
case_num, feature_num = 0, 0
sub_df = df.loc[
    (df["case_num"] == case_num) & (df["feature_num"] == feature_num), :
]

In [None]:
# Preview the first rows of the selected case/feature subset.
sub_df.head()

In [None]:
# Count how many rows of the subset fall into each fold.
sub_df.fold.value_counts()

In [None]:
# The subset was filtered to one case/feature pair, so it is expected to
# carry exactly one distinct feature description; fail loudly otherwise.
feature_text = sub_df["feature_text"].unique()
assert len(feature_text) == 1
# Display that single description.
feature_text[0]

### Visualizing spans

In [None]:
# Show the span lists of the subset that will be highlighted below.
sub_df.location

In [None]:
def vis_row(row: pd.Series):
    """
    Render one row's patient note with all of its spans highlighted.

    Every (start, end) piece found in ``row.location`` becomes one unlabeled
    entity over ``row.pn_history``. The result is passed to displaCy in
    manual mode and displayed in the notebook; nothing is returned.

    Args:
        row: A row providing ``pn_history`` (the note text) and ``location``
            (a list of spans, each a list of (start, end) pieces).
    """
    highlights = [
        {"start": piece[0], "end": piece[1], "label": ""}
        for group in row.location
        for piece in group
    ]
    # The empty label is mapped to a gradient so the unlabeled spans still
    # get a visible background.
    gradient = {"": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    spacy.displacy.render(
        {"text": row.pn_history, "ents": highlights},
        style="ent",
        options={"colors": gradient},
        manual=True,
        jupyter=True,
    )

In [None]:
# Visualize one example row. NOTE: .loc is label-based, so 1287 must be an
# index label present in sub_df (it is not a positional offset).
vis_row(sub_df.loc[1287])