# Concepts detection

In [49]:
%%capture
!pip install seqeval transformers datasets spacy

In [15]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/projects/medical_txt_parser


In [16]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

assert transformers.__version__ >= "4.11.0"

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [17]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"

task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "allenai/scibert_scivocab_uncased"
batch_size = 16

### Import data

In [18]:
text_files = glob.glob(train_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        filename = file.split("/")[-1].split(".")[0]
        concept = parse_concept(train_data_path + os.sep + concept_folder_name + os.sep +  filename + ".con")
        
        df = df.append(pd.DataFrame({"text": [text], "filename": [filename] , "concept": [concept]}), ignore_index=True)
df.head()

100%|██████████| 170/170 [00:47<00:00,  3.59it/s]


Unnamed: 0,text,filename,concept
0,Admission Date :\n2017-08-14\nDischarge Date :...,record-142,"{'concept_text': ['cyanotic', 'a more pervasiv..."
1,Admission Date :\n2014-10-21\nDischarge Date :...,record-54,"{'concept_text': ['intraparenchymal bleed', 'i..."
2,Admission Date :\n2017-06-13\nDischarge Date :...,record-105,"{'concept_text': ['left basilar atelectasis', ..."
3,Admission Date :\n2015-10-05\nDischarge Date :...,record-106,"{'concept_text': ['vomiting', 'asa', 'ck-mb', ..."
4,Admission Date :\n2015-06-05\nDischarge Date :...,record-107,"{'concept_text': ['his respiratory distress', ..."


In [19]:
concept_df = pd.DataFrame(columns=[ "filename"]+list(concept.keys()))
for i, file in df.iterrows():
    concept_dict = file["concept"]
    tmp = pd.DataFrame(concept_dict)
    tmp["filename"] = file["filename"]
    concept_df = concept_df.append(tmp, ignore_index=True)
concept_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,record-142,cyanotic,26,5,26,5,problem
1,record-142,a more pervasive process,169,24,169,27,problem
2,record-142,old twi v4-6,106,0,106,2,problem
3,record-142,"his new , severe global deficit",169,10,169,15,problem
4,record-142,anoxic encephalopathy,169,30,169,31,problem


### Dataset Preprocessing

In [20]:
# check start_line == end_line
concept_df[concept_df["start_line"] != concept_df["end_line"]]

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type


In the following we reformat the dataset to easily label the concepts.

Note:
* We replace multiple spaces with a single space.

In [21]:
# print a random text
preproc_data = {}

for i, row in tqdm(concept_df.iterrows()):
    filename = row["filename"]
    text = df[df["filename"] == filename]["text"].values[0]

    # text preprocessing
    text = text.lower()
    line = text.split("\n")[row["start_line"] - 1]  # NOTE: we assume that start_line == end_line
    line = " ".join(line.split()) # remove multiple spaces
    row["concept_text"] = " ".join(row["concept_text"].split()) # remove multiple spaces

    # find character index start and end of concept
    start_char_index = len(" ".join(line.split()[: row["start_word_number"]]))  # number of chars before concept
    if start_char_index > 0:
        start_char_index += 1
    end_char_index = start_char_index + len(row["concept_text"])
    assert (
        line[start_char_index:end_char_index] == row["concept_text"]
    ), f"concept_text doesn't match the found indexes. '{line[start_char_index:end_char_index]}' != '{row['concept_text']}'"

    line_id = filename + "_" + str(row["start_line"])
    if line_id not in preproc_data:
        preproc_data[line_id] = {
            "text": line,
            "problem": [],
            "test": [],
            "treatment": [],
            # use sets because the indices can repeat for various reasons
            "problem_indices_start": set(),
            "problem_indices_end": set(),
            "test_indices_start": set(),
            "test_indices_end": set(),
            "treatment_indices_start": set(),
            "treatment_indices_end": set(),
        }
    if row["concept_type"] == "problem":
        preproc_data[line_id]["problem"].append(row["concept_text"])
        preproc_data[line_id]["problem_indices_start"].add(start_char_index)
        preproc_data[line_id]["problem_indices_end"].add(end_char_index)
    elif row["concept_type"] == "test":
        preproc_data[line_id]["test"].append(row["concept_text"])
        preproc_data[line_id]["test_indices_start"].add(start_char_index)
        preproc_data[line_id]["test_indices_end"].add(end_char_index)
    elif row["concept_type"] == "treatment":
        preproc_data[line_id]["treatment"].append(row["concept_text"])
        preproc_data[line_id]["treatment_indices_start"].add(start_char_index)
        preproc_data[line_id]["treatment_indices_end"].add(end_char_index)

16525it [00:10, 1531.25it/s]


We extract the paragraph which contains the concept

In [22]:
# extract the paragraph which contains the concept
lines = text.split("\n")
parag_start_line, parag_end_line= 0,-1

for l in range(row["start_line"] - 1, -1, -1):
    if lines[l][-1] == ":":
        parag_start_line = l
        break
for l in range(row["end_line"], len(lines)):
    if lines[l][-1] == ":":
        parag_end_line = l
        break
print("\n".join(lines[parag_start_line:parag_end_line]))

past medical history :
cholecystectomy in 1994 , colonoscopy 2004 , status post tonsillectomy , status post appendectomy , status post orif of left wrist , status post left ear surgery , nsvd x5 , and hypertension .


In [23]:
preproc_df = pd.DataFrame(list(preproc_data.values()))
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,"he then became dusky and cyanotic , and stoppe...","[cyanotic, dusky, stopped breathing]",[],[],"{40, 25, 15}","{33, 20, 57}",{},{},{},{}
1,"however , they felt this event could not accou...","[a more pervasive process, his new , severe gl...",[],[],"{123, 53, 157}","{178, 147, 84}",{},{},{},{}
2,old twi v4-6 .,[old twi v4-6],[],[],{0},{12},{},{},{},{}
3,"atrial fibrillation , occasional v-paced , rat...",[atrial fibrillation],[rate],[occasional v-paced],{0},{19},{43},{47},{22},{40}
4,ekg in john :,[],[ekg],[],{},{},{0},{3},{},{}
...,...,...,...,...,...,...,...,...,...,...
7955,she is a pleasant well appearing elderly woman...,[acute distress],[],[],{53},{67},{},{},{},{}
7956,colon cancer .,[colon cancer],[],[],{0},{12},{},{},{},{}
7957,on exam she is afebrile .,[afebrile],[exam],[],{15},{23},{3},{7},{},{}
7958,6. senna 2 tablets p.o. b.i.d.,[],[],[senna],{},{},{},{},{3},{8}


In [24]:
# since no spans overlap, we can sort to get 1:1 matched index spans
# note that sets don't preserve insertion order

preproc_df["problem_indices_start"] = preproc_df["problem_indices_start"].apply(list).apply(sorted)
preproc_df["problem_indices_end"] = preproc_df["problem_indices_end"].apply(list).apply(sorted)
preproc_df["test_indices_start"] = preproc_df["test_indices_start"].apply(list).apply(sorted)
preproc_df["test_indices_end"] = preproc_df["test_indices_end"].apply(list).apply(sorted)
preproc_df["treatment_indices_start"] = preproc_df["treatment_indices_start"].apply(list).apply(sorted)
preproc_df["treatment_indices_end"] = preproc_df["treatment_indices_end"].apply(list).apply(sorted)
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,"he then became dusky and cyanotic , and stoppe...","[cyanotic, dusky, stopped breathing]",[],[],"[15, 25, 40]","[20, 33, 57]",[],[],[],[]
1,"however , they felt this event could not accou...","[a more pervasive process, his new , severe gl...",[],[],"[53, 123, 157]","[84, 147, 178]",[],[],[],[]
2,old twi v4-6 .,[old twi v4-6],[],[],[0],[12],[],[],[],[]
3,"atrial fibrillation , occasional v-paced , rat...",[atrial fibrillation],[rate],[occasional v-paced],[0],[19],[43],[47],[22],[40]
4,ekg in john :,[],[ekg],[],[],[],[0],[3],[],[]
...,...,...,...,...,...,...,...,...,...,...
7955,she is a pleasant well appearing elderly woman...,[acute distress],[],[],[53],[67],[],[],[],[]
7956,colon cancer .,[colon cancer],[],[],[0],[12],[],[],[],[]
7957,on exam she is afebrile .,[afebrile],[exam],[],[15],[23],[3],[7],[],[]
7958,6. senna 2 tablets p.o. b.i.d.,[],[],[senna],[],[],[],[],[3],[8]


In [25]:
# save to JSON to then import into Dataset object
preproc_df.to_json("dataset.jsonl", orient="records", lines=True)

In [26]:
dataset = load_dataset("json", data_files="dataset.jsonl")
dataset

Using custom data configuration default-672f128c8a5b18e4


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-672f128c8a5b18e4/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-672f128c8a5b18e4/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 7960
    })
})

In [27]:
# no train-test provided, so we create our own
dataset = dataset["train"].train_test_split()
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 5970
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 1990
    })
})

### Token Labeling

In [28]:
label_list = ['O', 'B-PROBLEM', 'I-PROBLEM', 'B-TEST', 'I-TEST', 'B-TREATMENT', 'I-TREATMENT']

custom_seq = Sequence(feature=ClassLabel(num_classes=len(label_list),
                                         names=label_list,
                                         names_file=None, id=None), length=-1, id=None)

dataset["train"].features["ner_tags"] = custom_seq
dataset["test"].features["ner_tags"] = custom_seq
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'ner_tags'],
        num_rows: 5970
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'ner_tags'],
        num_rows: 1990
    })
})

In [29]:
from tqdm.notebook import tqdm
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [30]:
def generate_row_labels(row, verbose=False):
    """ Given a row from the consolidated `Ade_corpus_v2_drug_ade_relation` dataset, 
    generates BIO tags for drug and effect entities. 
    
    """

    text = row["text"]

    labels = []
    label = "O"
    prefix = ""
    
    # while iterating through tokens, increment to traverse all drug and effect spans
    problem_index = 0
    effect_index = 0
    test_index = 0
    treatment_index = 0
    
    tokens = tokenizer(text, return_offsets_mapping=True)

    for n in range(len(tokens["input_ids"])):
        offset_start, offset_end = tokens["offset_mapping"][n]

        # should only happen for [CLS] and [SEP]
        if offset_end - offset_start == 0:
            labels.append(-100)
            continue
        
        if problem_index < len(row["problem_indices_start"]) and offset_start == row["problem_indices_start"][problem_index]:
            label = "PROBLEM"
            prefix = "B-"

        elif test_index < len(row["test_indices_start"]) and offset_start == row["test_indices_start"][test_index]:
            label = "TEST"
            prefix = "B-"

        elif treatment_index < len(row["treatment_indices_start"]) and offset_start == row["treatment_indices_start"][treatment_index]:
            label = "TREATMENT"
            prefix = "B-"
        
        labels.append(label_list.index(f"{prefix}{label}"))
            
        if problem_index < len(row["problem_indices_end"]) and offset_end == row["problem_indices_end"][problem_index]:
            label = "O"
            prefix = ""
            problem_index += 1
            
        elif test_index < len(row["test_indices_end"]) and offset_end == row["test_indices_end"][test_index]:
            label = "O"
            prefix = ""
            test_index += 1

        elif treatment_index < len(row["treatment_indices_end"]) and offset_end == row["treatment_indices_end"][treatment_index]:
            label = "O"
            prefix = ""
            treatment_index += 1

        # need to transition "inside" if we just entered an entity
        if prefix == "B-":
            prefix = "I-"
    
    if verbose:
        pprint(row)
        orig = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
        for n in range(len(labels)):
            print(orig[n], labels[n])
    tokens["labels"] = labels
    
    return tokens

In [31]:
# testing out...

generate_row_labels(dataset["train"][np.random.randint(0, len(dataset["train"]))], verbose=True)

{'problem': ['mi'],
 'problem_indices_end': [13],
 'problem_indices_start': [11],
 'test': [],
 'test_indices_end': [],
 'test_indices_start': [],
 'text': 'father had mi at 42',
 'treatment': [],
 'treatment_indices_end': [],
 'treatment_indices_start': []}
[CLS] -100
father 0
had 0
mi 1
at 0
42 0
[SEP] -100


{'input_ids': [102, 14669, 883, 4323, 235, 4637, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 6), (7, 10), (11, 13), (14, 16), (17, 19), (0, 0)], 'labels': [-100, 0, 0, 1, 0, 0, -100]}

In [32]:
labeled_dataset = dataset.map(generate_row_labels)
labeled_dataset

  0%|          | 0/5970 [00:00<?, ?ex/s]

  0%|          | 0/1990 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 5970
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 1990
    })
})

### SciBERT Model Fine-Tuning

In [33]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [34]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1
)
data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [35]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [36]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: problem, problem_indices_end, treatment, test, test_indices_start, test_indices_end, problem_indices_start, treatment_indices_end, offset_mapping, treatment_indices_start, text.
***** Running training *****
  Num examples = 5970
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1870


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0804,0.210439,0.763345,0.812885,0.787337,0.935246
2,0.0046,0.17381,0.808244,0.854571,0.830762,0.946934
3,0.0048,0.175523,0.828721,0.86523,0.846582,0.949094
4,0.0144,0.177546,0.82685,0.870914,0.84831,0.951255
5,0.0025,0.179488,0.831605,0.873757,0.85216,0.951864


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: problem, problem_indices_end, treatment, test, test_indices_start, test_indices_end, problem_indices_start, treatment_indices_end, offset_mapping, treatment_indices_start, text.
***** Running Evaluation *****
  Num examples = 1990
  Batch size = 16
Saving model checkpoint to scibert_scivocab_uncased-finetuned-ner/checkpoint-500
Configuration saved in scibert_scivocab_uncased-finetuned-ner/checkpoint-500/config.json
Model weights saved in scibert_scivocab_uncased-finetuned-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in scibert_scivocab_uncased-finetuned-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in scibert_scivocab_uncased-finetuned-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and hav

TrainOutput(global_step=1870, training_loss=0.16897597493588884, metrics={'train_runtime': 388.6147, 'train_samples_per_second': 76.811, 'train_steps_per_second': 4.812, 'total_flos': 759442317736344.0, 'train_loss': 0.16897597493588884, 'epoch': 5.0})

In [37]:
predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: problem, problem_indices_end, treatment, test, test_indices_start, test_indices_end, problem_indices_start, treatment_indices_end, offset_mapping, treatment_indices_start, text.
***** Running Prediction *****
  Num examples = 1990
  Batch size = 16


{'PROBLEM': {'f1': 0.8394604601957154,
  'number': 1845,
  'precision': 0.8197314049586777,
  'recall': 0.8601626016260162},
 'TEST': {'f1': 0.8727422003284073,
  'number': 1193,
  'precision': 0.8551890587288817,
  'recall': 0.8910310142497905},
 'TREATMENT': {'f1': 0.8512904547316673,
  'number': 1184,
  'precision': 0.8265712012728719,
  'recall': 0.8775337837837838},
 'overall_accuracy': 0.9518639561291752,
 'overall_f1': 0.8521598521598522,
 'overall_precision': 0.831605049594229,
 'overall_recall': 0.8737565135007106}

---
## See Model Outputs

We load our fine-tuned model into a `pipeline` object to run arbitrary input against it.

In [38]:
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [39]:
# something from our validation set
effect_ner_model(labeled_dataset["test"][4]["text"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'end': 1,
  'entity': 'LABEL_0',
  'index': 1,
  'score': 0.99968517,
  'start': 0,
  'word': '7'},
 {'end': 2,
  'entity': 'LABEL_0',
  'index': 2,
  'score': 0.9993272,
  'start': 1,
  'word': ')'},
 {'end': 6,
  'entity': 'LABEL_5',
  'index': 3,
  'score': 0.9974396,
  'start': 3,
  'word': 'kay'},
 {'end': 9,
  'entity': 'LABEL_6',
  'index': 4,
  'score': 0.99749196,
  'start': 7,
  'word': 'ci'},
 {'end': 11,
  'entity': 'LABEL_6',
  'index': 5,
  'score': 0.9984199,
  'start': 9,
  'word': '##el'},
 {'end': 14,
  'entity': 'LABEL_0',
  'index': 6,
  'score': 0.9998369,
  'start': 12,
  'word': '20'},
 {'end': 17,
  'entity': 'LABEL_0',
  'index': 7,
  'score': 0.99982077,
  'start': 15,
  'word': 'me'},
 {'end': 18,
  'entity': 'LABEL_0',
  'index': 8,
  'score': 0.99973756,
  'start': 17,
  'word': '##q'},
 {'end': 20,
  'entity': 'LABEL_0',
  'index': 9,
  'score': 0.9998561,
  'start': 19,
  'word': 'p'},
 {'end': 21,
  'entity': 'LABEL_0',
  'index': 10,
  'score': 0.9998

In [57]:
def visualize_entities(sentence):
    tokens = effect_ner_model(sentence)
    entities = []

    for token in tokens:
        label = int(token["entity"][-1])
        if label != 0:
            token["label"] = label_list[label]
            entities.append(token)

    params = [{"text": sentence, "ents": entities, "title": None}]

    html = displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=True,
        options={
            "colors": {
                "B-PROBLEM": "#f08080",
                "I-PROBLEM": "#f08080",
                "B-TEST": "#9bddff",
                "I-TEST": "#9bddff",
                "B-TREATMENT": "#ffdab9",
                "I-TREATMENT": "#ffdab9",
            },
        },
    )


In [61]:
# pick 5 random sentences from the test set
for i in range(5):
    index = np.random.randint(0, len(labeled_dataset["test"]))
    visualize_entities(labeled_dataset["test"][index]["text"])
    print(f"Text: {labeled_dataset['test'][index]['text']}")
    print(f"Problems: {labeled_dataset['test'][index]['problem']}")
    print(f"Tests: {labeled_dataset['test'][index]['test']}")
    print(f"Treatments: {labeled_dataset['test'][index]['treatment']}")
    print(f"{'*' * 50}\n")

Text: a biopsy showed moderately to poorly differentiated grade iii-iv adenocarcinoma on the left .
Problems: ['moderately to poorly differentiated grade iii-iv adenocarcinoma on the left']
Tests: ['a biopsy']
Treatments: []
**************************************************



Text: a hickman catheter was placed and the patient was begun on chemotherapy .
Problems: []
Tests: []
Treatments: ['chemotherapy', 'a hickman catheter']
**************************************************



Text: we will put him back on his nexium .
Problems: []
Tests: []
Treatments: ['his nexium']
**************************************************



Text: rule out sepsis .
Problems: ['sepsis']
Tests: []
Treatments: []
**************************************************



Text: novolog ( insulin aspart ) sliding scale ( subcutaneously ) sc ac
Problems: []
Tests: []
Treatments: ['novolog ( insulin aspart )']
**************************************************

