[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/concepts_ner/concepts_ner_scibert.ipynb)

# Concepts detection

In [None]:
%%capture
!pip install seqeval transformers datasets spacy

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

Mounted at /content/drive
/content/drive/MyDrive/projects/medical_txt_parser


In [None]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

assert transformers.__version__ >= "4.11.0"

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [None]:
train_data_path = "data/train"
val_data_path = "data/val"
processed_data_path = "data/processed"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"

task = "ner"  # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "allenai/scibert_scivocab_uncased" # TODO: try with cased
batch_size = 16


### Import data

In [None]:
text_files = glob.glob(train_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        filename = file.split("/")[-1].split(".")[0]
        concept = parse_concept(train_data_path + os.sep + concept_folder_name + os.sep +  filename + ".con")
        
        df = df.append(pd.DataFrame({"text": [text], "filename": [filename] , "concept": [concept]}), ignore_index=True)
df.head()

100%|██████████| 170/170 [00:57<00:00,  2.96it/s]


Unnamed: 0,text,filename,concept
0,Admission Date :\n2017-08-14\nDischarge Date :...,record-142,"{'concept_text': ['cyanotic', 'a more pervasiv..."
1,Admission Date :\n2014-10-21\nDischarge Date :...,record-54,"{'concept_text': ['intraparenchymal bleed', 'i..."
2,Admission Date :\n2017-06-13\nDischarge Date :...,record-105,"{'concept_text': ['left basilar atelectasis', ..."
3,Admission Date :\n2015-10-05\nDischarge Date :...,record-106,"{'concept_text': ['vomiting', 'asa', 'ck-mb', ..."
4,Admission Date :\n2015-06-05\nDischarge Date :...,record-107,"{'concept_text': ['his respiratory distress', ..."


In [None]:
concept_df = pd.DataFrame(columns=[ "filename"]+list(concept.keys()))
for i, file in df.iterrows():
    concept_dict = file["concept"]
    tmp = pd.DataFrame(concept_dict)
    tmp["filename"] = file["filename"]
    concept_df = concept_df.append(tmp, ignore_index=True)
concept_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,record-142,cyanotic,26,5,26,5,problem
1,record-142,a more pervasive process,169,24,169,27,problem
2,record-142,old twi v4-6,106,0,106,2,problem
3,record-142,"his new , severe global deficit",169,10,169,15,problem
4,record-142,anoxic encephalopathy,169,30,169,31,problem


### Dataset Preprocessing

In [None]:
# check start_line == end_line
concept_df[concept_df["start_line"] != concept_df["end_line"]]

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type


In the following we reformat the dataset to easily label the concepts.

Note:
* We replace multiple spaces with a single space.

In [None]:
# print a random text
preproc_data = {}

for i, row in tqdm(concept_df.iterrows()):
    filename = row["filename"]
    text = df[df["filename"] == filename]["text"].values[0]

    # text preprocessing
    text = text.lower()
    line = text.split("\n")[row["start_line"] - 1]  # NOTE: we assume that start_line == end_line
    line = " ".join(line.split()) # remove multiple spaces
    row["concept_text"] = " ".join(row["concept_text"].split()) # remove multiple spaces

    # find character index start and end of concept
    start_char_index = len(" ".join(line.split()[: row["start_word_number"]]))  # number of chars before concept
    if start_char_index > 0:
        start_char_index += 1
    end_char_index = start_char_index + len(row["concept_text"])
    assert (
        line[start_char_index:end_char_index] == row["concept_text"]
    ), f"concept_text doesn't match the found indexes. '{line[start_char_index:end_char_index]}' != '{row['concept_text']}'"

    line_id = filename + "_" + str(row["start_line"])
    if line_id not in preproc_data:
        preproc_data[line_id] = {
            "text": line,
            "problem": [],
            "test": [],
            "treatment": [],
            # use sets because the indices can repeat for various reasons
            "problem_indices_start": set(),
            "problem_indices_end": set(),
            "test_indices_start": set(),
            "test_indices_end": set(),
            "treatment_indices_start": set(),
            "treatment_indices_end": set(),
        }
    if row["concept_type"] == "problem":
        preproc_data[line_id]["problem"].append(row["concept_text"])
        preproc_data[line_id]["problem_indices_start"].add(start_char_index)
        preproc_data[line_id]["problem_indices_end"].add(end_char_index)
    elif row["concept_type"] == "test":
        preproc_data[line_id]["test"].append(row["concept_text"])
        preproc_data[line_id]["test_indices_start"].add(start_char_index)
        preproc_data[line_id]["test_indices_end"].add(end_char_index)
    elif row["concept_type"] == "treatment":
        preproc_data[line_id]["treatment"].append(row["concept_text"])
        preproc_data[line_id]["treatment_indices_start"].add(start_char_index)
        preproc_data[line_id]["treatment_indices_end"].add(end_char_index)

16525it [00:11, 1471.47it/s]


We extract the paragraph which contains the concept

In [None]:
# extract the paragraph which contains the concept
lines = text.split("\n")
parag_start_line, parag_end_line= 0,-1

for l in range(row["start_line"] - 1, -1, -1):
    if lines[l][-1] == ":":
        parag_start_line = l
        break
for l in range(row["end_line"], len(lines)):
    if lines[l][-1] == ":":
        parag_end_line = l
        break
print("\n".join(lines[parag_start_line:parag_end_line]))

past medical history :
cholecystectomy in 1994 , colonoscopy 2004 , status post tonsillectomy , status post appendectomy , status post orif of left wrist , status post left ear surgery , nsvd x5 , and hypertension .


In [None]:
preproc_df = pd.DataFrame(list(preproc_data.values()))
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,"he then became dusky and cyanotic , and stoppe...","[cyanotic, dusky, stopped breathing]",[],[],"{40, 25, 15}","{33, 20, 57}",{},{},{},{}
1,"however , they felt this event could not accou...","[a more pervasive process, his new , severe gl...",[],[],"{123, 53, 157}","{178, 147, 84}",{},{},{},{}
2,old twi v4-6 .,[old twi v4-6],[],[],{0},{12},{},{},{},{}
3,"atrial fibrillation , occasional v-paced , rat...",[atrial fibrillation],[rate],[occasional v-paced],{0},{19},{43},{47},{22},{40}
4,ekg in john :,[],[ekg],[],{},{},{0},{3},{},{}
...,...,...,...,...,...,...,...,...,...,...
7955,she is a pleasant well appearing elderly woman...,[acute distress],[],[],{53},{67},{},{},{},{}
7956,colon cancer .,[colon cancer],[],[],{0},{12},{},{},{},{}
7957,on exam she is afebrile .,[afebrile],[exam],[],{15},{23},{3},{7},{},{}
7958,6. senna 2 tablets p.o. b.i.d.,[],[],[senna],{},{},{},{},{3},{8}


In [None]:
# since no spans overlap, we can sort to get 1:1 matched index spans
# note that sets don't preserve insertion order

preproc_df["problem_indices_start"] = preproc_df["problem_indices_start"].apply(list).apply(sorted)
preproc_df["problem_indices_end"] = preproc_df["problem_indices_end"].apply(list).apply(sorted)
preproc_df["test_indices_start"] = preproc_df["test_indices_start"].apply(list).apply(sorted)
preproc_df["test_indices_end"] = preproc_df["test_indices_end"].apply(list).apply(sorted)
preproc_df["treatment_indices_start"] = preproc_df["treatment_indices_start"].apply(list).apply(sorted)
preproc_df["treatment_indices_end"] = preproc_df["treatment_indices_end"].apply(list).apply(sorted)
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,"he then became dusky and cyanotic , and stoppe...","[cyanotic, dusky, stopped breathing]",[],[],"[15, 25, 40]","[20, 33, 57]",[],[],[],[]
1,"however , they felt this event could not accou...","[a more pervasive process, his new , severe gl...",[],[],"[53, 123, 157]","[84, 147, 178]",[],[],[],[]
2,old twi v4-6 .,[old twi v4-6],[],[],[0],[12],[],[],[],[]
3,"atrial fibrillation , occasional v-paced , rat...",[atrial fibrillation],[rate],[occasional v-paced],[0],[19],[43],[47],[22],[40]
4,ekg in john :,[],[ekg],[],[],[],[0],[3],[],[]
...,...,...,...,...,...,...,...,...,...,...
7955,she is a pleasant well appearing elderly woman...,[acute distress],[],[],[53],[67],[],[],[],[]
7956,colon cancer .,[colon cancer],[],[],[0],[12],[],[],[],[]
7957,on exam she is afebrile .,[afebrile],[exam],[],[15],[23],[3],[7],[],[]
7958,6. senna 2 tablets p.o. b.i.d.,[],[],[senna],[],[],[],[],[3],[8]


We also add some lines that contain no concepts

In [None]:
# add lines with no concepts
NUMBER_LINE_PER_FILE = 4
no_concepts_df = []
# aggregate start_line by filename
list_lines = concept_df.groupby("filename")["start_line"].apply(set).to_dict()
for filename, line_nums in list_lines.items():
    # split file
    lines = df[df["filename"] == filename]["text"].values[0].split("\n")
    # get two random line that doesnt exist in line_nums
    line_num = np.random.choice(list(set(range(len(lines))) - line_nums), NUMBER_LINE_PER_FILE, replace=False)
    # add to no_concepts_df
    for l in line_num:
        if lines[l] != "":
            no_concepts_df.append({"filename": filename, "text": lines[l]})

no_concepts_df = pd.DataFrame(no_concepts_df)
for col in preproc_df:
    if col != "text":
        # initialize column of empty lists
        no_concepts_df[col] = [[] for _ in range(len(no_concepts_df))]
no_concepts_df


Unnamed: 0,filename,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,018636330_DH,She was discharged on the following medications .,[],[],[],[],[],[],[],[],[]
1,018636330_DH,ADMISSION DATE :,[],[],[],[],[],[],[],[],[]
2,018636330_DH,Discharge Summary,[],[],[],[],[],[],[],[],[]
3,018636330_DH,"KOTE , OA",[],[],[],[],[],[],[],[],[]
4,026350193_RWH,Y,[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
662,record-83,Facility :,[],[],[],[],[],[],[],[],[]
663,record-84,( End of Report ),[],[],[],[],[],[],[],[],[]
664,record-84,Admission Date :,[],[],[],[],[],[],[],[],[]
665,record-84,Discharge Date :,[],[],[],[],[],[],[],[],[]


In [None]:
# add lines with no concepts to preproc_df
preproc_df = preproc_df.append(no_concepts_df.drop(columns=["filename"]), ignore_index=True)
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,"he then became dusky and cyanotic , and stoppe...","[cyanotic, dusky, stopped breathing]",[],[],"[15, 25, 40]","[20, 33, 57]",[],[],[],[]
1,"however , they felt this event could not accou...","[a more pervasive process, his new , severe gl...",[],[],"[53, 123, 157]","[84, 147, 178]",[],[],[],[]
2,old twi v4-6 .,[old twi v4-6],[],[],[0],[12],[],[],[],[]
3,"atrial fibrillation , occasional v-paced , rat...",[atrial fibrillation],[rate],[occasional v-paced],[0],[19],[43],[47],[22],[40]
4,ekg in john :,[],[ekg],[],[],[],[0],[3],[],[]
...,...,...,...,...,...,...,...,...,...,...
8622,Facility :,[],[],[],[],[],[],[],[],[]
8623,( End of Report ),[],[],[],[],[],[],[],[],[]
8624,Admission Date :,[],[],[],[],[],[],[],[],[]
8625,Discharge Date :,[],[],[],[],[],[],[],[],[]


In [None]:
# save to JSON to then import into Dataset object
preproc_df.to_json(os.path.join(processed_data_path, "dataset-3.jsonl"), orient="records", lines=True)

In [None]:
dataset = load_dataset("json", data_files=os.path.join(processed_data_path, "dataset-3.jsonl"))
dataset

Using custom data configuration default-d3a56dfb30e39d95
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-d3a56dfb30e39d95/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 8627
    })
})

In [None]:
# no train-test provided, so we create our own
dataset = dataset["train"].train_test_split(test_size =100)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 8527
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 100
    })
})

### Token Labeling

In [None]:
label_list = ['O', 'B-PROBLEM', 'I-PROBLEM', 'B-TEST', 'I-TEST', 'B-TREATMENT', 'I-TREATMENT']

custom_seq = Sequence(feature=ClassLabel(num_classes=len(label_list),
                                         names=label_list,
                                         names_file=None, id=None), length=-1, id=None)

dataset["train"].features["ner_tags"] = custom_seq
dataset["test"].features["ner_tags"] = custom_seq
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'ner_tags'],
        num_rows: 8527
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'ner_tags'],
        num_rows: 100
    })
})

In [None]:
from tqdm.notebook import tqdm
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
def generate_row_labels(row, verbose=False):
    """ Given a row from the consolidated `Ade_corpus_v2_drug_ade_relation` dataset, 
    generates BIO tags for drug and effect entities. 
    
    """

    text = row["text"]

    labels = []
    label = "O"
    prefix = ""
    
    # while iterating through tokens, increment to traverse all drug and effect spans
    problem_index = 0
    effect_index = 0
    test_index = 0
    treatment_index = 0
    
    tokens = tokenizer(text, return_offsets_mapping=True)

    for n in range(len(tokens["input_ids"])):
        offset_start, offset_end = tokens["offset_mapping"][n]

        # should only happen for [CLS] and [SEP]
        if offset_end - offset_start == 0:
            labels.append(-100)
            continue
        
        if problem_index < len(row["problem_indices_start"]) and offset_start == row["problem_indices_start"][problem_index]:
            label = "PROBLEM"
            prefix = "B-"

        elif test_index < len(row["test_indices_start"]) and offset_start == row["test_indices_start"][test_index]:
            label = "TEST"
            prefix = "B-"

        elif treatment_index < len(row["treatment_indices_start"]) and offset_start == row["treatment_indices_start"][treatment_index]:
            label = "TREATMENT"
            prefix = "B-"
        
        labels.append(label_list.index(f"{prefix}{label}"))
            
        if problem_index < len(row["problem_indices_end"]) and offset_end == row["problem_indices_end"][problem_index]:
            label = "O"
            prefix = ""
            problem_index += 1
            
        elif test_index < len(row["test_indices_end"]) and offset_end == row["test_indices_end"][test_index]:
            label = "O"
            prefix = ""
            test_index += 1

        elif treatment_index < len(row["treatment_indices_end"]) and offset_end == row["treatment_indices_end"][treatment_index]:
            label = "O"
            prefix = ""
            treatment_index += 1

        # need to transition "inside" if we just entered an entity
        if prefix == "B-":
            prefix = "I-"
    
    if verbose:
        pprint(row)
        orig = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
        for n in range(len(labels)):
            print(orig[n], labels[n])
    tokens["labels"] = labels
    
    return tokens

In [None]:
# testing out...

generate_row_labels(dataset["train"][np.random.randint(0, len(dataset["train"]))], verbose=True)

{'problem': [],
 'problem_indices_end': [],
 'problem_indices_start': [],
 'test': [],
 'test_indices_end': [],
 'test_indices_start': [],
 'text': 'aortic valve replacement .',
 'treatment': ['aortic valve replacement'],
 'treatment_indices_end': [24],
 'treatment_indices_start': [0]}
[CLS] -100
aortic 5
valve 6
replacement 6
. 0
[SEP] -100


{'input_ids': [102, 8641, 9042, 7278, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 6), (7, 12), (13, 24), (25, 26), (0, 0)], 'labels': [-100, 5, 6, 6, 0, -100]}

In [None]:
labeled_dataset = dataset.map(generate_row_labels)
labeled_dataset

  0%|          | 0/8527 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 8527
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 100
    })
})

### SciBERT Model Fine-Tuning

In [None]:
# Local model
label_names = ["O", "B-PROBLEM", "I-PROBLEM", "B-TEST", "I-TEST", "B-TREATMENT", "I-TREATMENT"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), label2id=label2id, id2label=id2label)

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [None]:
model_name = model_checkpoint.split("/")[-1]
model_folder_name = f"{model_name}-finetuned-{task}-3"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1
)
data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: offset_mapping, problem_indices_start, treatment_indices_end, test_indices_end, problem_indices_end, test_indices_start, problem, test, text, treatment_indices_start, treatment.
***** Running training *****
  Num examples = 8527
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2665


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1323,0.191513,0.840206,0.88587,0.862434,0.951511
2,0.0812,0.16466,0.890052,0.923913,0.906667,0.956549
3,0.0583,0.15678,0.873016,0.896739,0.884718,0.955919
4,0.0293,0.171091,0.893617,0.913043,0.903226,0.955919
5,0.0944,0.174388,0.889474,0.918478,0.903743,0.956549


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: offset_mapping, problem_indices_start, treatment_indices_end, test_indices_end, problem_indices_end, test_indices_start, problem, test, text, treatment_indices_start, treatment.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: offset_mapping, problem_indices_start, treatment_indices_end, test_indices_end, problem_indices_end, test_indices_start, problem, test, text, treatment_indices_start, treatment.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: offset_mapping, problem_indices_start, treatment_indices_end, test_indic

TrainOutput(global_step=2665, training_loss=0.1841635415480799, metrics={'train_runtime': 480.017, 'train_samples_per_second': 88.82, 'train_steps_per_second': 5.552, 'total_flos': 1105010650838952.0, 'train_loss': 0.1841635415480799, 'epoch': 5.0})

In [None]:
# save model
trainer.save_model(f"models/{model_folder_name}")

Saving model checkpoint to models/scibert_scivocab_uncased-finetuned-ner-3
Configuration saved in models/scibert_scivocab_uncased-finetuned-ner-3/config.json
Model weights saved in models/scibert_scivocab_uncased-finetuned-ner-3/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-finetuned-ner-3/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-finetuned-ner-3/special_tokens_map.json


In [None]:
predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: offset_mapping, test_indices_end, problem, text, problem_indices_end, treatment_indices_start, problem_indices_start, treatment, test, test_indices_start, treatment_indices_end.
***** Running Prediction *****
  Num examples = 2198
  Batch size = 16


{'PROBLEM': {'f1': 0.8303964757709251,
  'number': 1760,
  'precision': 0.8055555555555556,
  'recall': 0.8568181818181818},
 'TEST': {'f1': 0.8490641178813222,
  'number': 1213,
  'precision': 0.8212634822804314,
  'recall': 0.8788128606760099},
 'TREATMENT': {'f1': 0.8327566320645906,
  'number': 1248,
  'precision': 0.8004434589800443,
  'recall': 0.8677884615384616},
 'overall_accuracy': 0.9425913795362243,
 'overall_f1': 0.836459286367795,
 'overall_precision': 0.8085341587441963,
 'overall_recall': 0.8663823738450604}

---
## See Model Outputs

We load our fine-tuned model into a `pipeline` object to run arbitrary input against it.

In [None]:
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [None]:
def visualize_entities(sentence):
    tokens = effect_ner_model(sentence)
    entities = []

    for token in tokens:
        label = int(token["entity"][-1])
        if label != 0:
            token["label"] = label_list[label]
            entities.append(token)

    params = [{"text": sentence, "ents": entities, "title": None}]

    html = displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=True,
        options={
            "colors": {
                "B-PROBLEM": "#f08080",
                "I-PROBLEM": "#f08080",
                "B-TEST": "#9bddff",
                "I-TEST": "#9bddff",
                "B-TREATMENT": "#ffdab9",
                "I-TREATMENT": "#ffdab9",
            },
        },
    )


In [None]:
# pick 5 random sentences from the test set
for i in range(5):
    index = np.random.randint(0, len(labeled_dataset["test"]))
    visualize_entities(labeled_dataset["test"][index]["text"])
    print(f"Text: {labeled_dataset['test'][index]['text']}")
    print(f"Problems: {labeled_dataset['test'][index]['problem']}")
    print(f"Tests: {labeled_dataset['test'][index]['test']}")
    print(f"Treatments: {labeled_dataset['test'][index]['treatment']}")
    print(f"{'*' * 50}\n")

Text: a biopsy showed moderately to poorly differentiated grade iii-iv adenocarcinoma on the left .
Problems: ['moderately to poorly differentiated grade iii-iv adenocarcinoma on the left']
Tests: ['a biopsy']
Treatments: []
**************************************************



Text: a hickman catheter was placed and the patient was begun on chemotherapy .
Problems: []
Tests: []
Treatments: ['chemotherapy', 'a hickman catheter']
**************************************************



Text: we will put him back on his nexium .
Problems: []
Tests: []
Treatments: ['his nexium']
**************************************************



Text: rule out sepsis .
Problems: ['sepsis']
Tests: []
Treatments: []
**************************************************



Text: novolog ( insulin aspart ) sliding scale ( subcutaneously ) sc ac
Problems: []
Tests: []
Treatments: ['novolog ( insulin aspart )']
**************************************************



## Evaluate the model

In [None]:

# Local model
label_names = ["O", "B-PROBLEM", "I-PROBLEM", "B-TEST", "I-TEST", "B-TREATMENT", "I-TREATMENT"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model_checkpoint = f"models/{model_folder_name}"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, label2id=label2id, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)


loading configuration file models/scibert_scivocab_uncased-finetuned-ner-3/config.json
Model config BertConfig {
  "_name_or_path": "models/scibert_scivocab_uncased-finetuned-ner-3",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PROBLEM",
    "2": "I-PROBLEM",
    "3": "B-TEST",
    "4": "I-TEST",
    "5": "B-TREATMENT",
    "6": "I-TREATMENT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-PROBLEM": 1,
    "B-TEST": 3,
    "B-TREATMENT": 5,
    "I-PROBLEM": 2,
    "I-TEST": 4,
    "I-TREATMENT": 6,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transfo

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: problem_indices_end, treatment_indices_start, test_indices_end, test, treatment, treatment_indices_end, text, offset_mapping, test_indices_start, problem_indices_start, problem.
***** Running Prediction *****
  Num examples = 1990
  Batch size = 16


{'PROBLEM': {'precision': 0.8915313225058005,
  'recall': 0.9203592814371258,
  'f1': 0.90571596935769,
  'number': 1670},
 'TEST': {'precision': 0.9132569558101473,
  'recall': 0.9246064623032312,
  'f1': 0.9188966652943598,
  'number': 1207},
 'TREATMENT': {'precision': 0.9067599067599068,
  'recall': 0.9350961538461539,
  'f1': 0.9207100591715977,
  'number': 1248},
 'overall_precision': 0.9024332624616112,
 'overall_recall': 0.926060606060606,
 'overall_f1': 0.9140942809284518,
 'overall_accuracy': 0.9751068856885688}

In [6]:
text_files = glob.glob(val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
df = df[df.text != ""]
# reset index
df = df.reset_index(drop=True)
df

100%|██████████| 128/128 [00:00<00:00, 224.23it/s]


Unnamed: 0,text,filename,line_number
0,006544894,0006,0
1,NVH,0006,1
2,65104826,0006,2
3,1/2/2004 12:00:00 AM,0006,3
4,"VT s/p cardiac cath , stent and amp ; amio loa...",0006,4
...,...,...,...
13612,TD :,0475,129
13613,03/10/97 2:14 P,0475,130
13614,cc :,0475,131
13615,"LENTNY MAYLUTYNA WORK , M.D.",0475,132


In [8]:
lines = df["text"].values
outputs = effect_ner_model(list(lines), aggregation_strategy ="simple") #TODO: add verbose

NameError: name 'effect_ner_model' is not defined

In [36]:
# save outputs as pkl
import joblib
predictions_path = "predictions"
joblib.dump(outputs, os.path.join(predictions_path, "outputs-3-val.pkl"))

['outputs-3-val.pkl']

In [9]:
import joblib
outputs = joblib.load(os.path.join(predictions_path, "outputs-3-val.pkl"))
df["outputs"] = outputs
df

Unnamed: 0,text,filename,line_number,outputs
0,006544894,0006,0,[]
1,NVH,0006,1,"[{'entity_group': 'PROBLEM', 'score': 0.854595..."
2,65104826,0006,2,[]
3,1/2/2004 12:00:00 AM,0006,3,[]
4,"VT s/p cardiac cath , stent and amp ; amio loa...",0006,4,"[{'entity_group': 'PROBLEM', 'score': 0.983178..."
...,...,...,...,...
13545,03/06/97,0475,128,[]
13546,TD :,0475,129,[]
13547,03/10/97 2:14 P,0475,130,[]
13548,cc :,0475,131,[]


In [11]:
# for each file create <filename>.con
os.makedirs(val_data_path + os.sep + concept_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(val_data_path + os.sep + concept_folder_name + os.sep + "*.con")
for file in files:
    os.remove(file)

for i, row in tqdm(df.iterrows()):
    filename = row["filename"]
    text = row["text"]
    with open(val_data_path + os.sep + concept_folder_name + os.sep + filename + ".con", "a") as f:
        for output in row["outputs"]:
            line_num = row["line_number"] + 1
            # switch char indexes to word indexes
            start_char_idx = output["start"] # first char index
            end_char_idx = output["end"] # last char index
            start_word_idx = text[:start_char_idx].count(" ")
            end_word_idx = text[:end_char_idx].count(" ")
            # fill like this c="a workup" 27:2 27:3||t="test"
            f.write(
                f"c=\"{output['word']}\" {line_num}:{start_word_idx} {line_num}:{end_word_idx}||t=\"{output['entity_group'].lower()}\"\n"
            )
    


['data/val/concept/0006.con', 'data/val/concept/0013.con', 'data/val/concept/0017.con', 'data/val/concept/0018.con', 'data/val/concept/0022.con', 'data/val/concept/0025.con', 'data/val/concept/0030.con', 'data/val/concept/0033.con', 'data/val/concept/0034.con', 'data/val/concept/0038.con', 'data/val/concept/0041.con', 'data/val/concept/0045.con', 'data/val/concept/0049.con', 'data/val/concept/0053.con', 'data/val/concept/0061.con', 'data/val/concept/0065.con', 'data/val/concept/0066.con', 'data/val/concept/0069.con', 'data/val/concept/0070.con', 'data/val/concept/0077.con', 'data/val/concept/0082.con', 'data/val/concept/0085.con', 'data/val/concept/0093.con', 'data/val/concept/0101.con', 'data/val/concept/0105.con', 'data/val/concept/0117.con', 'data/val/concept/0121.con', 'data/val/concept/0122.con', 'data/val/concept/0126.con', 'data/val/concept/0133.con', 'data/val/concept/0134.con', 'data/val/concept/0137.con', 'data/val/concept/0138.con', 'data/val/concept/0141.con', 'data/val/con

13550it [00:46, 290.43it/s]
