[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/concepts_ner/concepts_ner_scibert.ipynb)

# Concepts detection

In [4]:
%%capture
!pip install seqeval transformers datasets spacy

In [5]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/medical_txt_parser/src/notebooks/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/medical_txt_parser/src/notebooks


In [6]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy
from transformers import (AutoModelForTokenClassification, 
                          AutoTokenizer, 
                          DataCollatorForTokenClassification,
                          pipeline,
                          TrainingArguments, 
                          Trainer)

assert transformers.__version__ >= "4.11.0"

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

/content/drive/MyDrive/medical_txt_parser/src
/content/drive/MyDrive/medical_txt_parser


In [7]:
train_data_path = "data/train"
val_data_path = "data/val"
processed_data_path = "data/processed"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"

task = "ner"  # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "giacomomiolo/electramed_base_scivocab_1M"
batch_size = 16


### Import data

In [8]:
text_files = glob.glob(train_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        filename = file.split("/")[-1].split(".")[0]
        concept = parse_concept(train_data_path + os.sep + concept_folder_name + os.sep +  filename + ".con")
        
        df = df.append(pd.DataFrame({"text": [text], "filename": [filename] , "concept": [concept]}), ignore_index=True)
df.head()

100%|██████████| 170/170 [00:00<00:00, 278.68it/s]


Unnamed: 0,text,filename,concept
0,Admission Date :\n2018-03-04\nDischarge Date :...,record-108,"{'concept_text': ['chills', 'mitochondrial myo..."
1,Admission Date:\n2011-02-08\nDischarge Date :\...,record-17,{'concept_text': ['community-acquired pneumoni...
2,Admission Date :\n2016-12-30\nDischarge Date :...,record-26,"{'concept_text': ['cardiopulmonary bypass', 'a..."
3,Admission Date :\n2012-01-20\nDischarge Date :...,record-53,"{'concept_text': ['his intravenous fluids', 'h..."
4,Admission Date :\n2013-05-18\nDischarge Date :...,record-37,"{'concept_text': ['mmp', 'a hip and arm fractu..."


In [9]:
concept_df = pd.DataFrame(columns=[ "filename"]+list(concept.keys()))
for i, file in df.iterrows():
    concept_dict = file["concept"]
    tmp = pd.DataFrame(concept_dict)
    tmp["filename"] = file["filename"]
    concept_df = concept_df.append(tmp, ignore_index=True)
concept_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,record-108,chills,96,30,96,30,problem
1,record-108,mitochondrial myopathy,27,1,27,2,problem
2,record-108,two transurethral resection of the prostate,20,14,20,19,treatment
3,record-108,chronic prostatitis,20,10,20,11,problem
4,record-108,supraventricular tachycardia,26,1,26,2,problem


### Dataset Preprocessing

In [10]:
# check start_line == end_line
concept_df[concept_df["start_line"] != concept_df["end_line"]]

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type


In the following we reformat the dataset to easily label the concepts.

Note:
* We replace multiple spaces with a single space.

In [11]:
# print a random text
preproc_data = {}

for i, row in tqdm(concept_df.iterrows()):
    filename = row["filename"]
    text = df[df["filename"] == filename]["text"].values[0]

    # text preprocessing
    text = text.lower()
    line = text.split("\n")[row["start_line"] - 1]  # NOTE: we assume that start_line == end_line
    line = " ".join(line.split()) # remove multiple spaces
    row["concept_text"] = " ".join(row["concept_text"].split()) # remove multiple spaces

    # find character index start and end of concept
    start_char_index = len(" ".join(line.split()[: row["start_word_number"]]))  # number of chars before concept
    if start_char_index > 0:
        start_char_index += 1
    end_char_index = start_char_index + len(row["concept_text"])
    assert (
        line[start_char_index:end_char_index] == row["concept_text"]
    ), f"concept_text doesn't match the found indexes. '{line[start_char_index:end_char_index]}' != '{row['concept_text']}'"

    line_id = filename + "_" + str(row["start_line"])
    if line_id not in preproc_data:
        preproc_data[line_id] = {
            "text": line,
            "problem": [],
            "test": [],
            "treatment": [],
            # use sets because the indices can repeat for various reasons
            "problem_indices_start": set(),
            "problem_indices_end": set(),
            "test_indices_start": set(),
            "test_indices_end": set(),
            "treatment_indices_start": set(),
            "treatment_indices_end": set(),
        }
    if row["concept_type"] == "problem":
        preproc_data[line_id]["problem"].append(row["concept_text"])
        preproc_data[line_id]["problem_indices_start"].add(start_char_index)
        preproc_data[line_id]["problem_indices_end"].add(end_char_index)
    elif row["concept_type"] == "test":
        preproc_data[line_id]["test"].append(row["concept_text"])
        preproc_data[line_id]["test_indices_start"].add(start_char_index)
        preproc_data[line_id]["test_indices_end"].add(end_char_index)
    elif row["concept_type"] == "treatment":
        preproc_data[line_id]["treatment"].append(row["concept_text"])
        preproc_data[line_id]["treatment_indices_start"].add(start_char_index)
        preproc_data[line_id]["treatment_indices_end"].add(end_char_index)

16525it [00:10, 1575.04it/s]


We extract the paragraph which contains the concept

In [12]:
# extract the paragraph which contains the concept
lines = text.split("\n")
parag_start_line, parag_end_line= 0,-1

for l in range(row["start_line"] - 1, -1, -1):
    if lines[l][-1] == ":":
        parag_start_line = l
        break
for l in range(row["end_line"], len(lines)):
    if lines[l][-1] == ":":
        parag_end_line = l
        break
print("\n".join(lines[parag_start_line:parag_end_line]))

disposition , follow up and instructions to patient :
please follow up with dr. wires on monday .
if you have continued fevers , worsening abdominal pain , discharge or other worrisome symptoms , please return to the emergency department .


In [13]:
preproc_df = pd.DataFrame(list(preproc_data.values()))
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,go to an emergency room if you experience symp...,"[chills, fevers, new and continuing nausea, vo...",[],[],"{96, 135, 169, 42, 124, 157}","{163, 132, 141, 50, 121, 188}",{},{},{},{}
1,3. mitochondrial myopathy .,[mitochondrial myopathy],[],[],{3},{25},{},{},{},{}
2,justin searle a 60-year-old man with a long hi...,[chronic prostatitis],[],[two transurethral resection of the prostate],{55},{74},{},{},{87},{130}
3,2. supraventricular tachycardia ( on a beta bl...,[supraventricular tachycardia],[],[a beta blocker],{3},{31},{},{},{37},{51}
4,turp,[],[],[turp],{},{},{},{},{0},{4}
...,...,...,...,...,...,...,...,...,...,...
7955,vicodin,[],[],[vicodin],{},{},{},{},{0},{7}
7956,abd pain/ pelvic pain,"[pelvic pain, abd pain/]",[],[],"{0, 10}","{9, 21}",{},{},{},{}
7957,"if you have continued fevers , worsening abdom...","[worsening abdominal pain, other worrisome sym...",[],[],"{58, 22, 71, 31}","{67, 28, 95, 55}",{},{},{},{}
7958,abdominal pain treatment rendered :,[abdominal pain],[],[],{0},{14},{},{},{},{}


In [14]:
# since no spans overlap, we can sort to get 1:1 matched index spans
# note that sets don't preserve insertion order

preproc_df["problem_indices_start"] = preproc_df["problem_indices_start"].apply(list).apply(sorted)
preproc_df["problem_indices_end"] = preproc_df["problem_indices_end"].apply(list).apply(sorted)
preproc_df["test_indices_start"] = preproc_df["test_indices_start"].apply(list).apply(sorted)
preproc_df["test_indices_end"] = preproc_df["test_indices_end"].apply(list).apply(sorted)
preproc_df["treatment_indices_start"] = preproc_df["treatment_indices_start"].apply(list).apply(sorted)
preproc_df["treatment_indices_end"] = preproc_df["treatment_indices_end"].apply(list).apply(sorted)
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,go to an emergency room if you experience symp...,"[chills, fevers, new and continuing nausea, vo...",[],[],"[42, 96, 124, 135, 157, 169]","[50, 121, 132, 141, 163, 188]",[],[],[],[]
1,3. mitochondrial myopathy .,[mitochondrial myopathy],[],[],[3],[25],[],[],[],[]
2,justin searle a 60-year-old man with a long hi...,[chronic prostatitis],[],[two transurethral resection of the prostate],[55],[74],[],[],[87],[130]
3,2. supraventricular tachycardia ( on a beta bl...,[supraventricular tachycardia],[],[a beta blocker],[3],[31],[],[],[37],[51]
4,turp,[],[],[turp],[],[],[],[],[0],[4]
...,...,...,...,...,...,...,...,...,...,...
7955,vicodin,[],[],[vicodin],[],[],[],[],[0],[7]
7956,abd pain/ pelvic pain,"[pelvic pain, abd pain/]",[],[],"[0, 10]","[9, 21]",[],[],[],[]
7957,"if you have continued fevers , worsening abdom...","[worsening abdominal pain, other worrisome sym...",[],[],"[22, 31, 58, 71]","[28, 55, 67, 95]",[],[],[],[]
7958,abdominal pain treatment rendered :,[abdominal pain],[],[],[0],[14],[],[],[],[]


We also add some lines that contain no concepts

In [15]:
# add lines with no concepts
NUMBER_LINE_PER_FILE = 4
no_concepts_df = []
# aggregate start_line by filename
list_lines = concept_df.groupby("filename")["start_line"].apply(set).to_dict()
for filename, line_nums in list_lines.items():
    # split file
    lines = df[df["filename"] == filename]["text"].values[0].split("\n")
    # get two random line that doesnt exist in line_nums
    line_num = np.random.choice(list(set(range(len(lines))) - line_nums), NUMBER_LINE_PER_FILE, replace=False)
    # add to no_concepts_df
    for l in line_num:
        if lines[l] != "":
            no_concepts_df.append({"filename": filename, "text": lines[l]})

no_concepts_df = pd.DataFrame(no_concepts_df)
for col in preproc_df:
    if col != "text":
        # initialize column of empty lists
        no_concepts_df[col] = [[] for _ in range(len(no_concepts_df))]
no_concepts_df


Unnamed: 0,filename,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,018636330_DH,She has no known drug allergy .,[],[],[],[],[],[],[],[],[]
1,018636330_DH,TR :,[],[],[],[],[],[],[],[],[]
2,018636330_DH,TD :,[],[],[],[],[],[],[],[],[]
3,018636330_DH,Discharge Summary,[],[],[],[],[],[],[],[],[]
4,026350193_RWH,DIS,[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
662,record-83,"5. Fluticasone 50 mcg / Actuation Aerosol , Sp...",[],[],[],[],[],[],[],[],[]
663,record-84,Service :,[],[],[],[],[],[],[],[],[]
664,record-84,1959-12-09,[],[],[],[],[],[],[],[],[]
665,record-84,PHYSICAL EXAMINATION ON ADMISSION :,[],[],[],[],[],[],[],[],[]


In [16]:
# add lines with no concepts to preproc_df
preproc_df = preproc_df.append(no_concepts_df.drop(columns=["filename"]), ignore_index=True)
preproc_df

Unnamed: 0,text,problem,test,treatment,problem_indices_start,problem_indices_end,test_indices_start,test_indices_end,treatment_indices_start,treatment_indices_end
0,go to an emergency room if you experience symp...,"[chills, fevers, new and continuing nausea, vo...",[],[],"[42, 96, 124, 135, 157, 169]","[50, 121, 132, 141, 163, 188]",[],[],[],[]
1,3. mitochondrial myopathy .,[mitochondrial myopathy],[],[],[3],[25],[],[],[],[]
2,justin searle a 60-year-old man with a long hi...,[chronic prostatitis],[],[two transurethral resection of the prostate],[55],[74],[],[],[87],[130]
3,2. supraventricular tachycardia ( on a beta bl...,[supraventricular tachycardia],[],[a beta blocker],[3],[31],[],[],[37],[51]
4,turp,[],[],[turp],[],[],[],[],[0],[4]
...,...,...,...,...,...,...,...,...,...,...
8622,"5. Fluticasone 50 mcg / Actuation Aerosol , Sp...",[],[],[],[],[],[],[],[],[]
8623,Service :,[],[],[],[],[],[],[],[],[]
8624,1959-12-09,[],[],[],[],[],[],[],[],[]
8625,PHYSICAL EXAMINATION ON ADMISSION :,[],[],[],[],[],[],[],[],[]


In [18]:
# save to JSON to then import into Dataset object
preproc_df.to_json(os.path.join(processed_data_path, "dataset-3.jsonl"), orient="records", lines=True)

In [20]:
dataset = load_dataset("json", data_files=os.path.join(processed_data_path, "dataset-3.jsonl"))
dataset

Using custom data configuration default-df6ad188268fb31f
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-df6ad188268fb31f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 8627
    })
})

In [21]:
# no train-test provided, so we create our own
dataset = dataset["train"].train_test_split(test_size =100)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 8527
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end'],
        num_rows: 100
    })
})

### Token Labeling

In [22]:
label_list = ['O', 'B-PROBLEM', 'I-PROBLEM', 'B-TEST', 'I-TEST', 'B-TREATMENT', 'I-TREATMENT']

custom_seq = Sequence(feature=ClassLabel(num_classes=len(label_list),
                                         names=label_list,
                                         names_file=None, id=None), length=-1, id=None)

dataset["train"].features["ner_tags"] = custom_seq
dataset["test"].features["ner_tags"] = custom_seq
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'ner_tags'],
        num_rows: 8527
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'ner_tags'],
        num_rows: 100
    })
})

In [23]:
from tqdm.notebook import tqdm
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [24]:
def generate_row_labels(row, verbose=False):
    """ Given a row from the consolidated `Ade_corpus_v2_drug_ade_relation` dataset, 
    generates BIO tags for drug and effect entities. 
    
    """

    text = row["text"]

    labels = []
    label = "O"
    prefix = ""
    
    # while iterating through tokens, increment to traverse all drug and effect spans
    problem_index = 0
    effect_index = 0
    test_index = 0
    treatment_index = 0
    
    tokens = tokenizer(text, return_offsets_mapping=True)

    for n in range(len(tokens["input_ids"])):
        offset_start, offset_end = tokens["offset_mapping"][n]

        # should only happen for [CLS] and [SEP]
        if offset_end - offset_start == 0:
            labels.append(-100)
            continue
        
        if problem_index < len(row["problem_indices_start"]) and offset_start == row["problem_indices_start"][problem_index]:
            label = "PROBLEM"
            prefix = "B-"

        elif test_index < len(row["test_indices_start"]) and offset_start == row["test_indices_start"][test_index]:
            label = "TEST"
            prefix = "B-"

        elif treatment_index < len(row["treatment_indices_start"]) and offset_start == row["treatment_indices_start"][treatment_index]:
            label = "TREATMENT"
            prefix = "B-"
        
        labels.append(label_list.index(f"{prefix}{label}"))
            
        if problem_index < len(row["problem_indices_end"]) and offset_end == row["problem_indices_end"][problem_index]:
            label = "O"
            prefix = ""
            problem_index += 1
            
        elif test_index < len(row["test_indices_end"]) and offset_end == row["test_indices_end"][test_index]:
            label = "O"
            prefix = ""
            test_index += 1

        elif treatment_index < len(row["treatment_indices_end"]) and offset_end == row["treatment_indices_end"][treatment_index]:
            label = "O"
            prefix = ""
            treatment_index += 1

        # need to transition "inside" if we just entered an entity
        if prefix == "B-":
            prefix = "I-"
    
    if verbose:
        pprint(row)
        orig = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
        for n in range(len(labels)):
            print(orig[n], labels[n])
    tokens["labels"] = labels
    
    return tokens

In [25]:
# testing out...

generate_row_labels(dataset["train"][np.random.randint(0, len(dataset["train"]))], verbose=True)

{'problem': ['mild tenderness', 'a minor stepoff', 'continued ecchymosis'],
 'problem_indices_end': [56, 110, 130],
 'problem_indices_start': [36, 95, 115],
 'test': ['palpation', 'prior examinations', 'prior examination'],
 'test_indices_end': [89, 143, 177],
 'test_indices_start': [72, 134, 159],
 'text': 'the left clavicle was noted to have continued ecchymosis decreased '
         'from prior examination with a minor stepoff and mild tenderness to '
         'palpation decreased from prior examinations .',
 'treatment': [],
 'treatment_indices_end': [],
 'treatment_indices_start': []}
[CLS] -100
the 0
left 0
cla 0
##vic 0
##le 0
was 0
noted 0
to 0
have 0
continued 1
ecc 2
##hy 2
##mos 2
##is 2
decreased 0
from 0
prior 3
examination 4
with 0
a 1
minor 2
step 2
##off 2
and 0
mild 1
tender 2
##ness 2
to 0
palp 3
##ation 4
decreased 0
from 0
prior 3
examinations 4
. 0
[SEP] -100


{'input_ids': [102, 111, 2101, 3895, 9617, 143, 241, 3742, 147, 360, 6887, 14142, 1844, 14598, 129, 2664, 263, 1979, 4373, 190, 106, 5118, 1371, 2417, 137, 6173, 22647, 1076, 147, 21348, 150, 2664, 263, 1979, 12991, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (4, 8), (9, 12), (12, 15), (15, 17), (18, 21), (22, 27), (28, 30), (31, 35), (36, 45), (46, 49), (49, 51), (51, 54), (54, 56), (57, 66), (67, 71), (72, 77), (78, 89), (90, 94), (95, 96), (97, 102), (103, 107), (107, 110), (111, 114), (115, 119), (120, 126), (126, 130), (131, 133), (134, 138), (138, 143), (144, 153), (154, 158), (159, 164), (165, 177), (178, 179), (0, 0)], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 3, 4, 0, 1, 2, 2, 2, 0, 1, 2, 2, 0, 

In [26]:
labeled_dataset = dataset.map(generate_row_labels)
labeled_dataset

  0%|          | 0/8527 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 8527
    })
    test: Dataset({
        features: ['text', 'problem', 'test', 'treatment', 'problem_indices_start', 'problem_indices_end', 'test_indices_start', 'test_indices_end', 'treatment_indices_start', 'treatment_indices_end', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
        num_rows: 100
    })
})

### Electramed Model Fine-Tuning

In [31]:
# Local model
label_names = ["O", "B-PROBLEM", "I-PROBLEM", "B-TEST", "I-TEST", "B-TREATMENT", "I-TREATMENT"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), label2id=label2id, id2label=id2label)

loading configuration file https://huggingface.co/giacomomiolo/electramed_base_scivocab_1M/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/c482a39d4ee91eb23933abaff8eba8ae535cd80563e3b66caa41934b37d52138.1c949c4a68e30225f35ee7db82195ffff60db502f0c9b37ed1cc866f8708614a
Model config ElectraConfig {
  "_name_or_path": "giacomomiolo/electramed_base_scivocab_1M",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PROBLEM",
    "2": "I-PROBLEM",
    "3": "B-TEST",
    "4": "I-TEST",
    "5": "B-TREATMENT",
    "6": "I-TREATMENT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-PROBLEM": 1,
    "B-TEST": 3,
    "B-TREATMENT": 5,
    "I-PROBLEM": 2,
    "I-TEST": 4,
    "I-TREATMENT": 6,
    "O": 0
  },
  "layer_norm_

In [33]:
model_name = model_checkpoint.split("/")[-1]
model_folder_name = f"{model_name}-finetuned-{task}-3"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0,
    logging_steps=1
)
data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

In [35]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, test, test_indices_end, problem_indices_start, text, treatment_indices_end, problem_indices_end, treatment, test_indices_start, problem, treatment_indices_start.
***** Running training *****
  Num examples = 8527
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2665


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1619,0.327377,0.806452,0.796178,0.801282,0.918673
2,0.0848,0.301088,0.803797,0.808917,0.806349,0.931034
3,0.0293,0.315097,0.807453,0.828025,0.81761,0.927131
4,0.014,0.311621,0.805031,0.815287,0.810127,0.92648
5,0.0692,0.336684,0.832258,0.821656,0.826923,0.930384


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, test, test_indices_end, problem_indices_start, text, treatment_indices_end, problem_indices_end, treatment, test_indices_start, problem, treatment_indices_start.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, test, test_indices_end, problem_indices_start, text, treatment_indices_end, problem_indices_end, treatment, test_indices_start, problem, treatment_indices_start.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, test, test_indices_end, problem_indices_start, 

TrainOutput(global_step=2665, training_loss=0.14617769830750116, metrics={'train_runtime': 464.6976, 'train_samples_per_second': 91.748, 'train_steps_per_second': 5.735, 'total_flos': 1100612807215086.0, 'train_loss': 0.14617769830750116, 'epoch': 5.0})

In [37]:
# save model
trainer.save_model(f"models/{model_folder_name}")

Saving model checkpoint to models/electramed_base_scivocab_1M-finetuned-ner-3
Configuration saved in models/electramed_base_scivocab_1M-finetuned-ner-3/config.json
Model weights saved in models/electramed_base_scivocab_1M-finetuned-ner-3/pytorch_model.bin
tokenizer config file saved in models/electramed_base_scivocab_1M-finetuned-ner-3/tokenizer_config.json
Special tokens file saved in models/electramed_base_scivocab_1M-finetuned-ner-3/special_tokens_map.json


In [38]:
predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, test, test_indices_end, problem_indices_start, text, treatment_indices_end, problem_indices_end, treatment, test_indices_start, problem, treatment_indices_start.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'PROBLEM': {'f1': 0.8108108108108109,
  'number': 69,
  'precision': 0.759493670886076,
  'recall': 0.8695652173913043},
 'TEST': {'f1': 0.8292682926829269,
  'number': 45,
  'precision': 0.918918918918919,
  'recall': 0.7555555555555555},
 'TREATMENT': {'f1': 0.8536585365853658,
  'number': 43,
  'precision': 0.8974358974358975,
  'recall': 0.813953488372093},
 'overall_accuracy': 0.9303838646714379,
 'overall_f1': 0.8269230769230769,
 'overall_precision': 0.832258064516129,
 'overall_recall': 0.821656050955414}

---
## See Model Outputs

We load our fine-tuned model into a `pipeline` object to run arbitrary input against it.

In [39]:
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device=0)

In [40]:
def visualize_entities(sentence):
    tokens = effect_ner_model(sentence)
    entities = []

    for token in tokens:
        label = int(token["entity"][-1])
        if label != 0:
            token["label"] = label_list[label]
            entities.append(token)

    params = [{"text": sentence, "ents": entities, "title": None}]

    html = displacy.render(
        params,
        style="ent",
        manual=True,
        jupyter=True,
        options={
            "colors": {
                "B-PROBLEM": "#f08080",
                "I-PROBLEM": "#f08080",
                "B-TEST": "#9bddff",
                "I-TEST": "#9bddff",
                "B-TREATMENT": "#ffdab9",
                "I-TREATMENT": "#ffdab9",
            },
        },
    )


In [42]:
# pick 5 random sentences from the test set
for i in range(5):
    index = np.random.randint(0, len(labeled_dataset["test"]))
    visualize_entities(labeled_dataset["test"][index]["text"])
    print(f"Text: {labeled_dataset['test'][index]['text']}")
    print(f"Problems: {labeled_dataset['test'][index]['problem']}")
    print(f"Tests: {labeled_dataset['test'][index]['test']}")
    print(f"Treatments: {labeled_dataset['test'][index]['treatment']}")
    print(f"{'*' * 50}\n")

ValueError: ignored

## Evaluate the model

In [54]:

# Local model
label_names = ["O", "B-PROBLEM", "I-PROBLEM", "B-TEST", "I-TEST", "B-TREATMENT", "I-TREATMENT"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model_checkpoint = f"models/{model_folder_name}"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, label2id=label2id, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer, device = 0)


loading configuration file models/electramed_base_scivocab_1M-finetuned-ner-3/config.json
Model config ElectraConfig {
  "_name_or_path": "models/electramed_base_scivocab_1M-finetuned-ner-3",
  "architectures": [
    "ElectraForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PROBLEM",
    "2": "I-PROBLEM",
    "3": "B-TEST",
    "4": "I-TEST",
    "5": "B-TREATMENT",
    "6": "I-TREATMENT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-PROBLEM": 1,
    "B-TEST": 3,
    "B-TREATMENT": 5,
    "I-PROBLEM": 2,
    "I-TEST": 4,
    "I-TREATMENT": 6,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute"

In [55]:
trainer = Trainer(
    model,
    args,
    train_dataset=labeled_dataset["train"],
    eval_dataset=labeled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

predictions, labels, _ = trainer.predict(labeled_dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `ElectraForTokenClassification.forward` and have been ignored: offset_mapping, test, test_indices_end, problem_indices_start, text, treatment_indices_end, problem_indices_end, treatment, test_indices_start, problem, treatment_indices_start.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'PROBLEM': {'f1': 0.8108108108108109,
  'number': 69,
  'precision': 0.759493670886076,
  'recall': 0.8695652173913043},
 'TEST': {'f1': 0.8292682926829269,
  'number': 45,
  'precision': 0.918918918918919,
  'recall': 0.7555555555555555},
 'TREATMENT': {'f1': 0.8536585365853658,
  'number': 43,
  'precision': 0.8974358974358975,
  'recall': 0.813953488372093},
 'overall_accuracy': 0.9303838646714379,
 'overall_f1': 0.8269230769230769,
 'overall_precision': 0.832258064516129,
 'overall_recall': 0.821656050955414}

In [56]:
text_files = glob.glob(val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
df = df[df.text != ""]
# reset index
df = df.reset_index(drop=True)
df

  0%|          | 0/128 [00:00<?, ?it/s]

Unnamed: 0,text,filename,line_number
0,006544894,0006,0
1,NVH,0006,1
2,65104826,0006,2
3,1/2/2004 12:00:00 AM,0006,3
4,"VT s/p cardiac cath , stent and amp ; amio loa...",0006,4
...,...,...,...
13545,03/06/97,0475,128
13546,TD :,0475,129
13547,03/10/97 2:14 P,0475,130
13548,cc :,0475,131


In [57]:
lines = df["text"].values
outputs = effect_ner_model(list(lines), aggregation_strategy ="simple") #TODO: add verbose

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [60]:
os.mkdir ("predictions")

In [61]:
# save outputs as pkl
import joblib
predictions_path = "predictions"
joblib.dump(outputs, os.path.join(predictions_path, "outputs-3-val.pkl"))

['predictions/outputs-3-val.pkl']

In [63]:
import joblib
outputs = joblib.load(os.path.join(predictions_path, "outputs-3-val.pkl"))
df["outputs"] = outputs
df

Unnamed: 0,text,filename,line_number,outputs
0,006544894,0006,0,[]
1,NVH,0006,1,[]
2,65104826,0006,2,[]
3,1/2/2004 12:00:00 AM,0006,3,[]
4,"VT s/p cardiac cath , stent and amp ; amio loa...",0006,4,"[{'entity_group': 'PROBLEM', 'score': 0.991923..."
...,...,...,...,...
13545,03/06/97,0475,128,[]
13546,TD :,0475,129,[]
13547,03/10/97 2:14 P,0475,130,[]
13548,cc :,0475,131,[]


In [64]:
# for each file create <filename>.con
os.makedirs(val_data_path + os.sep + concept_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(val_data_path + os.sep + concept_folder_name + os.sep + "*.con")
for file in files:
    os.remove(file)

for i, row in tqdm(df.iterrows()):
    filename = row["filename"]
    text = row["text"]
    with open(val_data_path + os.sep + concept_folder_name + os.sep + filename + ".con", "a") as f:
        for output in row["outputs"]:
            line_num = row["line_number"] + 1
            # switch char indexes to word indexes
            start_char_idx = output["start"] # first char index
            end_char_idx = output["end"] # last char index
            start_word_idx = text[:start_char_idx].count(" ")
            end_word_idx = text[:end_char_idx].count(" ")
            # fill like this c="a workup" 27:2 27:3||t="test"
            f.write(
                f"c=\"{output['word']}\" {line_num}:{start_word_idx} {line_num}:{end_word_idx}||t=\"{output['entity_group'].lower()}\"\n"
            )
    


0it [00:00, ?it/s]

In [67]:
!zip -r electamed_val_concept.zip /content/drive/MyDrive/medical_txt_parser/data/val/concept 

updating: content/drive/MyDrive/medical_txt_parser/data/val/concept/ (stored 0%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0006.con (deflated 70%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0013.con (deflated 66%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0017.con (deflated 69%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0018.con (deflated 68%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0022.con (deflated 69%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0025.con (deflated 65%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0030.con (deflated 64%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0033.con (deflated 70%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0034.con (deflated 66%)
  adding: content/drive/MyDrive/medical_txt_parser/data/val/concept/0038.con (deflated 68%)