[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Relations classification

Based of: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_xnli.py

In [1]:
!nvidia-smi

Wed Jan 26 14:57:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   42C    P0    27W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [17]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

ModuleNotFoundError: No module named 'google.colab'

In [2]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import random

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import datasets
import numpy as np
from datasets import load_dataset, load_metric , Dataset
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from transformers import pipeline

require_version("datasets>=1.8.0", "To fix: pip install --upgrade datasets")

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

/home/jupyter/medical_txt_parser/src/notebooks
/home/jupyter/medical_txt_parser/src
/home/jupyter/medical_txt_parser


In [18]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"
nli_data_path = "data/nli"
re_data_path = "data/re"

# model args
model_name_or_path = "allenai/scibert_scivocab_uncased" # "gsarti/scibert-nli"
cache_dir = None
model_revision = None 
tokenizer_name = model_name_or_path
do_lower_case = None
use_fast_tokenizer = True
fp16 = True

# data args
pad_to_max_length = None
max_seq_length = None

set_seed(42)


### Import data

In [19]:
relations_df = pd.read_csv(re_data_path + os.sep + "re_scibert_data.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,<< C5-6 disc herniation >> with [[ cord compre...,3
1,<< C5-6 disc herniation >> with cord compressi...,3
2,[[ C5-6 disc herniation ]] with << cord compre...,0
3,C5-6 disc herniation with << cord compression ...,0
4,[[ C5-6 disc herniation ]] with cord compressi...,0
...,...,...
15338,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
15339,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
15340,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
15341,The patient was told he could return to work a...,2


In [5]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=0.2, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 12274
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 3069
 }))

In [6]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 0    9780
1     793
2     707
3     604
4     147
5     133
6      50
7      41
8      19
Name: label, dtype: int64
val_df labels: 0    2445
1     199
2     177
3     151
4      37
5      33
6      12
7      10
8       5
Name: label, dtype: int64


In [7]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [8]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/13 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Sample 10476 of the training set: {'text': 'We recommend daily physical therapy and occupational therapy to maximize [[ functional capacity ]] with optimization of pharmacological regimen for spasticity including << Baclofen >> , Valium and adequate analgesia .', 'label': 0, 'input_ids': [102, 185, 4463, 4122, 2121, 2223, 137, 11069, 2223, 147, 9889, 260, 260, 2131, 2900, 1901, 1901, 190, 3378, 131, 10559, 11685, 168, 19756, 20177, 208, 1471, 962, 962, 7555, 28341, 1374, 1374, 422, 491, 888, 137, 6443, 19539, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 1824 of the training set: {'text': 'If you develop trouble breathing , << worsening pain >> , vomiting or [[ any other concerns ]] please return to the ED for further evaluation .

In [9]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [10]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [11]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

In [13]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

class_weights = [len(train_df)/ (len(train_df[train_df["label"] == i])*len(id2label)) for i in id2label.keys()]
class_weights = torch.tensor(class_weights).log1p()

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [14]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-1"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [18]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 12274
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1920


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,1.8127,1.747089,0.229627,0.228494,0.341526,0.536657
2,1.5157,0.95037,0.426305,0.382462,0.641518,0.617465
3,0.4316,0.747373,0.519523,0.451177,0.771378,0.655914
4,0.5179,0.733136,0.582533,0.551681,0.763623,0.748778
5,0.242,0.711711,0.594272,0.524502,0.794992,0.703161
6,0.2069,0.684572,0.617967,0.538753,0.808513,0.769958
7,0.2417,0.732128,0.627741,0.551357,0.793295,0.786901
8,0.1666,0.778358,0.633295,0.560561,0.782173,0.79798
9,0.144,0.755815,0.643564,0.565927,0.802104,0.811013
10,0.0753,0.767523,0.645641,0.570303,0.799726,0.805474


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3069
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3069
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3069
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3069
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClas

In [19]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-1
Configuration saved in models/scibert_scivocab_uncased-re-1/config.json
Model weights saved in models/scibert_scivocab_uncased-re-1/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-1/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-1/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  total_flos               = 12417534GF
  train_loss               =     0.5999
  train_runtime            = 0:44:15.85
  train_samples            =      12274
  train_samples_per_second =     46.215
  train_steps_per_second   =      0.723


In [20]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3069
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.8055
  eval_f1                 =     0.6456
  eval_loss               =     0.7675
  eval_precision          =     0.5703
  eval_recall             =     0.7997
  eval_runtime            = 0:00:23.45
  eval_samples            =       3069
  eval_samples_per_second =    130.821
  eval_steps_per_second   =      2.046


In [21]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 3069
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.98      0.78      0.87      2445
        TeRP       0.66      0.95      0.78       199
        TrAP       0.61      0.90      0.73       177
         PIP       0.31      0.86      0.46       151
        TrCP       0.53      0.84      0.65        37
        TeCP       0.51      0.73      0.60        33
       TrNAP       0.77      0.83      0.80        12
        TrIP       0.26      0.70      0.38        10
        TrWP       0.50      0.60      0.55         5

    accuracy                           0.81      3069
   macro avg       0.57      0.80      0.65      3069
weighted avg       0.89      0.81      0.83      3069



In [22]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 12274
  Batch size = 64


              precision    recall  f1-score   support

       Other       1.00      0.83      0.91      9780
        TeRP       0.76      1.00      0.86       793
        TrAP       0.69      0.99      0.81       707
         PIP       0.40      0.99      0.57       604
        TrCP       0.61      1.00      0.76       147
        TeCP       0.66      1.00      0.79       133
       TrNAP       0.75      0.98      0.85        50
        TrIP       0.57      1.00      0.73        41
        TrWP       0.68      1.00      0.81        19

    accuracy                           0.86     12274
   macro avg       0.68      0.98      0.79     12274
weighted avg       0.92      0.86      0.88     12274



## Evaluate the model

In [20]:
model_folder_name

'scibert_scivocab_uncased-re-1'

In [21]:
!ls models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
re					       scibert_scivocab_cased-re-2
scibert-nli-ast-clf			       scibert_scivocab_cased-re-3
scibert-nli-ast-clf-1			       scibert_scivocab_uncased-re-1
scibert-nli-finetuned-nli		       scibert_scivocab_uncased-re-P_P
scibert-nli-finetuned-nli-2022-01-20_20-58-00  scibert_scivocab_uncased-re-Te_P
scibert-nli-finetuned-nli-2022-01-20_21-38-12  scibert_scivocab_uncased-re-Tr_P
scibert_scivocab_cased-re-1


In [23]:
# Local model
# label_list = ['TeCP', 'TrIP', 'TrNAP', 'TrAP', 'TrCP', 'TrWP', 'TeRP', 'PIP']

# id2label = {i: label for i, label in enumerate(label_list)}
# label2id = {v: k for k, v in id2label.items()}

model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-1"
model_checkpoint = f"models/{model_folder_name}"
model = AutoModelForSequenceClassification.from_pretrained(f"models/{model_folder_name}", label2id=label2id, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(f"models/{model_folder_name}")

# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None
    

trainer = Trainer(
    model=model,
    # args=args,
    # train_dataset=train_dataset ,
    # eval_dataset=eval_dataset ,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

loading configuration file models/scibert_scivocab_uncased-re-1/config.json
Model config BertConfig {
  "_name_or_path": "models/scibert_scivocab_uncased-re-1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "re",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Other",
    "1": "TeRP",
    "2": "TrAP",
    "3": "PIP",
    "4": "TrCP",
    "5": "TeCP",
    "6": "TrNAP",
    "7": "TrIP",
    "8": "TrWP"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Other": 0,
    "PIP": 3,
    "TeCP": 5,
    "TeRP": 1,
    "TrAP": 2,
    "TrCP": 4,
    "TrIP": 7,
    "TrNAP": 6,
    "TrWP": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_typ

In [26]:
val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt"

'data/val/txt/*.txt'

In [28]:
val_data_path + os.sep + concept_folder_name + os.sep + fname + ".con"

'data/val/concept/0006.con'

In [37]:
test_data_path = "data/test"

In [38]:
text_files = glob.glob(test_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
# df = df[df.text != ""]
# # reset index
# df = df.reset_index(drop=True)

# add concepts
rel_df = pd.DataFrame()
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(test_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")
    
    concept_df = pd.DataFrame(concept_dict).drop(columns=["end_line"])
    test_concept_df = concept_df[concept_df["concept_type"] == "test"]
    problem_concept_df = concept_df[concept_df["concept_type"] == "problem"]
    treatment_concept_df = concept_df[concept_df["concept_type"] == "treatment"]

    # class test --> problem
    test_problem_df = pd.merge(test_concept_df, problem_concept_df, how="inner", on="start_line")

    # class treatment --> problem
    treatment_problem_df = pd.merge(treatment_concept_df, problem_concept_df, how="inner", on="start_line")

    # class problem --> problem
    problem_problem_df = pd.merge(problem_concept_df, problem_concept_df, how="inner", on="start_line")
    problem_problem_df = problem_problem_df[problem_problem_df["concept_text_x"] != problem_problem_df["concept_text_y"]] # TODO: remove duplicates ?

    tmp = pd.concat([test_problem_df, treatment_problem_df, problem_problem_df], axis=0)
    tmp["filename"] = fname
    rel_df = rel_df.append(tmp, ignore_index=True)
            
rel_df = rel_df.sort_values(by=["filename", "start_line"])
rel_df = rel_df.reset_index(drop=True)

100%|██████████| 128/128 [00:00<00:00, 1055.43it/s]
100%|██████████| 128/128 [00:01<00:00, 81.54it/s]


In [39]:
rel_df

Unnamed: 0,concept_text_x,start_line,start_word_number_x,end_word_number_x,concept_type_x,concept_text_y,start_word_number_y,end_word_number_y,concept_type_y,filename
0,Mesenteric angiograpm,18,0,1,treatment,bleeding vessel,6,7,problem,0001
1,coil embolization,18,3,4,treatment,bleeding vessel,6,7,problem,0001
2,cabg,22,13,13,treatment,dm2,9,9,problem,0001
3,cabg,22,13,13,treatment,cad,11,11,problem,0001
4,cabg,22,13,13,treatment,DVT,15,15,problem,0001
...,...,...,...,...,...,...,...,...,...,...
18127,Saline wet to dry dressing,109,0,4,treatment,penis and pelvis decubiti,10,13,problem,0477
18128,40% humidified oxygen,109,15,17,treatment,penis and pelvis decubiti,10,13,problem,0477
18129,Fluconazole,109,19,19,treatment,penis and pelvis decubiti,10,13,problem,0477
18130,a surgical procedure,132,18,20,treatment,his constrictive pericarditis,13,15,problem,0477


In [40]:
rel_df = rel_df[[ "filename", "start_line", "concept_text_x", "concept_text_y", "concept_type_x", "concept_type_y", "start_word_number_x", "end_word_number_x", "start_word_number_y", "end_word_number_y"]]
rel_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y
0,0001,18,Mesenteric angiograpm,bleeding vessel,treatment,problem,0,1,6,7
1,0001,18,coil embolization,bleeding vessel,treatment,problem,3,4,6,7
2,0001,22,cabg,dm2,treatment,problem,13,13,9,9
3,0001,22,cabg,cad,treatment,problem,13,13,11,11
4,0001,22,cabg,DVT,treatment,problem,13,13,15,15
...,...,...,...,...,...,...,...,...,...,...
18127,0477,109,Saline wet to dry dressing,penis and pelvis decubiti,treatment,problem,0,4,10,13
18128,0477,109,40% humidified oxygen,penis and pelvis decubiti,treatment,problem,15,17,10,13
18129,0477,109,Fluconazole,penis and pelvis decubiti,treatment,problem,19,19,10,13
18130,0477,132,a surgical procedure,his constrictive pericarditis,treatment,problem,18,20,13,15


In [41]:
# make predict dataset
def preprocess_text(row):
    # find line
    line =  df[(df["filename"] == row["filename"]) & (df["line_number"] == row["start_line"]-1)]["text"].values[0]
    # line = line.lower()
    line = " ".join(line.split()) # remove multiple spaces

    concept_text_x = "<< "+ " ".join(line.split()[row["start_word_number_x"]:row["end_word_number_x"]+1]) + " >>"
    concept_text_y = "[[ " + " ".join(line.split()[row["start_word_number_y"]:row["end_word_number_y"]+1]) + " ]]"
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]

    if row["start_word_number_x"] > row["start_word_number_y"]:
        concept_text_x, concept_text_y = concept_text_y, concept_text_x
        start_word_number_x, start_word_number_y = start_word_number_y, start_word_number_x
        end_word_number_x, end_word_number_y = end_word_number_y, end_word_number_x
    text = " ".join(line.split()[: start_word_number_x] + [concept_text_x] + line.split()[end_word_number_x+1: start_word_number_y] + [concept_text_y] + line.split()[end_word_number_y+1:])

    row["text"] = text
    return row

predict_df = rel_df.apply(preprocess_text, axis=1)
predict_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,text
0,0001,18,Mesenteric angiograpm,bleeding vessel,treatment,problem,0,1,6,7,<< Mesenteric angiograpm >> w/ coil embolizati...
1,0001,18,coil embolization,bleeding vessel,treatment,problem,3,4,6,7,Mesenteric angiograpm w/ << coil embolization ...
2,0001,22,cabg,dm2,treatment,problem,13,13,9,9,"HPI: Pt is a 71 y/o male with h/o [[ dm2 ]] , ..."
3,0001,22,cabg,cad,treatment,problem,13,13,11,11,"HPI: Pt is a 71 y/o male with h/o dm2 , [[ cad..."
4,0001,22,cabg,DVT,treatment,problem,13,13,15,15,"HPI: Pt is a 71 y/o male with h/o dm2 , cad s/..."
...,...,...,...,...,...,...,...,...,...,...,...
18127,0477,109,Saline wet to dry dressing,penis and pelvis decubiti,treatment,problem,0,4,10,13,<< Saline wet to dry dressing >> changes three...
18128,0477,109,40% humidified oxygen,penis and pelvis decubiti,treatment,problem,15,17,10,13,Saline wet to dry dressing changes three times...
18129,0477,109,Fluconazole,penis and pelvis decubiti,treatment,problem,19,19,10,13,Saline wet to dry dressing changes three times...
18130,0477,132,a surgical procedure,his constrictive pericarditis,treatment,problem,18,20,13,15,The patient was seen at Ph University Of Medic...


In [42]:
predict_dataset = Dataset.from_pandas(predict_df, preserve_index=False)
# predict_dataset = predict_dataset.select(range(10))
predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on prediction dataset",
            )
predict_dataset

Running tokenizer on prediction dataset:   0%|          | 0/19 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['filename', 'start_line', 'concept_text_x', 'concept_text_y', 'concept_type_x', 'concept_type_y', 'start_word_number_x', 'end_word_number_x', 'start_word_number_y', 'end_word_number_y', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18132
})

In [43]:
 predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
 predictions = np.argmax(predictions, axis=1)
 len(predictions)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: end_word_number_y, start_word_number_x, concept_type_x, start_line, concept_type_y, start_word_number_y, text, concept_text_y, concept_text_x, end_word_number_x, filename.
***** Running Prediction *****
  Num examples = 18132
  Batch size = 16


18132

In [44]:
rel_df["prediction"] = [id2label[label] for label in predictions]
rel_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,prediction
0,0001,18,Mesenteric angiograpm,bleeding vessel,treatment,problem,0,1,6,7,TeRP
1,0001,18,coil embolization,bleeding vessel,treatment,problem,3,4,6,7,TrAP
2,0001,22,cabg,dm2,treatment,problem,13,13,9,9,Other
3,0001,22,cabg,cad,treatment,problem,13,13,11,11,TrAP
4,0001,22,cabg,DVT,treatment,problem,13,13,15,15,Other
...,...,...,...,...,...,...,...,...,...,...,...
18127,0477,109,Saline wet to dry dressing,penis and pelvis decubiti,treatment,problem,0,4,10,13,TrAP
18128,0477,109,40% humidified oxygen,penis and pelvis decubiti,treatment,problem,15,17,10,13,TrAP
18129,0477,109,Fluconazole,penis and pelvis decubiti,treatment,problem,19,19,10,13,TrAP
18130,0477,132,a surgical procedure,his constrictive pericarditis,treatment,problem,18,20,13,15,TrAP


In [45]:
# for each file create <filename>.con
os.makedirs(test_data_path + os.sep + rel_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(test_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
for file in files:
    os.remove(file)

for i, row in tqdm(rel_df.iterrows()):
    filename = row["filename"]
    concept_text_x = row["concept_text_x"]
    concept_text_y = row["concept_text_y"]
    concept_type_x = row["concept_type_x"]
    concept_type_y = row["concept_type_y"]
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]
    line_number = row["start_line"]
    prediction = row["prediction"]
    if prediction != "Other":
        with open(test_data_path + os.sep + rel_folder_name + os.sep + filename + ".rel", "a") as f:
            # fill like this c="pefusion imaging" 19:6 19:7||r="TeRP"||c="perfusion defects" 19:12 19:13
            f.write(
                f"c=\"{concept_text_x}\" {line_number}:{start_word_number_x} {line_number}:{end_word_number_x}||r=\"{prediction}\"||c=\"{concept_text_y}\" {line_number}:{start_word_number_y} {line_number}:{end_word_number_y}\n"
            )
    


18132it [00:01, 12462.35it/s]


In [46]:
rel_files = glob.glob(test_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
rel_files = [f.split(os.sep)[-1][:-4] for f in rel_files]
txt_files = [f.split(os.sep)[-1][:-4] for f in text_files]
# find missing files
missing_files = set(txt_files) - set(rel_files)
missing_files


{'0046', '0114', '0154', '0182', '0226', '0265', '0274', '0413'}

In [47]:
# create empty files for missing files
for f in missing_files:
    with open(test_data_path + os.sep + rel_folder_name + os.sep + f + ".rel", "w") as f:
        f.write("")

In [48]:
rel_files = glob.glob(test_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
rel_files = [f.split(os.sep)[-1][:-4] for f in rel_files]
txt_files = [f.split(os.sep)[-1][:-4] for f in text_files]
# find missing files
missing_files = set(txt_files) - set(rel_files)
missing_files

set()

In [33]:
!cat data/val/rel/0006.rel

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
c="cardiac cath" 5:2 5:3||r="TrAP"||c="vt" 5:0 5:0
c="stent" 5:5 5:5||r="TrAP"||c="vt" 5:0 5:0
c="amp" 5:7 5:7||r="TrAP"||c="vt" 5:0 5:0
c="amio loading" 5:9 5:10||r="TrAP"||c="vt" 5:0 5:0
c="cath" 94:16 94:16||r="TrAP"||c="vt" 94:11 94:11
c="cardiac" 94:15 94:15||r="TrAP"||c="vt" 94:11 94:11
c="stent" 94:18 94:18||r="TrAP"||c="vt" 94:11 94:11
c="amio loading" 94:20 94:21||r="TrAP"||c="vt" 94:11 94:11
c="bilateral knee replacement" 95:22 95:24||r="TrAP"||c="mi ischemic cardiomyopathy" 95:19 95:21
c="cabg" 100:8 100:8||r="TrAP"||c="longstanding cad" 100:3 100:4
c="cabg" 100:8 100:8||r="TrAP"||c="presyncope" 100:21 100:21
c="aicd" 100:14 100:14||r="TrAP"||c="longstanding cad" 100:3 100:4
c="aicd" 100:14 100:14

In [49]:
!zip -r scibert-test-rel-2.zip data/test/rel/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: data/test/rel/ (stored 0%)
  adding: data/test/rel/0233.rel (deflated 69%)
  adding: data/test/rel/0463.rel (deflated 74%)
  adding: data/test/rel/0427.rel (deflated 79%)
  adding: data/test/rel/0058.rel (deflated 77%)
  adding: data/test/rel/0460.rel (deflated 78%)
  adding: data/test/rel/0265.rel (stored 0%)
  adding: data/test/rel/0214.rel (deflated 70%)
  adding: data/test/rel/0021.rel (deflated 65%)
  adding: data/test/rel/0415.rel (deflated 53%)
  adding: data/test/rel/0005.rel (deflated 72%)
  adding: data/test/rel/0281.rel (deflated 76%)
  adding: data/test/rel/0037.rel (deflated 75%)
  adding: data/test/rel/0129.rel (deflated 71%)
  adding: data/test/rel/0462.rel (deflated 65%)
  adding: d