[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Relations classification

Based of: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_xnli.py

In [1]:
!nvidia-smi

Wed Jan 26 16:50:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   55C    P0    29W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [17]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

ModuleNotFoundError: No module named 'google.colab'

In [6]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import random

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import datasets
import numpy as np
from datasets import load_dataset, load_metric , Dataset
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from transformers import pipeline

require_version("datasets>=1.8.0", "To fix: pip install --upgrade datasets")

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

/home/jupyter/medical_txt_parser/src/notebooks
/home/jupyter/medical_txt_parser/src
/home/jupyter/medical_txt_parser


In [7]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"
nli_data_path = "data/nli"
re_data_path = "data/re"

# model args
model_name_or_path = "allenai/scibert_scivocab_uncased" # "gsarti/scibert-nli"  "allenai/scibert_scivocab_uncased"  "models/scibert_scivocab_uncased-re-1"
cache_dir = None
model_revision = None 
tokenizer_name = model_name_or_path
do_lower_case = None
use_fast_tokenizer = True
fp16 = True

# data args
pad_to_max_length = None
max_seq_length = None

set_seed(42)


## Model Test - Problem

### Import data

In [5]:
re_task = "Te_P"

In [6]:
relations_df = pd.read_csv(re_data_path + os.sep + f"re_scibert_data_{re_task}.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,She had << a workup >> by her neurologist and ...,0
1,She had << a workup >> by her neurologist and ...,0
2,She had << a workup >> by her neurologist and ...,0
3,She had a workup by her neurologist and << an ...,1
4,She had a workup by her neurologist and << an ...,1
...,...,...
2145,The patient had << an echocardiogram >> on day...,1
2146,The patient had << an echocardiogram >> on day...,1
2147,The patient had << an echocardiogram >> on day...,1
2148,The patient had << an echocardiogram >> on day...,1


In [10]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=10, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 2140
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 10
 }))

In [11]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 1    988
0    987
2    165
Name: label, dtype: int64
val_df labels: 0    5
1    4
2    1
Name: label, dtype: int64


In [12]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [13]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Sample 456 of the training set: {'text': 'Ext : warm , 2+ << DP >> b/l , [[ 2+ pitting edema to knees b/l ]]', 'label': 0, 'input_ids': [102, 1267, 862, 8591, 422, 170, 473, 962, 962, 6769, 1374, 1374, 132, 1352, 152, 422, 260, 260, 170, 473, 5304, 586, 12987, 147, 8710, 30113, 132, 1352, 152, 1901, 1901, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 102 of the training set: {'text': '<< An ultrasound of the right upper quadrant >> did not reveal any cholelithiasis or cholecystitis , however , an irregular hepatic contour was seen which is suggestive of [[ underlying chronic liver disease ]] .', 'label': 0, 'input_ids': [102, 962, 962, 130, 7801, 131, 111, 2083, 3105, 21337, 1374, 1374, 1544, 302, 2303, 843, 8104, 19478, 19851, 3353, 234, 25662, 3626, 422, 694, 422, 130, 11147, 7221, 1033

In [14]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [15]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [16]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

### Training

In [17]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight='balanced', classes=list(id2label.keys()),y=train_df["label"])
class_weights = torch.tensor(class_weights).log1p()

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [18]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}-1"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [19]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2140
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 340


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.8036,0.726446,0.666667,0.694444,0.783333,0.7
2,0.5234,0.530641,0.666667,0.694444,0.783333,0.7
3,0.4371,0.34833,0.768519,0.75,0.85,0.8
4,0.4963,0.290834,0.85,0.85,0.85,0.8
5,0.156,0.350409,0.922078,0.944444,0.916667,0.9
6,0.1558,0.387758,0.833333,0.904762,0.833333,0.8
7,0.0279,0.351495,0.922078,0.944444,0.916667,0.9
8,0.1444,0.38771,0.922078,0.944444,0.916667,0.9
9,0.0321,0.369964,0.922078,0.944444,0.916667,0.9
10,0.0088,0.40646,0.922078,0.944444,0.916667,0.9


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassificati

In [20]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-Te_P-1
Configuration saved in models/scibert_scivocab_uncased-re-Te_P-1/config.json
Model weights saved in models/scibert_scivocab_uncased-re-Te_P-1/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-Te_P-1/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-Te_P-1/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  total_flos               =  1280704GF
  train_loss               =     0.3096
  train_runtime            = 0:04:23.37
  train_samples            =       2140
  train_samples_per_second =     81.253
  train_steps_per_second   =      1.291


In [21]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =        0.9
  eval_f1                 =     0.9221
  eval_loss               =     0.4065
  eval_precision          =     0.9444
  eval_recall             =     0.9167
  eval_runtime            = 0:00:00.10
  eval_samples            =         10
  eval_samples_per_second =     97.977
  eval_steps_per_second   =      9.798


In [22]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 10
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.83      1.00      0.91         5
        TeRP       1.00      0.75      0.86         4
        TeCP       1.00      1.00      1.00         1

    accuracy                           0.90        10
   macro avg       0.94      0.92      0.92        10
weighted avg       0.92      0.90      0.90        10



In [37]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1720
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.95      0.82      0.88       793
        TeRP       0.89      0.94      0.91       794
        TeCP       0.71      1.00      0.83       133

    accuracy                           0.89      1720
   macro avg       0.85      0.92      0.87      1720
weighted avg       0.90      0.89      0.89      1720



## Model Treatment - Problem

### Import data

In [23]:
re_task = "Tr_P"

In [25]:
import torch
# empty cuda
torch.cuda.empty_cache()

In [26]:
relations_df = pd.read_csv(re_data_path + os.sep + f"re_scibert_data_{re_task}.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,She had a postoperative CT scan that revealed ...,0
1,[[ Her pain ]] was under good control with << ...,4
2,"3. << Percocet >> , 5/325 , 1-2 tabs PO q4-6h ...",1
3,Take << codeine >> prescribed by PCP with food...,0
4,Take << codeine >> prescribed by PCP with food...,0
...,...,...
2904,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
2905,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
2906,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
2907,The patient was told he could return to work a...,1


In [42]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=10, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 2899
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 10
 }))

In [43]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 0    1698
1     881
2     183
3      62
4      51
5      24
Name: label, dtype: int64
val_df labels: 0    6
1    3
2    1
Name: label, dtype: int64


In [29]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

loading configuration file https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json from cache at /home/jupyter/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff9e9f998b9a9d40647a2148a202e3fb3d568dc0f170dda9dda194bab4d5dd
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "re",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Other",
    "1": "TrAP",
    "2": "TrCP",
    "3": "TrNAP",
    "4": "TrIP",
    "5": "TrWP"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Other": 0,
    "TrAP": 1,
    "TrCP": 2,
    "TrIP": 4,
    "TrNAP": 3,
    "TrWP": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_i

In [30]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Sample 2619 of the training set: {'text': '<< placement >> for [[ a bile leak from the duct of Luschka ]] .', 'label': 1, 'input_ids': [102, 962, 962, 8005, 1374, 1374, 168, 260, 260, 106, 11994, 7254, 263, 111, 12794, 131, 26672, 255, 3776, 1901, 1901, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 456 of the training set: {'text': '7) GI : Ms. Pimental is an unfortunate 95 year old woman , status post a recent fall necessitating right ORIF of the hip and [[ right wrist fracture ]] << ORIF >> , who was in the hospital for this event , when it was noted that she had abdominal pain , distention and coffee ground emesis with worsening mental status and renal failure .', 'label': 1, 'input_ids': [102, 450, 546, 4706, 862, 2400, 205, 24464, 1092, 165, 130, 7878, 28649, 5508, 996, 4289, 10221, 422, 2726, 1422, 106, 2151, 3913, 1956, 17642, 20

In [31]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [32]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [33]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

### Training

In [36]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

class_weights = [len(train_df)/ (len(train_df[train_df["label"] == i])*len(id2label)) for i in id2label.keys()]
# apply log to weights
class_weights = torch.tensor(class_weights).log1p()
print(class_weights)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

tensor([0.2504, 0.4372, 1.2921, 2.1740, 2.3489, 3.0508])


In [37]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}-1"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [38]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2899
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 460


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,1.1988,1.019472,0.766234,0.766667,0.777778,0.7
2,0.8028,0.658054,0.542857,0.5625,0.541667,0.6
3,0.3928,0.515419,0.592857,0.625,0.583333,0.7
4,0.2224,0.605726,0.643939,0.666667,0.625,0.8
5,0.3866,0.413809,0.592857,0.625,0.583333,0.7
6,0.2857,0.64296,0.541667,0.6,0.541667,0.6
7,0.1899,0.489113,0.643939,0.666667,0.625,0.8
8,0.2993,0.421152,0.643939,0.666667,0.625,0.8
9,0.0594,0.448395,0.643939,0.666667,0.625,0.8
10,0.0412,0.488282,0.643939,0.666667,0.625,0.8


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassificati

In [39]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-Tr_P-1
Configuration saved in models/scibert_scivocab_uncased-re-Tr_P-1/config.json
Model weights saved in models/scibert_scivocab_uncased-re-Tr_P-1/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-Tr_P-1/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-Tr_P-1/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  total_flos               =  3631377GF
  train_loss               =     0.5157
  train_runtime            = 0:11:21.94
  train_samples            =       2899
  train_samples_per_second =     42.511
  train_steps_per_second   =      0.675


In [44]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64


*** Evaluate ***


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [41]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 10
  Batch size = 64


ValueError: Number of classes, 4, does not match size of target_names, 6. Try specifying the labels parameter

In [22]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2327
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.87      0.67      0.76      1363
        TrAP       0.72      0.57      0.64       707
        TrCP       0.33      0.84      0.47       147
       TrNAP       0.32      0.90      0.48        50
        TrIP       0.25      0.95      0.40        41
        TrWP       0.38      0.95      0.55        19

    accuracy                           0.66      2327
   macro avg       0.48      0.81      0.55      2327
weighted avg       0.76      0.66      0.69      2327



In [45]:
# empty cuda cache
import torch
torch.cuda.empty_cache()

In [46]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Jan 26 16:28:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    40W /  70W |   3404MiB / 15109MiB |      0%      Default |
|                               |            

## Model Problem - Problem

### Import data

In [8]:
re_task = "P_P"

In [9]:
relations_df = pd.read_csv(re_data_path + os.sep + f"re_scibert_data_{re_task}.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,<< C5-6 disc herniation >> with [[ cord compre...,1
1,<< C5-6 disc herniation >> with cord compressi...,1
2,[[ C5-6 disc herniation ]] with << cord compre...,0
3,C5-6 disc herniation with << cord compression ...,0
4,[[ C5-6 disc herniation ]] with cord compressi...,0
...,...,...
10279,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
10280,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
10281,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
10282,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0


In [10]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=10, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 10274
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 10
 }))

In [11]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 0    9520
1     754
Name: label, dtype: int64
val_df labels: 0    9
1    1
Name: label, dtype: int64


In [12]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [13]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/11 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Sample 1824 of the training set: {'text': 'Gen : in << NAD >> , but winces in [[ pain ]] with movements of his back .', 'label': 0, 'input_ids': [102, 341, 862, 121, 962, 962, 8101, 1374, 1374, 422, 563, 8168, 565, 121, 260, 260, 2675, 1901, 1901, 190, 6873, 131, 1972, 1542, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 409 of the training set: {'text': 'Return to the ED for worsening chest pain , shortness of breath , nausea / vomiting , [[ fever ]] / chills , << weakness >> / dizziness/numbness , or any other problems .', 'label': 0, 'input_ids': [102, 3988, 147, 111, 777, 168, 22096, 8693, 2675, 422, 2001, 1076, 131, 10062, 422, 18816, 1352, 18644, 422, 260, 260, 10551, 1901, 1901, 1352, 7418, 4078, 30113, 422, 962, 962, 11688, 1374, 1374, 1352, 346, 10207, 2902, 1352, 541, 6605, 250, 422, 234, 843, 494, 2010, 205, 

In [14]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [15]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [16]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

### Training

In [17]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

class_weights = [len(train_df)/ (len(train_df[train_df["label"] == i])*len(id2label)) for i in id2label.keys()]
class_weights = torch.tensor(class_weights).log1p()

print(class_weights)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

tensor([0.4315, 2.0558])


In [18]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}-1"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [19]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 10274
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 644


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.2414,0.375891,1.0,1.0,1.0,1.0
2,0.2863,0.301649,0.803922,0.75,0.944444,0.9
3,0.0429,0.326441,0.473684,0.45,0.5,0.9
4,0.0672,0.423108,0.473684,0.45,0.5,0.9


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




In [20]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-P_P-1
Configuration saved in models/scibert_scivocab_uncased-re-P_P-1/config.json
Model weights saved in models/scibert_scivocab_uncased-re-P_P-1/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-P_P-1/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-P_P-1/special_tokens_map.json


***** train metrics *****
  epoch                    =        4.0
  total_flos               =  2926007GF
  train_loss               =     0.2803
  train_runtime            = 0:09:41.74
  train_samples            =      10274
  train_samples_per_second =     70.642
  train_steps_per_second   =      1.107


In [38]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2057
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =        5.0
  eval_accuracy           =     0.8843
  eval_f1                 =     0.7109
  eval_loss               =     0.4348
  eval_precision          =     0.6696
  eval_recall             =     0.8217
  eval_runtime            = 0:00:11.14
  eval_samples            =       2057
  eval_samples_per_second =    184.569
  eval_steps_per_second   =      2.961


In [39]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2057
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.98      0.90      0.93      1906
         PIP       0.36      0.75      0.49       151

    accuracy                           0.88      2057
   macro avg       0.67      0.82      0.71      2057
weighted avg       0.93      0.88      0.90      2057



In [40]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 8227
  Batch size = 64


              precision    recall  f1-score   support

       Other       1.00      0.91      0.95      7623
         PIP       0.46      0.96      0.62       604

    accuracy                           0.92      8227
   macro avg       0.73      0.93      0.79      8227
weighted avg       0.96      0.92      0.93      8227



## Final Predictions

In [21]:
val_data_path = "data/test"

In [23]:
text_files = glob.glob(val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
# df = df[df.text != ""]
# df = df.reset_index(drop=True)

# add concepts
rel_df = pd.DataFrame()
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(val_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")
    
    concept_df = pd.DataFrame(concept_dict).drop(columns=["end_line"])
    test_concept_df = concept_df[concept_df["concept_type"] == "test"]
    problem_concept_df = concept_df[concept_df["concept_type"] == "problem"]
    treatment_concept_df = concept_df[concept_df["concept_type"] == "treatment"]

    # class test --> problem
    test_problem_df = pd.merge(test_concept_df, problem_concept_df, how="inner", on="start_line")

    # class treatment --> problem
    treatment_problem_df = pd.merge(treatment_concept_df, problem_concept_df, how="inner", on="start_line")

    # class problem --> problem
    problem_problem_df = pd.merge(problem_concept_df, problem_concept_df, how="inner", on="start_line")
    problem_problem_df = problem_problem_df[problem_problem_df["concept_text_x"] != problem_problem_df["concept_text_y"]] # TODO: remove duplicates ?

    tmp = pd.concat([test_problem_df, treatment_problem_df, problem_problem_df], axis=0)
    tmp["filename"] = fname
    rel_df = rel_df.append(tmp, ignore_index=True)
            
rel_df = rel_df.sort_values(by=["filename", "start_line"])
rel_df = rel_df.reset_index(drop=True)

rel_df = rel_df[[ "filename", "start_line", "concept_text_x", "concept_text_y", "concept_type_x", "concept_type_y", "start_word_number_x", "end_word_number_x", "start_word_number_y", "end_word_number_y"]]
rel_df

100%|██████████| 128/128 [00:00<00:00, 1059.87it/s]
100%|██████████| 128/128 [00:01<00:00, 86.04it/s]


Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y
0,0001,18,Mesenteric angiograpm,bleeding vessel,treatment,problem,0,1,6,7
1,0001,18,coil embolization,bleeding vessel,treatment,problem,3,4,6,7
2,0001,22,cabg,dm2,treatment,problem,13,13,9,9
3,0001,22,cabg,cad,treatment,problem,13,13,11,11
4,0001,22,cabg,DVT,treatment,problem,13,13,15,15
...,...,...,...,...,...,...,...,...,...,...
18127,0477,109,Saline wet to dry dressing,penis and pelvis decubiti,treatment,problem,0,4,10,13
18128,0477,109,40% humidified oxygen,penis and pelvis decubiti,treatment,problem,15,17,10,13
18129,0477,109,Fluconazole,penis and pelvis decubiti,treatment,problem,19,19,10,13
18130,0477,132,a surgical procedure,his constrictive pericarditis,treatment,problem,18,20,13,15


In [24]:
# make predict dataset
def preprocess_text(row):
    # find line
    line =  df[(df["filename"] == row["filename"]) & (df["line_number"] == row["start_line"]-1)]["text"].values[0]
    # line = line.lower()
    line = " ".join(line.split()) # remove multiple spaces

    concept_text_x = "<< "+ " ".join(line.split()[row["start_word_number_x"]:row["end_word_number_x"]+1]) + " >>"
    concept_text_y = "[[ " + " ".join(line.split()[row["start_word_number_y"]:row["end_word_number_y"]+1]) + " ]]"
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]

    if row["start_word_number_x"] > row["start_word_number_y"]:
        concept_text_x, concept_text_y = concept_text_y, concept_text_x
        start_word_number_x, start_word_number_y = start_word_number_y, start_word_number_x
        end_word_number_x, end_word_number_y = end_word_number_y, end_word_number_x
    text = " ".join(line.split()[: start_word_number_x] + [concept_text_x] + line.split()[end_word_number_x+1: start_word_number_y] + [concept_text_y] + line.split()[end_word_number_y+1:])

    row["text"] = text
    return row

predict_df = rel_df.apply(preprocess_text, axis=1)
predict_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,text
0,0001,18,Mesenteric angiograpm,bleeding vessel,treatment,problem,0,1,6,7,<< Mesenteric angiograpm >> w/ coil embolizati...
1,0001,18,coil embolization,bleeding vessel,treatment,problem,3,4,6,7,Mesenteric angiograpm w/ << coil embolization ...
2,0001,22,cabg,dm2,treatment,problem,13,13,9,9,"HPI: Pt is a 71 y/o male with h/o [[ dm2 ]] , ..."
3,0001,22,cabg,cad,treatment,problem,13,13,11,11,"HPI: Pt is a 71 y/o male with h/o dm2 , [[ cad..."
4,0001,22,cabg,DVT,treatment,problem,13,13,15,15,"HPI: Pt is a 71 y/o male with h/o dm2 , cad s/..."
...,...,...,...,...,...,...,...,...,...,...,...
18127,0477,109,Saline wet to dry dressing,penis and pelvis decubiti,treatment,problem,0,4,10,13,<< Saline wet to dry dressing >> changes three...
18128,0477,109,40% humidified oxygen,penis and pelvis decubiti,treatment,problem,15,17,10,13,Saline wet to dry dressing changes three times...
18129,0477,109,Fluconazole,penis and pelvis decubiti,treatment,problem,19,19,10,13,Saline wet to dry dressing changes three times...
18130,0477,132,a surgical procedure,his constrictive pericarditis,treatment,problem,18,20,13,15,The patient was seen at Ph University Of Medic...


In [25]:
orig_predict_df = predict_df.copy()

In [36]:
re_task = "Tr_P"

if re_task == "P_P":
    # problem --> problem
    predict_df = orig_predict_df[(orig_predict_df["concept_type_x"] == "problem") & (orig_predict_df["concept_type_y"] == "problem")]
    label2id = {'Other': 0, 'PIP': 1}
elif re_task == "Tr_P":
    # treatment --> problem
    predict_df = orig_predict_df[(orig_predict_df["concept_type_x"] == "treatment") & (orig_predict_df["concept_type_y"] == "problem")]
    label2id = {'Other': 0, 'TrAP': 1, 'TrCP': 2, 'TrNAP': 3, 'TrIP': 4, 'TrWP': 5}
elif re_task == "Te_P":
    # test --> problem
    predict_df = orig_predict_df[(orig_predict_df["concept_type_x"] == "test") & (orig_predict_df["concept_type_y"] == "problem")]
    label2id = {'Other': 0, 'TeRP': 1, 'TeCP': 2}
id2label = {v: k for k, v in label2id.items()}


model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}-1"
model = AutoModelForSequenceClassification.from_pretrained(f"models/{model_folder_name}", label2id=label2id, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(f"models/{model_folder_name}")

# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None
    
# Initialize our Trainer
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = Trainer(
    model=model,
    # args=args,
    # train_dataset=train_dataset ,
    # eval_dataset=eval_dataset ,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

loading configuration file models/scibert_scivocab_uncased-re-Tr_P-1/config.json
Model config BertConfig {
  "_name_or_path": "models/scibert_scivocab_uncased-re-Tr_P-1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "re",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Other",
    "1": "TrAP",
    "2": "TrCP",
    "3": "TrNAP",
    "4": "TrIP",
    "5": "TrWP"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Other": 0,
    "TrAP": 1,
    "TrCP": 2,
    "TrIP": 4,
    "TrNAP": 3,
    "TrWP": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_versi

In [37]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )

predict_dataset = Dataset.from_pandas(predict_df, preserve_index=False)
# predict_dataset = predict_dataset.select(range(10))
predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on prediction dataset",
            )
predict_dataset

Running tokenizer on prediction dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['filename', 'start_line', 'concept_text_x', 'concept_text_y', 'concept_type_x', 'concept_type_y', 'start_word_number_x', 'end_word_number_x', 'start_word_number_y', 'end_word_number_y', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3521
})

In [38]:
predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
len(predictions)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: start_word_number_y, start_word_number_x, concept_text_y, start_line, text, end_word_number_y, end_word_number_x, concept_type_x, concept_text_x, concept_type_y, filename.
***** Running Prediction *****
  Num examples = 3521
  Batch size = 16


3521

In [39]:
predict_df["prediction"] = [id2label[label] for label in predictions]
rel_df.loc[predict_df.index, "prediction"] = predict_df["prediction"]
rel_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,prediction
0,0001,18,Mesenteric angiograpm,bleeding vessel,treatment,problem,0,1,6,7,TrAP
1,0001,18,coil embolization,bleeding vessel,treatment,problem,3,4,6,7,TrAP
2,0001,22,cabg,dm2,treatment,problem,13,13,9,9,Other
3,0001,22,cabg,cad,treatment,problem,13,13,11,11,Other
4,0001,22,cabg,DVT,treatment,problem,13,13,15,15,Other
...,...,...,...,...,...,...,...,...,...,...,...
18127,0477,109,Saline wet to dry dressing,penis and pelvis decubiti,treatment,problem,0,4,10,13,TrAP
18128,0477,109,40% humidified oxygen,penis and pelvis decubiti,treatment,problem,15,17,10,13,Other
18129,0477,109,Fluconazole,penis and pelvis decubiti,treatment,problem,19,19,10,13,TrAP
18130,0477,132,a surgical procedure,his constrictive pericarditis,treatment,problem,18,20,13,15,TrAP


In [40]:
rel_df["prediction"].value_counts()


Other    14163
TrAP      1207
PIP       1161
TeRP      1140
TrCP       161
TeCP       150
TrIP        82
TrNAP       54
TrWP        14
Name: prediction, dtype: int64

you can now set another re_task

In [47]:
# for each file create <filename>.con
os.makedirs(val_data_path + os.sep + rel_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
for file in files:
    os.remove(file)

for i, row in tqdm(rel_df.iterrows()):
    filename = row["filename"]
    concept_text_x = row["concept_text_x"].lower()
    concept_text_y = row["concept_text_y"].lower()
    concept_type_x = row["concept_type_x"]
    concept_type_y = row["concept_type_y"]
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]
    line_number = row["start_line"]
    prediction = row["prediction"]
    if prediction != "Other":
        with open(val_data_path + os.sep + rel_folder_name + os.sep + filename + ".rel", "a") as f:
            # fill like this c="pefusion imaging" 19:6 19:7||r="TeRP"||c="perfusion defects" 19:12 19:13
            f.write(
                f"c=\"{concept_text_x}\" {line_number}:{start_word_number_x} {line_number}:{end_word_number_x}||r=\"{prediction}\"||c=\"{concept_text_y}\" {line_number}:{start_word_number_y} {line_number}:{end_word_number_y}\n"
            )
    


18132it [00:01, 14043.15it/s]


In [48]:
rel_files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
rel_files = [f.split(os.sep)[-1][:-4] for f in rel_files]
txt_files = [f.split(os.sep)[-1][:-4] for f in text_files]
# find missing files
missing_files = set(txt_files) - set(rel_files)
missing_files


{'0046', '0154', '0182', '0226', '0265', '0274', '0326', '0413'}

In [49]:
# create empty files for missing files
for f in missing_files:
    with open(val_data_path + os.sep + rel_folder_name + os.sep + f + ".rel", "w") as f:
        f.write("")

In [50]:
rel_files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
rel_files = [f.split(os.sep)[-1][:-4] for f in rel_files]
txt_files = [f.split(os.sep)[-1][:-4] for f in text_files]
# find missing files
missing_files = set(txt_files) - set(rel_files)
missing_files

set()

In [51]:
!cat data/test/rel/0001.rel

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
c="mesenteric angiograpm" 18:0 18:1||r="TrAP"||c="bleeding vessel" 18:6 18:7
c="coil embolization" 18:3 18:4||r="TrAP"||c="bleeding vessel" 18:6 18:7
c="long term anti-coagulation" 22:17 22:19||r="TrAP"||c="dvt" 22:15 22:15
c="long term anti-coagulation" 22:17 22:19||r="TrAP"||c="pe" 22:15 22:15
c="ciprofloxacin" 24:5 24:5||r="TrAP"||c="a uti" 24:7 24:8
c="an initial dre" 27:1 27:3||r="TeRP"||c="clot" 27:6 27:6
c="3v-cabg" 34:3 34:3||r="TrAP"||c="cad" 34:1 34:1
c="ef" 35:3 35:3||r="TeRP"||c="chf" 35:1 35:1
c="1996 cardiac" 36:6 36:7||r="TrCP"||c="right parietal intracranial bleeding" 36:1 36:4
c="s/p" 38:4 38:4||r="TrAP"||c="sinus node dysfunction" 38:1 38:3
c="lifetime" 39:13 39:13||r="TrAP"||c="subsequent 

In [52]:
!zip -r scibert-test-rel-2-sep.zip data/test/rel/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: data/test/rel/ (stored 0%)
  adding: data/test/rel/0233.rel (deflated 64%)
  adding: data/test/rel/0463.rel (deflated 65%)
  adding: data/test/rel/0427.rel (deflated 73%)
  adding: data/test/rel/0058.rel (deflated 72%)
  adding: data/test/rel/0460.rel (deflated 74%)
  adding: data/test/rel/0265.rel (stored 0%)
  adding: data/test/rel/0214.rel (deflated 66%)
  adding: data/test/rel/0021.rel (deflated 68%)
  adding: data/test/rel/0415.rel (deflated 51%)
  adding: data/test/rel/0005.rel (deflated 67%)
  adding: data/test/rel/0281.rel (deflated 69%)
  adding: data/test/rel/0037.rel (deflated 70%)
  adding: data/test/rel/0129.rel (deflated 64%)
  adding: data/test/rel/0462.rel (deflated 60%)
  adding: d