[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Relations classification

Based of: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_xnli.py

In [2]:
!nvidia-smi

Tue Jan 25 13:27:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0    32W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   68C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [17]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

ModuleNotFoundError: No module named 'google.colab'

In [12]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import random

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import datasets
import numpy as np
from datasets import load_dataset, load_metric , Dataset
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from transformers import pipeline

require_version("datasets>=1.8.0", "To fix: pip install --upgrade datasets")

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [2]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"
nli_data_path = "data/nli"
re_data_path = "data/re"

# model args
model_name_or_path = "allenai/scibert_scivocab_uncased" # "gsarti/scibert-nli"  "allenai/scibert_scivocab_uncased"  "models/scibert_scivocab_uncased-re-1"
cache_dir = None
model_revision = None 
tokenizer_name = model_name_or_path
do_lower_case = None
use_fast_tokenizer = True
fp16 = True

# data args
pad_to_max_length = None
max_seq_length = None

set_seed(42)


## Model Test - Problem

### Import data

In [37]:
re_task = "Te_P"

In [38]:
relations_df = pd.read_csv(re_data_path + os.sep + f"re_scibert_data_{re_task}.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,She had << a workup >> by her neurologist and ...,0
1,She had << a workup >> by her neurologist and ...,0
2,She had << a workup >> by her neurologist and ...,0
3,She had a workup by her neurologist and << an ...,1
4,She had a workup by her neurologist and << an ...,1
...,...,...
2145,The patient had << an echocardiogram >> on day...,1
2146,The patient had << an echocardiogram >> on day...,1
2147,The patient had << an echocardiogram >> on day...,1
2148,The patient had << an echocardiogram >> on day...,1


In [5]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=0.2, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 1720
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 430
 }))

In [6]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 1    794
0    793
2    133
Name: label, dtype: int64
val_df labels: 0    199
1    198
2     33
Name: label, dtype: int64


In [7]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [8]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Sample 1309 of the training set: {'text': '<< The liver enzimes >> come down , no signs of [[ biliary stasis ]] .', 'label': 0, 'input_ids': [102, 962, 962, 111, 2993, 279, 5889, 8608, 1374, 1374, 6096, 1922, 422, 425, 6482, 131, 260, 260, 17496, 15934, 4554, 1901, 1901, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 228 of the training set: {'text': '<< Chest x-ray >> shows no [[ infiltrate ]] or pneumothorax .', 'label': 1, 'input_ids': [102, 962, 962, 8693, 412, 579, 7930, 1374, 1374, 1402, 425, 260, 260, 28051, 1901, 1901, 234, 6335, 27948, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 51 of the training set: {'text': 'On << physical examination >> , she was a very pleasant , moderate

In [9]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [10]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [11]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

### Training

In [25]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight='balanced', classes=list(id2label.keys()),y=train_df["label"])

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [32]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1720
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 270


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.9147,0.862243,0.591877,0.579334,0.655635,0.639535
2,0.7825,0.740154,0.612529,0.602095,0.716114,0.653488
3,0.6154,0.697649,0.646892,0.642525,0.748295,0.686047
4,0.4829,0.641081,0.669257,0.646956,0.727916,0.704651
5,0.4643,0.641414,0.678677,0.661465,0.748194,0.709302
6,0.2148,0.612059,0.712023,0.692206,0.776763,0.737209
7,0.2231,0.626099,0.717725,0.699795,0.756476,0.744186
8,0.2643,0.59374,0.732274,0.706698,0.779977,0.753488
9,0.3745,0.62524,0.732229,0.716297,0.759767,0.760465
10,0.1679,0.628189,0.732404,0.717083,0.766552,0.75814


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 430
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 430
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 430
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 430
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassifi

In [34]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-Te_P
Configuration saved in models/scibert_scivocab_uncased-re-Te_P/config.json
Model weights saved in models/scibert_scivocab_uncased-re-Te_P/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-Te_P/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-Te_P/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  total_flos               =  1050899GF
  train_loss               =      0.494
  train_runtime            = 0:04:01.01
  train_samples            =       1720
  train_samples_per_second =     71.364
  train_steps_per_second   =       1.12


In [35]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 430
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.7581
  eval_f1                 =     0.7324
  eval_loss               =     0.6282
  eval_precision          =     0.7171
  eval_recall             =     0.7666
  eval_runtime            = 0:00:02.10
  eval_samples            =        430
  eval_samples_per_second =     204.41
  eval_steps_per_second   =      3.328


In [36]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 430
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.83      0.67      0.74       199
        TeRP       0.74      0.84      0.79       198
        TeCP       0.58      0.79      0.67        33

    accuracy                           0.76       430
   macro avg       0.72      0.77      0.73       430
weighted avg       0.77      0.76      0.76       430



In [37]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1720
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.95      0.82      0.88       793
        TeRP       0.89      0.94      0.91       794
        TeCP       0.71      1.00      0.83       133

    accuracy                           0.89      1720
   macro avg       0.85      0.92      0.87      1720
weighted avg       0.90      0.89      0.89      1720



## Model Treatment - Problem

### Import data

In [34]:
re_task = "Tr_P"

In [35]:
relations_df = pd.read_csv(re_data_path + os.sep + f"re_scibert_data_{re_task}.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,She had a postoperative CT scan that revealed ...,0
1,[[ Her pain ]] was under good control with << ...,4
2,"3. << Percocet >> , 5/325 , 1-2 tabs PO q4-6h ...",1
3,Take << codeine >> prescribed by PCP with food...,0
4,Take << codeine >> prescribed by PCP with food...,0
...,...,...
2904,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
2905,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
2906,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
2907,The patient was told he could return to work a...,1


In [9]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=0.2, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 2327
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 582
 }))

In [10]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 0    1363
1     707
2     147
3      50
4      41
5      19
Name: label, dtype: int64
val_df labels: 0    341
1    177
2     37
3     12
4     10
5      5
Name: label, dtype: int64


In [11]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [12]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Sample 456 of the training set: {'text': 'There was still concern for her effusion contributing to her SOB at the OSH so she was transferred for << drainage >> and treatment of [[ CHF ]] .', 'label': 0, 'input_ids': [102, 461, 241, 2077, 6366, 168, 1750, 24522, 10027, 147, 1750, 25859, 235, 111, 3581, 30117, 564, 2281, 241, 6388, 168, 962, 962, 12808, 1374, 1374, 137, 922, 131, 260, 260, 25794, 1901, 1901, 205, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 102 of the training set: {'text': "Mr. Pohl is a 53 - year-old male with a history of ETOH , hypertension who presented int he emergency room with increased agitation , like secondary to ETOH intoxication and developed [[ subsequent hypotension ]] , systolic blood pressures in the 80 's, status post << Ativan >> 

In [13]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [14]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [15]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

### Training

In [16]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

class_weights = [len(train_df)/ (len(train_df[train_df["label"] == i])*len(id2label)) for i in id2label.keys()]
print(class_weights)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

[0.28454389826363413, 0.5485619990570486, 2.6383219954648527, 7.756666666666667, 9.459349593495935, 20.412280701754387]


In [17]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [18]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2327
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 185


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,1.751,1.623897,0.266744,0.276491,0.374617,0.494845
2,1.5349,1.461777,0.360805,0.415869,0.440072,0.54811
3,1.3943,1.322689,0.409176,0.400643,0.554285,0.587629
4,0.8592,1.242026,0.459844,0.426182,0.618987,0.62543
5,0.8145,1.209963,0.466858,0.432025,0.640575,0.627148


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 582
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 582
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 582
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 582
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassifi

In [19]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-Tr_P
Configuration saved in models/scibert_scivocab_uncased-re-Tr_P/config.json
Model weights saved in models/scibert_scivocab_uncased-re-Tr_P/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-Tr_P/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-Tr_P/special_tokens_map.json


***** train metrics *****
  epoch                    =        5.0
  total_flos               =  1448866GF
  train_loss               =      1.313
  train_runtime            = 0:05:10.23
  train_samples            =       2327
  train_samples_per_second =     37.504
  train_steps_per_second   =      0.596


In [20]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 582
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =        5.0
  eval_accuracy           =     0.6271
  eval_f1                 =     0.4669
  eval_loss               =       1.21
  eval_precision          =      0.432
  eval_recall             =     0.6406
  eval_runtime            = 0:00:05.38
  eval_samples            =        582
  eval_samples_per_second =      108.0
  eval_steps_per_second   =      1.856


In [21]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 582
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.84      0.65      0.73       341
        TrAP       0.73      0.56      0.63       177
        TrCP       0.27      0.70      0.39        37
       TrNAP       0.48      0.83      0.61        12
        TrIP       0.14      0.70      0.23        10
        TrWP       0.14      0.40      0.21         5

    accuracy                           0.63       582
   macro avg       0.43      0.64      0.47       582
weighted avg       0.74      0.63      0.66       582



In [22]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2327
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.87      0.67      0.76      1363
        TrAP       0.72      0.57      0.64       707
        TrCP       0.33      0.84      0.47       147
       TrNAP       0.32      0.90      0.48        50
        TrIP       0.25      0.95      0.40        41
        TrWP       0.38      0.95      0.55        19

    accuracy                           0.66      2327
   macro avg       0.48      0.81      0.55      2327
weighted avg       0.76      0.66      0.69      2327



In [23]:
# empty cuda cache
import torch
torch.cuda.empty_cache()

In [24]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Jan 25 17:24:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0    45W /  70W |   3452MiB / 15109MiB |      0%      Default |
|                               |            

## Model Problem - Problem

### Import data

In [25]:
re_task = "P_P"

In [32]:
relations_df = pd.read_csv(re_data_path + os.sep + f"re_scibert_data_{re_task}.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(relations_df["label"].value_counts().index.tolist())}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,<< C5-6 disc herniation >> with [[ cord compre...,1
1,<< C5-6 disc herniation >> with cord compressi...,1
2,[[ C5-6 disc herniation ]] with << cord compre...,0
3,C5-6 disc herniation with << cord compression ...,0
4,[[ C5-6 disc herniation ]] with cord compressi...,0
...,...,...
10279,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
10280,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
10281,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0
10282,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",0


In [27]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=0.2, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 8227
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 2057
 }))

In [28]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 0    7623
1     604
Name: label, dtype: int64
val_df labels: 0    1906
1     151
Name: label, dtype: int64


In [29]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    # do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

loading configuration file https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json from cache at /home/jupyter/.cache/huggingface/transformers/858852fd2471ce39075378592ddc87f5a6551e64c6825d1b92c8dab9318e0fc3.03ff9e9f998b9a9d40647a2148a202e3fb3d568dc0f170dda9dda194bab4d5dd
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "re",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Other",
    "1": "PIP"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Other": 0,
    "PIP": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "

In [30]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/9 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Sample 1824 of the training set: {'text': 'MOUTH NORMAL NECK NORMAL thyroid wnl BREASTS NORMAL no distinct masses NIPPLES NORMAL << inverted >> [ b ] , evert w / stimulation CHEST NORMAL LCTA COR NORMAL RRR ABDOMEN NORMAL gravid EXTREM NORMAL SKIN NORMAL NODES NORMAL VULVA NORMAL no lesions , [[ white d / c at introitus ]] VAGINA NORMAL sml amt thin white d / c ph 4.5 , koh +amine , NS +clue , neg trich CERVIX NORMAL 1/100/0 srom clear OS NORMAL closed ADNEXAE NORMAL no palp masses , NT UTERUS NORMAL gravid UTERINE SIZE IN WEEKS NORMAL term RECTUM NORMAL no ext lesions', 'label': 0, 'input_ids': [102, 12860, 1346, 7980, 1346, 8143, 15239, 30115, 3479, 30113, 1346, 425, 3646, 9686, 26119, 1024, 1346, 962, 962, 13455, 1374, 1374, 260, 132, 1901, 422, 1661, 30108, 124, 1352, 4156, 8693, 1346, 6087, 2219, 470, 1346, 5058, 30114, 17748, 1346, 10377, 173, 4847, 1346, 3843, 1346, 2207, 1346, 5992, 3833, 1346, 425, 4278, 422, 260, 260, 3606, 128, 1352, 115, 235, 1205, 8592, 1901, 1901, 7988, 1

In [31]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [32]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [33]:
model_name_or_path

'allenai/scibert_scivocab_uncased'

### Training

In [34]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

class_weights = [len(train_df)/ (len(train_df[train_df["label"] == i])*len(id2label)) for i in id2label.keys()]
print(class_weights)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

[0.5396169487078578, 6.810430463576159]


In [35]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [36]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 8227
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 645


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.3724,0.432803,0.611964,0.607165,0.817301,0.774429
2,0.3238,0.437401,0.707597,0.672974,0.77617,0.895965
3,0.237,0.389542,0.70108,0.660426,0.841277,0.869713
4,0.3278,0.403065,0.697631,0.657812,0.837179,0.867769
5,0.1758,0.434782,0.710932,0.669617,0.821706,0.884298


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2057
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2057
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2057
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2057
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClas

In [37]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_uncased-re-P_P
Configuration saved in models/scibert_scivocab_uncased-re-P_P/config.json
Model weights saved in models/scibert_scivocab_uncased-re-P_P/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_uncased-re-P_P/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_uncased-re-P_P/special_tokens_map.json


***** train metrics *****
  epoch                    =        5.0
  total_flos               =  2953218GF
  train_loss               =     0.3607
  train_runtime            = 0:10:58.19
  train_samples            =       8227
  train_samples_per_second =     62.497
  train_steps_per_second   =       0.98


In [38]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 2057
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =        5.0
  eval_accuracy           =     0.8843
  eval_f1                 =     0.7109
  eval_loss               =     0.4348
  eval_precision          =     0.6696
  eval_recall             =     0.8217
  eval_runtime            = 0:00:11.14
  eval_samples            =       2057
  eval_samples_per_second =    184.569
  eval_steps_per_second   =      2.961


In [39]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2057
  Batch size = 64


              precision    recall  f1-score   support

       Other       0.98      0.90      0.93      1906
         PIP       0.36      0.75      0.49       151

    accuracy                           0.88      2057
   macro avg       0.67      0.82      0.71      2057
weighted avg       0.93      0.88      0.90      2057



In [40]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 8227
  Batch size = 64


              precision    recall  f1-score   support

       Other       1.00      0.91      0.95      7623
         PIP       0.46      0.96      0.62       604

    accuracy                           0.92      8227
   macro avg       0.73      0.93      0.79      8227
weighted avg       0.96      0.92      0.93      8227



## Final Predictions

In [43]:
text_files = glob.glob(val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
# df = df[df.text != ""]
# df = df.reset_index(drop=True)

# add concepts
rel_df = pd.DataFrame()
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(val_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")
    
    concept_df = pd.DataFrame(concept_dict).drop(columns=["end_line"])
    test_concept_df = concept_df[concept_df["concept_type"] == "test"]
    problem_concept_df = concept_df[concept_df["concept_type"] == "problem"]
    treatment_concept_df = concept_df[concept_df["concept_type"] == "treatment"]

    # class test --> problem
    test_problem_df = pd.merge(test_concept_df, problem_concept_df, how="inner", on="start_line")

    # class treatment --> problem
    treatment_problem_df = pd.merge(treatment_concept_df, problem_concept_df, how="inner", on="start_line")

    # class problem --> problem
    problem_problem_df = pd.merge(problem_concept_df, problem_concept_df, how="inner", on="start_line")
    problem_problem_df = problem_problem_df[problem_problem_df["concept_text_x"] != problem_problem_df["concept_text_y"]] # TODO: remove duplicates ?

    tmp = pd.concat([test_problem_df, treatment_problem_df, problem_problem_df], axis=0)
    tmp["filename"] = fname
    rel_df = rel_df.append(tmp, ignore_index=True)
            
rel_df = rel_df.sort_values(by=["filename", "start_line"])
rel_df = rel_df.reset_index(drop=True)

rel_df = rel_df[[ "filename", "start_line", "concept_text_x", "concept_text_y", "concept_type_x", "concept_type_y", "start_word_number_x", "end_word_number_x", "start_word_number_y", "end_word_number_y"]]
rel_df

100%|██████████| 128/128 [00:00<00:00, 1032.03it/s]
100%|██████████| 128/128 [00:01<00:00, 82.97it/s]


Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y
0,0006,5,cardiac cath,vt,test,problem,2,3,0,0
1,0006,5,stent,vt,treatment,problem,5,5,0,0
2,0006,5,amp,vt,treatment,problem,7,7,0,0
3,0006,5,amio loading,vt,treatment,problem,9,10,0,0
4,0006,87,house,low chol,problem,problem,0,0,2,3
...,...,...,...,...,...,...,...,...,...,...
14330,0475,109,the serum albumin gradient,portal hypertension,test,problem,0,3,8,9
14331,0475,112,her platelet count,low,test,problem,0,2,5,5
14332,0475,112,cimetidine,low,treatment,problem,15,15,5,5
14333,0475,120,beta blocker,a persistent wheeze,treatment,problem,9,10,5,7


In [44]:
# make predict dataset
def preprocess_text(row):
    # find line
    line =  df[(df["filename"] == row["filename"]) & (df["line_number"] == row["start_line"]-1)]["text"].values[0]
    # line = line.lower()
    line = " ".join(line.split()) # remove multiple spaces

    concept_text_x = "<< "+ " ".join(line.split()[row["start_word_number_x"]:row["end_word_number_x"]+1]) + " >>"
    concept_text_y = "[[ " + " ".join(line.split()[row["start_word_number_y"]:row["end_word_number_y"]+1]) + " ]]"
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]

    if row["start_word_number_x"] > row["start_word_number_y"]:
        concept_text_x, concept_text_y = concept_text_y, concept_text_x
        start_word_number_x, start_word_number_y = start_word_number_y, start_word_number_x
        end_word_number_x, end_word_number_y = end_word_number_y, end_word_number_x
    text = " ".join(line.split()[: start_word_number_x] + [concept_text_x] + line.split()[end_word_number_x+1: start_word_number_y] + [concept_text_y] + line.split()[end_word_number_y+1:])

    row["text"] = text
    return row

predict_df = rel_df.apply(preprocess_text, axis=1)
predict_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,text
0,0006,5,cardiac cath,vt,test,problem,2,3,0,0,"[[ VT ]] s/p << cardiac cath >> , stent and am..."
1,0006,5,stent,vt,treatment,problem,5,5,0,0,"[[ VT ]] s/p cardiac cath , << stent >> and am..."
2,0006,5,amp,vt,treatment,problem,7,7,0,0,"[[ VT ]] s/p cardiac cath , stent and << amp >..."
3,0006,5,amio loading,vt,treatment,problem,9,10,0,0,"[[ VT ]] s/p cardiac cath , stent and amp ; <<..."
4,0006,87,house,low chol,problem,problem,0,0,2,3,<< House >> / [[ Low chol ]] / low sat. fat
...,...,...,...,...,...,...,...,...,...,...,...
14330,0475,109,the serum albumin gradient,portal hypertension,test,problem,0,3,8,9,<< The serum albumin gradient >> was 1.8 consi...
14331,0475,112,her platelet count,low,test,problem,0,2,5,5,<< Her platelet count >> stayed persistently [...
14332,0475,112,cimetidine,low,treatment,problem,15,15,5,5,Her platelet count stayed persistently [[ low ...
14333,0475,120,beta blocker,a persistent wheeze,treatment,problem,9,10,5,7,The patient however complained of [[ a persist...


In [45]:
orig_predict_df = predict_df.copy()

In [56]:
re_task = "Te_P"

if re_task == "P_P":
    # problem --> problem
    predict_df = orig_predict_df[(orig_predict_df["concept_type_x"] == "problem") & (orig_predict_df["concept_type_y"] == "problem")]
    label2id = {'Other': 0, 'PIP': 1}
elif re_task == "Tr_P":
    # treatment --> problem
    predict_df = orig_predict_df[(orig_predict_df["concept_type_x"] == "treatment") & (orig_predict_df["concept_type_y"] == "problem")]
    label2id = {'Other': 0, 'TrAP': 1, 'TrCP': 2, 'TrNAP': 3, 'TrIP': 4, 'TrWP': 5}
elif re_task == "Te_P":
    # test --> problem
    predict_df = orig_predict_df[(orig_predict_df["concept_type_x"] == "test") & (orig_predict_df["concept_type_y"] == "problem")]
    label2id = {'Other': 0, 'TeRP': 1, 'TeCP': 2}
id2label = {v: k for k, v in label2id.items()}


model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-{re_task}"
model = AutoModelForSequenceClassification.from_pretrained(f"models/{model_folder_name}", label2id=label2id, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(f"models/{model_folder_name}")

# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None
    
# Initialize our Trainer
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = Trainer(
    model=model,
    # args=args,
    # train_dataset=train_dataset ,
    # eval_dataset=eval_dataset ,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

loading configuration file models/scibert_scivocab_uncased-re-Te_P/config.json
Model config BertConfig {
  "_name_or_path": "models/scibert_scivocab_uncased-re-Te_P",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "re",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Other",
    "1": "TeRP",
    "2": "TeCP"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Other": 0,
    "TeCP": 2,
    "TeRP": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading weights f

In [57]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )

predict_dataset = Dataset.from_pandas(predict_df, preserve_index=False)
# predict_dataset = predict_dataset.select(range(10))
predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on prediction dataset",
            )
predict_dataset

Running tokenizer on prediction dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['filename', 'start_line', 'concept_text_x', 'concept_text_y', 'concept_type_x', 'concept_type_y', 'start_word_number_x', 'end_word_number_x', 'start_word_number_y', 'end_word_number_y', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2261
})

In [58]:
predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
len(predictions)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: filename, start_line, concept_text_y, start_word_number_x, end_word_number_x, concept_type_x, concept_type_y, end_word_number_y, start_word_number_y, concept_text_x, text.
***** Running Prediction *****
  Num examples = 2261
  Batch size = 16


2261

In [59]:
predict_df["prediction"] = [id2label[label] for label in predictions]
rel_df.loc[predict_df.index, "prediction"] = predict_df["prediction"]
rel_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,prediction
0,0006,5,cardiac cath,vt,test,problem,2,3,0,0,Other
1,0006,5,stent,vt,treatment,problem,5,5,0,0,Other
2,0006,5,amp,vt,treatment,problem,7,7,0,0,Other
3,0006,5,amio loading,vt,treatment,problem,9,10,0,0,Other
4,0006,87,house,low chol,problem,problem,0,0,2,3,Other
...,...,...,...,...,...,...,...,...,...,...,...
14330,0475,109,the serum albumin gradient,portal hypertension,test,problem,0,3,8,9,TeRP
14331,0475,112,her platelet count,low,test,problem,0,2,5,5,TeRP
14332,0475,112,cimetidine,low,treatment,problem,15,15,5,5,TrCP
14333,0475,120,beta blocker,a persistent wheeze,treatment,problem,9,10,5,7,TrNAP


In [60]:
rel_df["prediction"].value_counts()


Other    9776
PIP      1564
TeRP     1066
TrAP      648
TrCP      435
TrIP      403
TeCP      272
TrNAP     160
TrWP       11
Name: prediction, dtype: int64

you can now set another re_task

In [65]:
# for each file create <filename>.con
os.makedirs(val_data_path + os.sep + rel_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
for file in files:
    os.remove(file)

for i, row in tqdm(rel_df.iterrows()):
    filename = row["filename"]
    concept_text_x = row["concept_text_x"]
    concept_text_y = row["concept_text_y"]
    concept_type_x = row["concept_type_x"]
    concept_type_y = row["concept_type_y"]
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]
    line_number = row["start_line"]
    prediction = row["prediction"]
    if prediction != "Other":
        with open(val_data_path + os.sep + rel_folder_name + os.sep + filename + ".rel", "a") as f:
            # fill like this c="pefusion imaging" 19:6 19:7||r="TeRP"||c="perfusion defects" 19:12 19:13
            f.write(
                f"c=\"{concept_text_x}\" {line_number}:{start_word_number_x} {line_number}:{end_word_number_x}||r=\"{prediction}\"||c=\"{concept_text_y}\" {line_number}:{start_word_number_y} {line_number}:{end_word_number_y}\n"
            )
    


14335it [00:01, 13333.71it/s]


In [66]:
rel_files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
rel_files = [f.split(os.sep)[-1][:-4] for f in rel_files]
txt_files = [f.split(os.sep)[-1][:-4] for f in text_files]
# find missing files
missing_files = set(txt_files) - set(rel_files)
missing_files


{'0049', '0066', '0146', '0202', '0230', '0305', '0366', '0398'}

In [67]:
# create empty files for missing files
for f in missing_files:
    with open(val_data_path + os.sep + rel_folder_name + os.sep + f + ".rel", "w") as f:
        f.write("")

In [68]:
rel_files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
rel_files = [f.split(os.sep)[-1][:-4] for f in rel_files]
txt_files = [f.split(os.sep)[-1][:-4] for f in text_files]
# find missing files
missing_files = set(txt_files) - set(rel_files)
missing_files

set()

In [69]:
!cat data/val/rel/0006.rel

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
c="house" 87:0 87:0||r="PIP"||c="low sat" 87:5 87:6
c="fat" 87:7 87:7||r="PIP"||c="house" 87:0 87:0
c="fat" 87:7 87:7||r="PIP"||c="low chol" 87:2 87:3
c="monitor" 104:10 104:10||r="TeRP"||c="vt" 104:8 104:8
c="further icd shocks" 104:4 104:6||r="TrCP"||c="vt" 104:8 104:8
c="exam" 106:0 106:0||r="TeRP"||c="b / l carotid bruits" 106:3 106:7
c="exam" 106:0 106:0||r="TeRP"||c="soft murmur at l sternal border" 106:9 106:14
c="exam" 106:0 106:0||r="TeRP"||c="1 + b / l edema" 106:19 106:23
c="ecg" 117:0 117:0||r="TeRP"||c="prior" 117:5 117:5
c="ecg" 117:0 117:0||r="TeRP"||c="shower r" 117:6 117:7
c="ecg" 117:0 117:0||r="TeRP"||c="vp" 117:10 117:10
c="ecg" 117:0 117:0||r="TeRP"||c="monomorphic vt" 117:12 117:13
c="a

In [70]:
!zip -r scibert-val-rel-3-sep.zip data/val/rel/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: data/val/rel/ (stored 0%)
  adding: data/val/rel/0309.rel (deflated 77%)
  adding: data/val/rel/0085.rel (deflated 56%)
  adding: data/val/rel/0101.rel (deflated 76%)
  adding: data/val/rel/0382.rel (deflated 65%)
  adding: data/val/rel/0282.rel (deflated 56%)
  adding: data/val/rel/0194.rel (deflated 80%)
  adding: data/val/rel/0161.rel (deflated 78%)
  adding: data/val/rel/0169.rel (deflated 70%)
  adding: data/val/rel/0049.rel (stored 0%)
  adding: data/val/rel/0317.rel (deflated 65%)
  adding: data/val/rel/0294.rel (deflated 64%)
  adding: data/val/rel/0273.rel (deflated 70%)
  adding: data/val/rel/0369.rel (deflated 78%)
  adding: data/val/rel/0464.rel (deflated 74%)
  adding: data/val/rel/032