[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Relations classification

Based of: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_xnli.py

In [1]:
!nvidia-smi

Mon Jan 24 23:48:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0    34W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   71C    P0    31W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [17]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

ModuleNotFoundError: No module named 'google.colab'

In [1]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import random

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import datasets
import numpy as np
from datasets import load_dataset, load_metric , Dataset
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from transformers import pipeline

require_version("datasets>=1.8.0", "To fix: pip install --upgrade datasets")

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

/home/jupyter/medical_txt_parser/src/notebooks
/home/jupyter/medical_txt_parser/src
/home/jupyter/medical_txt_parser


In [2]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"
nli_data_path = "data/nli"
re_data_path = "data/re"

# model args
model_name_or_path = "allenai/scibert_scivocab_cased" # "gsarti/scibert-nli"
cache_dir = None
model_revision = None 
tokenizer_name = model_name_or_path
do_lower_case = True
use_fast_tokenizer = True
fp16 = True

# data args
pad_to_max_length = None
max_seq_length = None

set_seed(42)


### Import data

In [3]:
relations_df = pd.read_csv(re_data_path + os.sep + "re_data_scibert.tsv", sep="\t", header=None)
relations_df.columns = ["text", "label"]
label2id = {label: i for i, label in enumerate(set(relations_df.label))}
id2label = {i: label for label, i in label2id.items()}
relations_df["label"] = relations_df.label.map(label2id)
relations_df

Unnamed: 0,text,label
0,41 yo man with [[ CRFs ]] of << DM Type II >> ...,7
1,"Here , had T wave flattening laterally and inf...",6
2,"Here , had << T wave flattening laterally and ...",7
3,Had << Chest CT >> to r / o dissection ( due t...,0
4,Had << Chest CT >> to r / o [[ dissection ]] (...,0
...,...,...
3113,"She presented to her primary care physician , ...",6
3114,<< Percocet >> one to two whenever necessary q...,3
3115,<< Specimens >> sent to pathology included per...,0
3116,"There were no [[ complications ]] , and the pa...",4


In [4]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(relations_df, train_size=None, shuffle=True, test_size=0.2, stratify=relations_df["label"], random_state=42)

features = datasets.Features({'text': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(id2label), names=list(id2label.values()))})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 2494
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 624
 }))

In [5]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 6    794
3    707
7    603
4    147
0    133
2     50
1     41
5     19
Name: label, dtype: int64
val_df labels: 6    199
3    177
7    151
4     37
0     33
2     12
1     10
5      5
Name: label, dtype: int64


In [6]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="re",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=label2id,
    id2label=id2label
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [7]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["text"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Sample 456 of the training set: {'text': '#. SOB : Most consistent with [[ CHF ]] by << exam >> , although unknown what the severity of her COPD is currently .', 'label': 6, 'input_ids': [101, 3057, 211, 669, 30125, 864, 817, 2670, 188, 268, 268, 286, 30120, 1914, 1914, 224, 957, 957, 727, 1359, 1359, 430, 2712, 4715, 2174, 111, 6161, 125, 2426, 2930, 30116, 163, 4549, 211, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 102 of the training set: {'text': 'There was also << scattered T2 hyperintense foci within the periventricular and left frontal white matter >> consistent with small vessel disease , gliosis or [[ demyelination ]] .', 'label': 7, 'input_ids': [101, 619, 253, 498, 957, 957, 14984, 104, 30130, 2071, 2550, 2883, 18685, 1057, 111, 12538, 2062, 784, 463, 136, 2

In [8]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [9]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [10]:
model_name_or_path

'allenai/scibert_scivocab_cased'

In [11]:
# address class imbalance 
import torch
from torch import nn
from transformers import Trainer

class_weights = [len(train_df)/ (len(train_df[train_df["label"] == i])*len(id2label)) for i in id2label.keys()]

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [12]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-re-3"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [13]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2494
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 390


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,2.0239,1.987269,0.202685,0.259895,0.256465,0.360577
2,1.6918,1.722281,0.384011,0.448276,0.461442,0.528846
3,1.2384,1.402887,0.471159,0.470497,0.566093,0.634615
4,1.1928,1.125721,0.529039,0.518603,0.595734,0.761218
5,0.7755,0.951569,0.630514,0.611848,0.685026,0.80609
6,0.9967,0.840776,0.66762,0.630946,0.750389,0.81891
7,0.5697,0.784796,0.693828,0.662264,0.761788,0.838141
8,0.6988,0.728013,0.690877,0.647522,0.776871,0.836538
9,0.569,0.703789,0.701365,0.661283,0.778679,0.846154
10,0.3924,0.697619,0.715987,0.673304,0.801713,0.849359


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 624
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 624
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 624
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 624
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassifi

In [14]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert_scivocab_cased-re-3
Configuration saved in models/scibert_scivocab_cased-re-3/config.json
Model weights saved in models/scibert_scivocab_cased-re-3/pytorch_model.bin
tokenizer config file saved in models/scibert_scivocab_cased-re-3/tokenizer_config.json
Special tokens file saved in models/scibert_scivocab_cased-re-3/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  total_flos               =  1795207GF
  train_loss               =     1.0407
  train_runtime            = 0:06:33.79
  train_samples            =       2494
  train_samples_per_second =     63.333
  train_steps_per_second   =       0.99


In [15]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 624
  Batch size = 64


*** Evaluate ***


***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.8494
  eval_f1                 =      0.716
  eval_loss               =     0.6976
  eval_precision          =     0.6733
  eval_recall             =     0.8017
  eval_runtime            = 0:00:03.28
  eval_samples            =        624
  eval_samples_per_second =    190.065
  eval_steps_per_second   =      3.046


In [16]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 624
  Batch size = 64


              precision    recall  f1-score   support

        TeCP       0.52      0.67      0.59        33
        TrIP       0.35      0.80      0.48        10
       TrNAP       0.47      0.58      0.52        12
        TrAP       0.93      0.82      0.87       177
        TrCP       0.71      0.73      0.72        37
        TrWP       0.56      1.00      0.71         5
        TeRP       0.93      0.87      0.90       199
         PIP       0.92      0.94      0.93       151

    accuracy                           0.85       624
   macro avg       0.67      0.80      0.72       624
weighted avg       0.87      0.85      0.86       624



In [17]:
predictions, labels, _ = trainer.predict(train_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2494
  Batch size = 64


              precision    recall  f1-score   support

        TeCP       0.61      0.91      0.73       133
        TrIP       0.62      0.98      0.76        41
       TrNAP       0.76      1.00      0.86        50
        TrAP       0.98      0.89      0.93       707
        TrCP       0.83      0.90      0.86       147
        TrWP       0.66      1.00      0.79        19
        TeRP       0.98      0.90      0.94       794
         PIP       0.97      0.98      0.97       603

    accuracy                           0.92      2494
   macro avg       0.80      0.94      0.86      2494
weighted avg       0.94      0.92      0.92      2494



## Evaluate the model

In [151]:
# Local model
label_list = ['TeCP', 'TrIP', 'TrNAP', 'TrAP', 'TrCP', 'TrWP', 'TeRP', 'PIP']

id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

# model_checkpoint = f"models/{model_folder_name}"
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, label2id=label2id, id2label=id2label)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer)


In [27]:
concept_df = pd.DataFrame(concept_dict).drop(columns=["end_line"])
test_concept_df = concept_df[concept_df["concept_type"] == "test"]
problem_concept_df = concept_df[concept_df["concept_type"] == "problem"]
treatment_concept_df = concept_df[concept_df["concept_type"] == "treatment"]

# class test --> problem
test_problem_df = pd.merge(test_concept_df, problem_concept_df, how="inner", on="start_line")
test_problem_df

Unnamed: 0,concept_text_x,start_line,start_word_number_x,end_word_number_x,concept_type_x,concept_text_y,start_word_number_y,end_word_number_y,concept_type_y
0,cardiac cath,5,2,3,test,vt,0,0,problem
1,cath,94,16,16,test,vt,11,11,problem
2,monitor,104,10,10,test,vt,8,8,problem
3,exam,106,0,0,test,b / l carotid bruits,3,7,problem
4,exam,106,0,0,test,soft murmur at l sternal border,9,14,problem
...,...,...,...,...,...,...,...,...,...
61,adenosine stress mibi,132,0,2,test,a 70 % lesion in the svg to pda,22,29,problem
62,occasional ventricular extopy,133,2,4,test,ventricular tachycardia,11,12,problem
63,telemetry,133,5,5,test,ventricular tachycardia,11,12,problem
64,his am fasting glucose levels,134,4,8,test,intermittently elevated,10,11,problem


In [28]:
# class treatment --> problem
treatment_problem_df = pd.merge(treatment_concept_df, problem_concept_df, how="inner", on="start_line")
treatment_problem_df

Unnamed: 0,concept_text_x,start_line,start_word_number_x,end_word_number_x,concept_type_x,concept_text_y,start_word_number_y,end_word_number_y,concept_type_y
0,stent,5,5,5,treatment,vt,0,0,problem
1,amp,5,7,7,treatment,vt,0,0,problem
2,amio loading,5,9,10,treatment,vt,0,0,problem
3,cardiac,94,15,15,treatment,vt,11,11,problem
4,stent,94,18,18,treatment,vt,11,11,problem
...,...,...,...,...,...,...,...,...,...
104,coronary catheterisation,132,16,17,treatment,a small reversible defect in the pda territory,4,11,problem
105,coronary catheterisation,132,16,17,treatment,a 70 % lesion in the svg to pda,22,29,problem
106,cypher,132,31,31,treatment,a small reversible defect in the pda territory,4,11,problem
107,cypher,132,31,31,treatment,a 70 % lesion in the svg to pda,22,29,problem


In [29]:
# class problem --> problem
problem_problem_df = pd.merge(problem_concept_df, problem_concept_df, how="inner", on="start_line")
problem_problem_df = problem_problem_df[problem_problem_df["concept_text_x"] != problem_problem_df["concept_text_y"]]
problem_problem_df

Unnamed: 0,concept_text_x,start_line,start_word_number_x,end_word_number_x,concept_type_x,concept_text_y,start_word_number_y,end_word_number_y,concept_type_y
10,house,87,0,0,problem,low chol,2,3,problem
11,house,87,0,0,problem,low sat,5,6,problem
12,house,87,0,0,problem,fat,7,7,problem
13,low chol,87,2,3,problem,house,0,0,problem
15,low chol,87,2,3,problem,low sat,5,6,problem
...,...,...,...,...,...,...,...,...,...
113,vt,119,73,73,problem,actively ischemic,46,47,problem
114,vt,119,73,73,problem,signs,66,66,problem
115,vt,119,73,73,problem,ischemia,68,68,problem
121,a small reversible defect in the pda territory,132,4,11,problem,a 70 % lesion in the svg to pda,22,29,problem


In [30]:
    rel_df = pd.concat([test_problem_df, treatment_problem_df, problem_problem_df], axis=0)
rel_df

Unnamed: 0,concept_text_x,start_line,start_word_number_x,end_word_number_x,concept_type_x,concept_text_y,start_word_number_y,end_word_number_y,concept_type_y
0,cardiac cath,5,2,3,test,vt,0,0,problem
1,cath,94,16,16,test,vt,11,11,problem
2,monitor,104,10,10,test,vt,8,8,problem
3,exam,106,0,0,test,b / l carotid bruits,3,7,problem
4,exam,106,0,0,test,soft murmur at l sternal border,9,14,problem
...,...,...,...,...,...,...,...,...,...
113,vt,119,73,73,problem,actively ischemic,46,47,problem
114,vt,119,73,73,problem,signs,66,66,problem
115,vt,119,73,73,problem,ischemia,68,68,problem
121,a small reversible defect in the pda territory,132,4,11,problem,a 70 % lesion in the svg to pda,22,29,problem


In [34]:
text_files = glob.glob(val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
# df = df[df.text != ""]
# # reset index
# df = df.reset_index(drop=True)

# add concepts
rel_df = pd.DataFrame()
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(val_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")
    
    concept_df = pd.DataFrame(concept_dict).drop(columns=["end_line"])
    test_concept_df = concept_df[concept_df["concept_type"] == "test"]
    problem_concept_df = concept_df[concept_df["concept_type"] == "problem"]
    treatment_concept_df = concept_df[concept_df["concept_type"] == "treatment"]

    # class test --> problem
    test_problem_df = pd.merge(test_concept_df, problem_concept_df, how="inner", on="start_line")

    # class treatment --> problem
    treatment_problem_df = pd.merge(treatment_concept_df, problem_concept_df, how="inner", on="start_line")

    # class problem --> problem
    problem_problem_df = pd.merge(problem_concept_df, problem_concept_df, how="inner", on="start_line")
    problem_problem_df = problem_problem_df[problem_problem_df["concept_text_x"] != problem_problem_df["concept_text_y"]] # TODO: remove duplicates ?

    tmp = pd.concat([test_problem_df, treatment_problem_df, problem_problem_df], axis=0)
    tmp["filename"] = fname
    rel_df = rel_df.append(tmp, ignore_index=True)
            
rel_df = rel_df.sort_values(by=["filename", "start_line"])
rel_df = rel_df.reset_index(drop=True)
# # concept_df = concept_df.dropna(subset=["concept_text"])
# df = concept_df[["filename", "line_number", "text", "concept_text"]]
# df.rename(columns={"text":"sentence1", "concept_text":"sentence2"}, inplace=True)
# df

100%|██████████| 128/128 [00:00<00:00, 1048.61it/s]
100%|██████████| 128/128 [00:01<00:00, 88.09it/s]


In [35]:
rel_df

Unnamed: 0,concept_text_x,start_line,start_word_number_x,end_word_number_x,concept_type_x,concept_text_y,start_word_number_y,end_word_number_y,concept_type_y,filename
0,cardiac cath,5,2,3,test,vt,0,0,problem,0006
1,stent,5,5,5,treatment,vt,0,0,problem,0006
2,amp,5,7,7,treatment,vt,0,0,problem,0006
3,amio loading,5,9,10,treatment,vt,0,0,problem,0006
4,house,87,0,0,problem,low chol,2,3,problem,0006
...,...,...,...,...,...,...,...,...,...,...
14330,the serum albumin gradient,109,0,3,test,portal hypertension,8,9,problem,0475
14331,her platelet count,112,0,2,test,low,5,5,problem,0475
14332,cimetidine,112,15,15,treatment,low,5,5,problem,0475
14333,beta blocker,120,9,10,treatment,a persistent wheeze,5,7,problem,0475


In [37]:
rel_df = rel_df[[ "filename", "start_line", "concept_text_x", "concept_text_y", "concept_type_x", "concept_type_y", "start_word_number_x", "end_word_number_x", "start_word_number_y", "end_word_number_y"]]
rel_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y
0,0006,5,cardiac cath,vt,test,problem,2,3,0,0
1,0006,5,stent,vt,treatment,problem,5,5,0,0
2,0006,5,amp,vt,treatment,problem,7,7,0,0
3,0006,5,amio loading,vt,treatment,problem,9,10,0,0
4,0006,87,house,low chol,problem,problem,0,0,2,3
...,...,...,...,...,...,...,...,...,...,...
14330,0475,109,the serum albumin gradient,portal hypertension,test,problem,0,3,8,9
14331,0475,112,her platelet count,low,test,problem,0,2,5,5
14332,0475,112,cimetidine,low,treatment,problem,15,15,5,5
14333,0475,120,beta blocker,a persistent wheeze,treatment,problem,9,10,5,7


In [49]:
# make predict dataset
def preprocess_text(row):
    # find line
    line =  df[(df["filename"] == row["filename"]) & (df["line_number"] == row["start_line"]-1)]["text"].values[0]
    # line = line.lower()
    line = " ".join(line.split()) # remove multiple spaces

    concept_text_x = "<< "+ " ".join(line.split()[row["start_word_number_x"]:row["end_word_number_x"]+1]) + " >>"
    concept_text_y = "[[ " + " ".join(line.split()[row["start_word_number_y"]:row["end_word_number_y"]+1]) + " ]]"
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]

    if row["start_word_number_x"] > row["start_word_number_y"]:
        concept_text_x, concept_text_y = concept_text_y, concept_text_x
        start_word_number_x, start_word_number_y = start_word_number_y, start_word_number_x
        end_word_number_x, end_word_number_y = end_word_number_y, end_word_number_x
    text = " ".join(line.split()[: start_word_number_x] + [concept_text_x] + line.split()[end_word_number_x+1: start_word_number_y] + [concept_text_y] + line.split()[end_word_number_y+1:])

    row["text"] = text
    return row

predict_df = rel_df.apply(preprocess_text, axis=1)
predict_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,text
0,0006,5,cardiac cath,vt,test,problem,2,3,0,0,"[[ VT ]] s/p << cardiac cath >> , stent and am..."
1,0006,5,stent,vt,treatment,problem,5,5,0,0,"[[ VT ]] s/p cardiac cath , << stent >> and am..."
2,0006,5,amp,vt,treatment,problem,7,7,0,0,"[[ VT ]] s/p cardiac cath , stent and << amp >..."
3,0006,5,amio loading,vt,treatment,problem,9,10,0,0,"[[ VT ]] s/p cardiac cath , stent and amp ; <<..."
4,0006,87,house,low chol,problem,problem,0,0,2,3,<< House >> / [[ Low chol ]] / low sat. fat
...,...,...,...,...,...,...,...,...,...,...,...
14330,0475,109,the serum albumin gradient,portal hypertension,test,problem,0,3,8,9,<< The serum albumin gradient >> was 1.8 consi...
14331,0475,112,her platelet count,low,test,problem,0,2,5,5,<< Her platelet count >> stayed persistently [...
14332,0475,112,cimetidine,low,treatment,problem,15,15,5,5,Her platelet count stayed persistently [[ low ...
14333,0475,120,beta blocker,a persistent wheeze,treatment,problem,9,10,5,7,The patient however complained of [[ a persist...


In [50]:
predict_dataset = Dataset.from_pandas(predict_df, preserve_index=False)
# predict_dataset = predict_dataset.select(range(10))
predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on prediction dataset",
            )
predict_dataset

Running tokenizer on prediction dataset:   0%|          | 0/15 [00:00<?, ?ba/s]

Dataset({
    features: ['filename', 'start_line', 'concept_text_x', 'concept_text_y', 'concept_type_x', 'concept_type_y', 'start_word_number_x', 'end_word_number_x', 'start_word_number_y', 'end_word_number_y', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 14335
})

In [51]:
 predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
 predictions = np.argmax(predictions, axis=1)
 len(predictions)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: start_word_number_y, concept_type_y, concept_text_y, end_word_number_x, filename, text, start_line, concept_type_x, concept_text_x, end_word_number_y, start_word_number_x.
***** Running Prediction *****
  Num examples = 14335
  Batch size = 64


14335

In [52]:
rel_df["prediction"] = [id2label[label] for label in predictions]
rel_df

Unnamed: 0,filename,start_line,concept_text_x,concept_text_y,concept_type_x,concept_type_y,start_word_number_x,end_word_number_x,start_word_number_y,end_word_number_y,prediction
0,0006,5,cardiac cath,vt,test,problem,2,3,0,0,TrAP
1,0006,5,stent,vt,treatment,problem,5,5,0,0,TrAP
2,0006,5,amp,vt,treatment,problem,7,7,0,0,TrAP
3,0006,5,amio loading,vt,treatment,problem,9,10,0,0,TrAP
4,0006,87,house,low chol,problem,problem,0,0,2,3,TrAP
...,...,...,...,...,...,...,...,...,...,...,...
14330,0475,109,the serum albumin gradient,portal hypertension,test,problem,0,3,8,9,TeRP
14331,0475,112,her platelet count,low,test,problem,0,2,5,5,TeRP
14332,0475,112,cimetidine,low,treatment,problem,15,15,5,5,TrCP
14333,0475,120,beta blocker,a persistent wheeze,treatment,problem,9,10,5,7,TrAP


In [70]:
# for each file create <filename>.con
os.makedirs(val_data_path + os.sep + rel_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(val_data_path + os.sep + rel_folder_name + os.sep + "*.rel")
for file in files:
    os.remove(file)

for i, row in tqdm(rel_df.iterrows()):
    filename = row["filename"]
    concept_text_x = row["concept_text_x"]
    concept_text_y = row["concept_text_y"]
    concept_type_x = row["concept_type_x"]
    concept_type_y = row["concept_type_y"]
    start_word_number_x = row["start_word_number_x"]
    end_word_number_x = row["end_word_number_x"]
    start_word_number_y = row["start_word_number_y"]
    end_word_number_y = row["end_word_number_y"]
    line_number = row["start_line"]
    prediction = row["prediction"]
    with open(val_data_path + os.sep + rel_folder_name + os.sep + filename + ".rel", "a") as f:
        # fill like this c="pefusion imaging" 19:6 19:7||r="TeRP"||c="perfusion defects" 19:12 19:13
        f.write(
            f"c=\"{concept_text_x}\" {line_number}:{start_word_number_x} {line_number}:{end_word_number_x}||r=\"{prediction}\"||c=\"{concept_text_y}\" {line_number}:{start_word_number_y} {line_number}:{end_word_number_y}\n"
        )
    


14335it [00:01, 10319.49it/s]


In [71]:
!cat data/val/rel/0006.rel

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
c="cardiac cath" 5:2 5:3||r="TrAP"||c="vt" 5:0 5:0
c="stent" 5:5 5:5||r="TrAP"||c="vt" 5:0 5:0
c="amp" 5:7 5:7||r="TrAP"||c="vt" 5:0 5:0
c="amio loading" 5:9 5:10||r="TrAP"||c="vt" 5:0 5:0
c="house" 87:0 87:0||r="TrAP"||c="low chol" 87:2 87:3
c="house" 87:0 87:0||r="TeCP"||c="low sat" 87:5 87:6
c="house" 87:0 87:0||r="TrAP"||c="fat" 87:7 87:7
c="low chol" 87:2 87:3||r="TrAP"||c="house" 87:0 87:0
c="low chol" 87:2 87:3||r="PIP"||c="low sat" 87:5 87:6
c="low chol" 87:2 87:3||r="PIP"||c="fat" 87:7 87:7
c="low sat" 87:5 87:6||r="TrAP"||c="house" 87:0 87:0
c="low sat" 87:5 87:6||r="TrAP"||c="low chol" 87:2 87:3
c="low sat" 87:5 87:6||r="PIP"||c="fat" 87:7 87:7
c="fat" 87:7 87:7||r="TrAP"||c="house" 87:0 87:0
c="f

In [72]:
!zip -r scibert-val-rel.zip data/val/rel/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: data/val/rel/ (stored 0%)
  adding: data/val/rel/0309.rel (deflated 82%)
  adding: data/val/rel/0085.rel (deflated 68%)
  adding: data/val/rel/0101.rel (deflated 88%)
  adding: data/val/rel/0382.rel (deflated 65%)
  adding: data/val/rel/0282.rel (deflated 73%)
  adding: data/val/rel/0194.rel (deflated 87%)
  adding: data/val/rel/0161.rel (deflated 86%)
  adding: data/val/rel/0169.rel (deflated 81%)
  adding: data/val/rel/0049.rel (deflated 51%)
  adding: data/val/rel/0317.rel (deflated 74%)
  adding: data/val/rel/0294.rel (deflated 86%)
  adding: data/val/rel/0273.rel (deflated 84%)
  adding: data/val/rel/0369.rel (deflated 86%)
  adding: data/val/rel/0464.rel (deflated 86%)
  adding: data/val/rel/