[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Assertions classification

Based of: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_xnli.py

In [66]:
!nvidia-smi

Mon Jan 24 19:34:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [67]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [69]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/medical_txt_parser

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/medical_txt_parser


In [70]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import random

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import datasets
import numpy as np
from datasets import load_dataset, load_metric , Dataset
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from transformers import pipeline

require_version("datasets>=1.8.0", "To fix: pip install --upgrade datasets")

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [71]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"
nli_data_path = "data/nli"
# os.makedirs(nli_data_path, exist_ok=True)

# model args
model_name_or_path = "gsarti/scibert-nli"
cache_dir = None
model_revision = None 
tokenizer_name = model_name_or_path
do_lower_case = True
use_fast_tokenizer = True
fp16 = True

# data args
pad_to_max_length = None
max_seq_length = None

set_seed(42)


### Import data

In [72]:
text_files = glob.glob(train_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        filename = file.split("/")[-1].split(".")[0]
        ast = parse_ast(train_data_path + os.sep + ast_folder_name + os.sep +  filename + ".ast")
        
        df = df.append(pd.DataFrame({"text": [text], "filename": [filename], "ast":[ast]}), ignore_index=True)
df.head()

100%|██████████| 170/170 [00:19<00:00,  8.63it/s]


Unnamed: 0,text,filename,ast
0,Admission Date :\n2018-03-04\nDischarge Date :...,record-108,"{'concept_text': ['chills', 'mitochondrial myo..."
1,Admission Date:\n2011-02-08\nDischarge Date :\...,record-17,{'concept_text': ['community-acquired pneumoni...
2,Admission Date :\n2016-12-30\nDischarge Date :...,record-26,{'concept_text': ['gastroesophageal reflux dis...
3,Admission Date :\n2012-01-20\nDischarge Date :...,record-53,"{'concept_text': ['his pain', 'elevated psa', ..."
4,Admission Date :\n2013-05-18\nDischarge Date :...,record-37,"{'concept_text': ['mmp', 'a hip and arm fractu..."


In [73]:
assertion_df = pd.DataFrame(columns=[ "filename"]+list(ast.keys()))
for i, file in df.iterrows():
    assertion_dict = file["ast"]
    tmp = pd.DataFrame(assertion_dict)
    tmp["filename"] = file["filename"]
    assertion_df = assertion_df.append(tmp, ignore_index=True)
assertion_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type,assertion_type
0,record-108,chills,96,30,96,30,problem,hypothetical
1,record-108,mitochondrial myopathy,27,1,27,2,problem,present
2,record-108,chronic prostatitis,20,10,20,11,problem,present
3,record-108,supraventricular tachycardia,26,1,26,2,problem,present
4,record-108,fevers,96,24,96,24,problem,hypothetical


### Dataset Preprocessing

In [74]:
def preprocess_text(row):
    text = row["text"]
    # find line
    text = text.lower()
    text = text.split("\n")[row["start_line"]-1]
    row["text"] = text
    return row
ast2label = {ast:i for i,ast in enumerate(assertion_df["assertion_type"].unique())}
label2ast = {i:ast for ast,i in ast2label.items()}

assertion_df = assertion_df.merge(df[["filename","text"]], on="filename", how="inner")
assertion_df = assertion_df.apply(preprocess_text, axis=1)
assertion_df = assertion_df.drop(columns=["filename", "start_word_number", "end_line", "end_word_number", "concept_type", "start_line"])
assertion_df["label"] = assertion_df["assertion_type"].apply(lambda x: ast2label[x])
assertion_df = assertion_df.rename(columns={"text": "sentence1", "concept_text": "sentence2"})
assertion_df = assertion_df[["sentence1", "sentence2", "label", "assertion_type"]]
assertion_df


Unnamed: 0,sentence1,sentence2,label,assertion_type
0,go to an emergency room if you experience symp...,chills,0,hypothetical
1,3. mitochondrial myopathy .,mitochondrial myopathy,1,present
2,justin searle a 60-year-old man with a long hi...,chronic prostatitis,1,present
3,2. supraventricular tachycardia ( on a beta bl...,supraventricular tachycardia,1,present
4,go to an emergency room if you experience symp...,fevers,0,hypothetical
...,...,...,...,...
7068,abdominal pain treatment rendered :,abdominal pain,1,present
7069,"if you have continued fevers , worsening abdom...",discharge,0,hypothetical
7070,the patient was given printed instructions for...,abdominal pain,1,present
7071,abd pain/ pelvic pain,abd pain/,1,present


In [84]:
# Build HuggingFace Dataset

train_df, val_df = train_test_split(assertion_df, train_size=None, shuffle=True, test_size=0.2, stratify=assertion_df["label"], random_state=42)

features = datasets.Features({'sentence1': datasets.Value(dtype='string'),
 'sentence2': datasets.Value(dtype='string'),
 'label': datasets.ClassLabel(num_classes=len(label2ast), names=list(label2ast.values())),
 'assertion_type': datasets.Value(dtype='string')})

train_dataset = Dataset.from_pandas(train_df, preserve_index=False, features=features)
eval_dataset = Dataset.from_pandas(val_df, preserve_index=False, features=features)

label_list = train_dataset.features["label"].names
num_labels = len(label_list)

label_list = train_dataset.features["label"].names
train_dataset, eval_dataset

(Dataset({
     features: ['sentence1', 'sentence2', 'label', 'assertion_type'],
     num_rows: 5658
 }), Dataset({
     features: ['sentence1', 'sentence2', 'label', 'assertion_type'],
     num_rows: 1415
 }))

In [85]:
# check labels balance
print(f"train_df labels: {train_df['label'].value_counts()}")
print(f"val_df labels: {val_df['label'].value_counts()}")

train_df labels: 1    3699
3    1277
0     306
5     247
4      71
2      58
Name: label, dtype: int64
val_df labels: 1    925
3    319
0     76
5     62
4     18
2     15
Name: label, dtype: int64


In [86]:
# Load pretrained model and tokenizer
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    finetuning_task="xnli",
    cache_dir=cache_dir,
    revision=model_revision,
    label2id=ast2label,
    id2label=label2ast
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=use_fast_tokenizer,
    revision=model_revision,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
    cache_dir=cache_dir,
    revision=model_revision,
)

loading configuration file https://huggingface.co/gsarti/scibert-nli/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/d1a718a92956f71237b03954f057420eda133c6f2f5980416169abae733cd0c5.3e1114112ea2a7e54342079530b1a728c8757051c6ab0d54ff742f9f92c549a4
Model config BertConfig {
  "_name_or_path": "gsarti/scibert-nli",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "xnli",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "hypothetical",
    "1": "present",
    "2": "conditional",
    "3": "absent",
    "4": "associated_with_someone_else",
    "5": "possible"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "absent": 3,
    "associated_with_someone_else": 4,
    "conditional": 2,
    "hypothetical": 0,
    "possible": 5,
    "present": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings":

In [87]:
# Preprocessing the datasets
# Padding strategy
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False


def preprocess_function(examples):
    # Tokenize the texts
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )


train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.\n")


Running tokenizer on train dataset:   0%|          | 0/6 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running tokenizer on validation dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Sample 5238 of the training set: {'sentence1': '* leukocytosis- she developed mild elevated wbc .', 'sentence2': 'mild elevated wbc', 'label': 1, 'assertion_type': 'present', 'input_ids': [101, 1360, 5939, 11000, 578, 2877, 1826, 6902, 5630, 127, 12479, 211, 102, 6902, 5630, 127, 12479, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Sample 912 of the training set: {'sentence1': 'caval narrowing is seen , most likely correlation to clinical history is recommended and further evaluation of the ct or short-term ultrasounds for follow-up was recommended .', 'sentence2': 'caval narrowing', 'label': 1, 'assertion_type': 'present', 'input_ids': [101, 15396, 118, 27857, 163, 2199, 430, 817, 2002, 2481, 146, 1551, 3290, 163, 6375, 136, 1419, 2528, 125, 111, 7314, 251, 2009, 578, 953, 9265, 30111, 173, 609, 578, 813, 253, 6375, 211, 102, 15396, 118, 27857, 102], 'token_type_ids': [0, 0, 0

In [88]:
# Get the metric function
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
accuracy_metric = load_metric("accuracy")

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)

    metrics = {}
    metrics.update(f1_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(precision_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(recall_metric.compute(predictions=preds, references=p.label_ids, average="macro"))
    metrics.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    return metrics

In [89]:
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
if pad_to_max_length:
    data_collator = default_data_collator
elif fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [90]:
model_name_or_path

'gsarti/scibert-nli'

In [93]:
# Initialize our Trainer
model_folder_name = f"{model_name_or_path.split('/')[-1]}-ast-clf-1"
args = TrainingArguments(
    f"training_logs/{model_folder_name}",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.05,
    logging_steps=1,
    warmup_ratio=0.1,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset ,
    eval_dataset=eval_dataset ,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [94]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running training *****
  Num examples = 5658
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3540


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.6531,0.402695,0.44632,0.428518,0.465853,0.879859
2,0.4639,0.226583,0.746434,0.76403,0.73472,0.938516
3,0.1021,0.243494,0.77604,0.829466,0.778331,0.94629
4,0.0029,0.266176,0.819718,0.844744,0.806945,0.94629
5,0.0017,0.272562,0.829495,0.877423,0.800158,0.946996
6,0.0036,0.268577,0.854552,0.886904,0.834589,0.951237
7,0.0016,0.296083,0.866854,0.906276,0.837046,0.954064
8,0.0011,0.302734,0.862419,0.874072,0.852092,0.95053
9,0.0007,0.304969,0.850962,0.863605,0.840801,0.949117
10,0.0005,0.310724,0.865146,0.883138,0.849242,0.95053


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running Evaluation *****
  Num examples = 1415
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running Evaluation *****
  Num examples = 1415
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running Evaluation *****
  Num examples = 1415
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running Evaluation *****
  Num examples =

In [95]:
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_model(f"models/{model_folder_name}")  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

Saving model checkpoint to models/scibert-nli-ast-clf-1
Configuration saved in models/scibert-nli-ast-clf-1/config.json
Model weights saved in models/scibert-nli-ast-clf-1/pytorch_model.bin
tokenizer config file saved in models/scibert-nli-ast-clf-1/tokenizer_config.json
Special tokens file saved in models/scibert-nli-ast-clf-1/special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  total_flos               =  2102713GF
  train_loss               =     0.1433
  train_runtime            = 0:16:13.02
  train_samples            =       5658
  train_samples_per_second =     58.148
  train_steps_per_second   =      3.638


In [96]:
print("*** Evaluate ***") 
metrics = trainer.evaluate(eval_dataset=eval_dataset)

metrics["eval_samples"] = len(eval_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running Evaluation *****
  Num examples = 1415
  Batch size = 16


*** Evaluate ***


***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.9505
  eval_f1                 =     0.8651
  eval_loss               =     0.3107
  eval_precision          =     0.8831
  eval_recall             =     0.8492
  eval_runtime            = 0:00:07.62
  eval_samples            =       1415
  eval_samples_per_second =    185.689
  eval_steps_per_second   =     11.679


In [97]:
predictions, labels, _ = trainer.predict(eval_dataset, metric_key_prefix="predict")
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions,target_names=label_list))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, assertion_type, sentence2.
***** Running Prediction *****
  Num examples = 1415
  Batch size = 16


                              precision    recall  f1-score   support

                hypothetical       0.93      0.93      0.93        76
                     present       0.96      0.97      0.97       925
                 conditional       0.62      0.53      0.57        15
                      absent       0.95      0.95      0.95       319
associated_with_someone_else       1.00      0.89      0.94        18
                    possible       0.84      0.82      0.83        62

                    accuracy                           0.95      1415
                   macro avg       0.88      0.85      0.87      1415
                weighted avg       0.95      0.95      0.95      1415



## Evaluate the model

In [None]:
# Local model
label_list = ['present',
 'possible',
 'absent',
 'conditional',
 'hypothetical',
 'associated_with_someone_else']

id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

# model_checkpoint = f"models/{model_folder_name}"
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, label2id=label2id, id2label=id2label)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# effect_ner_model = pipeline(task="ner", model=model, tokenizer=tokenizer)


In [None]:
text_files = glob.glob(val_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        # split lines
        lines = text.split('\n')
        filename =[ file.split("/")[-1].split(".")[0]] * len(lines)
        df = df.append(pd.DataFrame({"text": lines, "filename": filename, "line_number": range(len(lines))}), ignore_index=True)

df = df.sort_values(by=["filename", "line_number"])
# remove empty text lines
# df = df[df.text != ""]
# # reset index
# df = df.reset_index(drop=True)

# add concepts
concept_df = []
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(val_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")
    for concept_text, start_line, start_word_number, end_line, end_word_number, concept_type in zip(*list(concept_dict.values())):
        if concept_type == "problem":
            text = df[(df["filename"] == fname) & (df["line_number"] == start_line-1)].text.values[0]
            concept_df.append({"filename": fname, "concept_text": concept_text, "text": text, "line_number":start_line})
            
concept_df = pd.DataFrame(concept_df)
# concept_df = concept_df.dropna(subset=["concept_text"])
df = concept_df[["filename", "line_number", "text", "concept_text"]]
df.rename(columns={"text":"sentence1", "concept_text":"sentence2"}, inplace=True)
df

100%|██████████| 128/128 [00:00<00:00, 887.35it/s]
100%|██████████| 128/128 [00:08<00:00, 14.29it/s]


Unnamed: 0,filename,line_number,sentence1,sentence2
0,0006,2,NVH,nvh
1,0006,5,"VT s/p cardiac cath , stent and amp ; amio loa...",vt
2,0006,33,SERIOUS INTERACTION :,serious interaction
3,0006,39,SERIOUS INTERACTION :,serious interaction
4,0006,45,SERIOUS INTERACTION :,serious interaction
...,...,...,...,...
6548,0475,105,She did very well over the course of her admis...,further bleeding
6549,0475,109,The serum albumin gradient was 1.8 consistent ...,portal hypertension
6550,0475,111,She therefore _________ criteria for a spontan...,a spontaneous bacterial peritonitis
6551,0475,112,Her platelet count stayed persistently low and...,low


In [None]:
predict_dataset = Dataset.from_pandas(df, preserve_index=False)
# predict_dataset = predict_dataset.select(range(10))
predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on prediction dataset",
            )
predict_dataset

Running tokenizer on prediction dataset:   0%|          | 0/7 [00:00<?, ?ba/s]

Dataset({
    features: ['filename', 'line_number', 'sentence1', 'sentence2', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6553
})

In [None]:
 predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
 predictions = np.argmax(predictions, axis=1)
 len(predictions)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: filename, line_number, sentence1, sentence2.
***** Running Prediction *****
  Num examples = 6553
  Batch size = 2


6553

In [None]:
df["prediction"] = [label2ast[label] for label in predictions]
df

Unnamed: 0,filename,line_number,sentence1,sentence2,prediction
0,0006,2,NVH,nvh,present
1,0006,5,"VT s/p cardiac cath , stent and amp ; amio loa...",vt,present
2,0006,33,SERIOUS INTERACTION :,serious interaction,present
3,0006,39,SERIOUS INTERACTION :,serious interaction,present
4,0006,45,SERIOUS INTERACTION :,serious interaction,present
...,...,...,...,...,...
6548,0475,105,She did very well over the course of her admis...,further bleeding,absent
6549,0475,109,The serum albumin gradient was 1.8 consistent ...,portal hypertension,present
6550,0475,111,She therefore _________ criteria for a spontan...,a spontaneous bacterial peritonitis,present
6551,0475,112,Her platelet count stayed persistently low and...,low,present


In [None]:
df[df["filename"]=="0166"]

Unnamed: 0,filename,line_number,sentence1,sentence2,prediction
2161,166,30,The patient was given printed instructions for...,back pain,present
2162,166,32,back pain,back pain,present
2163,166,36,ibuoprofen 800 every 6-8 hrs as needed skelaxi...,severe pain,hypothetical
2164,166,40,"take medication as directed , return for sever...",severe or significantly worsening symptoms,hypothetical
2165,166,40,"take medication as directed , return for sever...",weakness,hypothetical
2166,166,40,"take medication as directed , return for sever...",loss of sensation work restrictions,hypothetical


In [None]:
import torch 
# empty cuda memory
torch.cuda.empty_cache()

In [None]:
# count predictions
df["prediction"].value_counts()

present                         4580
absent                          1414
possible                         262
hypothetical                     194
associated_with_someone_else      85
conditional                       18
Name: prediction, dtype: int64

In [None]:
final_df = df.copy()

In [None]:
# add concepts
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(val_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")
    for concept_text, start_line, start_word_number, end_line, end_word_number, concept_type in zip(*list(concept_dict.values())):
        if concept_type == "problem":
            df.loc[(df["line_number"] == start_line) & (df["filename"] == fname), "start_word_number"] = int(start_word_number)
            df.loc[(df["line_number"] == start_line) & (df["filename"] == fname), "end_word_number"] = int(end_word_number)
            df.loc[(df["line_number"] == start_line) & (df["filename"] == fname), "concept_type"] = concept_type
# change start_word_number to int type
df["start_word_number"] = df["start_word_number"].astype(int)
df["end_word_number"] = df["end_word_number"].astype(int)
df

100%|██████████| 127/127 [00:20<00:00,  6.32it/s]


Unnamed: 0,filename,line_number,sentence1,sentence2,prediction,start_word_number,end_word_number,concept_type
0,0006,2,NVH,nvh,present,0,0,problem
1,0006,5,"VT s/p cardiac cath , stent and amp ; amio loa...",vt,present,0,0,problem
2,0006,33,SERIOUS INTERACTION :,serious interaction,present,0,1,problem
3,0006,39,SERIOUS INTERACTION :,serious interaction,present,0,1,problem
4,0006,45,SERIOUS INTERACTION :,serious interaction,present,0,1,problem
...,...,...,...,...,...,...,...,...
6548,0475,105,She did very well over the course of her admis...,further bleeding,absent,14,15,problem
6549,0475,109,The serum albumin gradient was 1.8 consistent ...,portal hypertension,present,8,9,problem
6550,0475,111,She therefore _________ criteria for a spontan...,a spontaneous bacterial peritonitis,present,5,8,problem
6551,0475,112,Her platelet count stayed persistently low and...,low,present,5,5,problem


In [None]:
# for each file create <filename>.con
os.makedirs(val_data_path + os.sep + ast_folder_name, exist_ok=True)
# empty folder if exists
files = glob.glob(val_data_path + os.sep + ast_folder_name + os.sep + "*.con")
for file in files:
    os.remove(file)

for i, row in tqdm(df.iterrows()):
    filename = row["filename"]
    concept = row["sentence2"]
    line_num = row["line_number"]
    start_word_idx = row["start_word_number"]
    end_word_idx = row["end_word_number"]
    concept_type = row["concept_type"]
    pred_assertion = row["prediction"]
    with open(val_data_path + os.sep + ast_folder_name + os.sep + filename + ".ast", "a") as f:
        # fill like this c="bleeding risk" 126:11 126:12||t="problem"||a="present"
        f.write(
            f"c=\"{concept}\" {line_num}:{start_word_idx} {line_num}:{end_word_idx}||t=\"{concept_type}\"||a=\"{pred_assertion}\"\n"
        )
    


6553it [00:00, 9505.26it/s]


In [None]:
!zip -r scibert-val-ast.zip data/val/ast/

  adding: data/val/ast/ (stored 0%)
  adding: data/val/ast/0381.ast (deflated 75%)
  adding: data/val/ast/0467.ast (deflated 76%)
  adding: data/val/ast/0018.ast (deflated 78%)
  adding: data/val/ast/0206.ast (deflated 74%)
  adding: data/val/ast/0469.ast (deflated 73%)
  adding: data/val/ast/0273.ast (deflated 74%)
  adding: data/val/ast/0472.ast (deflated 74%)
  adding: data/val/ast/0342.ast (deflated 73%)
  adding: data/val/ast/0070.ast (deflated 70%)
  adding: data/val/ast/0066.ast (deflated 53%)
  adding: data/val/ast/0377.ast (deflated 74%)
  adding: data/val/ast/0222.ast (deflated 67%)
  adding: data/val/ast/0345.ast (deflated 73%)
  adding: data/val/ast/0405.ast (deflated 49%)
  adding: data/val/ast/0202.ast (deflated 70%)
  adding: data/val/ast/0474.ast (deflated 77%)
  adding: data/val/ast/0137.ast (deflated 67%)
  adding: data/val/ast/0077.ast (deflated 77%)
  adding: data/val/ast/0157.ast (deflated 60%)
  adding: data/val/ast/0278.ast (deflated 79%)
  adding: data/val/ast/0