[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Assertions classification

In [17]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/projects/medical_txt_parser

ModuleNotFoundError: No module named 'google.colab'

In [5]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy

assert transformers.__version__ >= "4.11.0"

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [6]:
train_data_path = "data/train"
val_data_path = "data/val"
ast_folder_name = "ast"
concept_folder_name = "concept"
rel_folder_name = "rel"
txt_folder_name = "txt"
nli_data_path = "data/nli"
os.makedirs(nli_data_path, exist_ok=True)

### Import data

In [7]:
text_files = glob.glob(train_data_path + os.sep + txt_folder_name + os.sep +  "*.txt")
filename = ""
df = pd.DataFrame()
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
        filename = file.split("/")[-1].split(".")[0]
        concept = parse_concept(train_data_path + os.sep + concept_folder_name + os.sep +  filename + ".con")
        ast = parse_ast(train_data_path + os.sep + ast_folder_name + os.sep +  filename + ".ast")
        
        df = df.append(pd.DataFrame({"text": [text], "filename": [filename] , "concept": [concept], "ast":[ast]}), ignore_index=True)
df.head()

100%|██████████| 170/170 [00:00<00:00, 758.91it/s]


Unnamed: 0,text,filename,concept,ast
0,143748600 SC\n43243309\n095342\n09/30/2000 12:...,143748600_SC,"{'concept_text': ['htn', 'atypical chest pain'...","{'concept_text': ['htn', 'atypical chest pain'..."
1,Admission Date :\n2014-03-31\nDischarge Date :...,record-56,"{'concept_text': ['the procedure', 'auscultati...","{'concept_text': ['sclera mildly icteric', 'ac..."
2,699905656 SC\n06100044\n921737\n1/17/1993 12:0...,699905656_SC,{'concept_text': ['right sided sciatica proced...,{'concept_text': ['a lateral l-1 s-1 disc bulg...
3,627258104\nGH\n56900479\n1/8/2001 12:00:00 AM\...,627258104,"{'concept_text': ['bradycardia', 'subsequent f...","{'concept_text': ['bradycardia', 'subsequent f..."
4,839999049 YC\n59242403\n431924\n11/01/2000 12:...,839999049_YC,"{'concept_text': ['pauses', 'outpt fitting', '...","{'concept_text': ['pauses', 'ischemia', 'htn',..."


In [8]:
concept_df = pd.DataFrame(columns=[ "filename"]+list(concept.keys()))
for i, file in df.iterrows():
    concept_dict = file["concept"]
    tmp = pd.DataFrame(concept_dict)
    tmp["filename"] = file["filename"]
    concept_df = concept_df.append(tmp, ignore_index=True)
concept_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,143748600_SC,htn,52,19,52,19,problem
1,143748600_SC,atypical chest pain,5,0,5,2,problem
2,143748600_SC,wellbutrin ( bupropion hcl ),33,19,33,23,treatment
3,143748600_SC,ett,58,3,58,3,test
4,143748600_SC,dissection,56,7,56,7,problem


In [9]:
assertion_df = pd.DataFrame(columns=[ "filename"]+list(ast.keys()))
for i, file in df.iterrows():
    assertion_dict = file["ast"]
    tmp = pd.DataFrame(assertion_dict)
    tmp["filename"] = file["filename"]
    assertion_df = assertion_df.append(tmp, ignore_index=True)
assertion_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type,assertion_type
0,143748600_SC,htn,52,19,52,19,problem,present
1,143748600_SC,atypical chest pain,5,0,5,2,problem,present
2,143748600_SC,dissection,56,7,56,7,problem,possible
3,143748600_SC,anxiety ( anxiety ),48,18,48,21,problem,present
4,143748600_SC,back pain,56,11,56,12,problem,present


Since there is some differences between the concept and assertion data, we will use the concept indexing and merge it with assertion type.

In [10]:
concept_df[concept_df["concept_text"] == "clear cut cord compression"].head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
2380,record-24,clear cut cord compression,13,30,13,33,problem


In [11]:
assertion_df[assertion_df["concept_text"] == "clear cut cord compression"].head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type,assertion_type
1001,record-24,clear cut cord compression,13,31,13,34,problem,present


### Dataset Preprocessing

In [12]:
# check start_line == end_line
assertion_df[assertion_df["start_line"] != assertion_df["end_line"]]

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type,assertion_type


In the following we reformat the dataset to easily label the concepts.

Note:
* We replace multiple spaces with a single space.

In [13]:
def preprocess_text(row):
    text = row["text"]
    # find line
    text = text.lower()
    text = text.split("\n")[row["start_line"]-1]
    row["text"] = text
    return row

assertion_df = assertion_df.merge(df[["filename","text"]], on="filename", how="inner")
assertion_df = assertion_df.apply(preprocess_text, axis=1)
assertion_df = assertion_df.drop(columns=["start_word_number", "end_line", "end_word_number", "concept_type", "start_line"])

# add hypothesis
# NOTE: by default assertions fall into the type "present"
label2hyp = {
    "absent": "Patient currently doesn't have {problem}",
    "possible": "Patient may have {problem}",
    "conditional": "Patient has {problem} only under certain conditions",
    "hypothetical": "Patient may develop {problem}",
    "associated_with_someone_else": "{problem} is associated with someone else who is not the patient",
}
assertion_df = assertion_df[assertion_df["assertion_type"] != "present"]
assertion_df["hypothesis"] = assertion_df.apply(lambda x: label2hyp[x["assertion_type"]].format(problem=x["concept_text"]), axis=1)
# add hy
assertion_df


Unnamed: 0,filename,concept_text,assertion_type,text,hypothesis
2,143748600_SC,dissection,possible,had chest ct to r / o dissection ( due to back...,Patient may have dissection
10,143748600_SC,mi,absent,ruled out for mi .,Patient currently doesn't have mi
13,143748600_SC,pain,absent,"here , had t wave flattening laterally and inf...",Patient currently doesn't have pain
16,143748600_SC,episodes of atypical cp x 1 week,conditional,"41 yo man with crfs of dm type ii , high chole...",Patient has episodes of atypical cp x 1 week o...
18,143748600_SC,assoicated sx,absent,no assoicated sx .,Patient currently doesn't have assoicated sx
...,...,...,...,...,...
7057,105732749,no gross evidence seating in the abdominal wall,absent,findings included no gross evidence seating in...,Patient currently doesn't have no gross eviden...
7062,105732749,subcutaneous emphysema over the abdomen,possible,"postoperatively , the patient had some abdomin...",Patient may have subcutaneous emphysema over t...
7063,105732749,murmurs,absent,regular s1 and s2 ; no murmurs .,Patient currently doesn't have murmurs
7065,105732749,palpable masses,absent,soft ; nontender ; no palpable masses .,Patient currently doesn't have palpable masses


In [116]:
#import train_test_split
from sklearn.model_selection import train_test_split

# save text and hypothesis
premises = []
hypothesis = []
labels = []
for i, row in assertion_df.iterrows():
    for label in label2hyp:
        if row["assertion_type"] == label:
            premises.append(row["text"])
            hypothesis.append(row["hypothesis"])
            labels.append("entailment")
        else:
            premises.append(row["text"])
            hypothesis.append(row["hypothesis"])
            labels.append("neutral")

# split to train val
premises_train, premises_val, hypothesis_train, hypothesis_val, labels_train, labels_val = train_test_split(premises, hypothesis, labels, test_size=0.2, random_state=42)

print("Train size: ", len(premises_train))
print("Val size: ", len(premises_val))

# save to gzip files
with gzip.open(nli_data_path + os.sep + "s1.train.gz", "wt") as f:
    f.write("\n".join(premises_train))


Train size:  9796
Val size:  2449


In [107]:
# add hypothesis
from sentence_transformers.readers import NLIDataReader
nli_reader = NLIDataReader(nli_data_path)