[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Assertions classification

In [None]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [None]:
# Colab only: mount Google Drive under /content/drive so the files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on the mounted drive; the mount above
# has to succeed first. NOTE(review): the path is hard-coded to one Drive layout.
%cd /content/drive/MyDrive/projects/medical_txt_parser

In [1]:
# Enable the autoreload extension in mode 2.
%reload_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# deprecation warnings included.
warnings.filterwarnings("ignore")

# Move the working directory upwards until its path no longer contains "src".
# The check is a plain substring test on the whole path, not a test on the last
# path component.
path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import re

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy

# Require transformers >= 4.11. The (major, minor) pair is compared as integers because a
# plain string comparison is lexicographic: "4.9.0" >= "4.11.0" is True.
_version_match = re.match(r"(\d+)\.(\d+)", transformers.__version__)
assert _version_match is not None and tuple(map(int, _version_match.groups())) >= (4, 11), (
    f"transformers >= 4.11.0 is required, found {transformers.__version__}"
)

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

/mnt/d/Google Drive/projects/medical_txt_parser/src/notebooks
/mnt/d/Google Drive/projects/medical_txt_parser/src
/mnt/d/Google Drive/projects/medical_txt_parser


In [2]:
# Locations of the data splits (relative paths).
train_data_path, val_data_path = "data/train", "data/val"

# Names of the per-split sub-folders.
txt_folder_name = "txt"
concept_folder_name = "concept"
ast_folder_name = "ast"
rel_folder_name = "rel"

# Output folder for the exported data; created if it does not exist yet.
re_data_path = "data/re"
os.makedirs(re_data_path, exist_ok=True)

### Import data

In [5]:
# Build `df` with one row per training text file: the raw text, the file stem, and the
# parsed concept / relation annotations that share that stem.
text_files = glob.glob(os.path.join(train_data_path, txt_folder_name, "*.txt"))

# Rows are collected in a list and turned into a DataFrame once: growing a DataFrame
# with DataFrame.append inside the loop is quadratic, and the method no longer exists
# in pandas >= 2.0.
records = []
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
    # os.path.basename handles the platform's path separator; the stem is everything
    # before the first dot, as before.
    filename = os.path.basename(file).split(".")[0]
    # `concept` and `rel` stay bound after the loop; later cells read their keys.
    concept = parse_concept(os.path.join(train_data_path, concept_folder_name, filename + ".con"))
    rel = parse_relation(os.path.join(train_data_path, rel_folder_name, filename + ".rel"))
    records.append({"text": text, "filename": filename, "concept": concept, "rel": rel})

df = pd.DataFrame(records, columns=["text", "filename", "concept", "rel"])
df.head()

100%|██████████| 170/170 [00:10<00:00, 16.28it/s]


Unnamed: 0,text,filename,concept,rel
0,018636330 DH\n5425710\n123524\n0144918\n6/2/20...,018636330_DH,"{'concept_text': ['a workup', 'pain', 'microsc...","{'concept_text_1': ['po pain medications', 'a ..."
1,026350193 RWH\n7093319\n549304\n8417371\n6/5/2...,026350193_RWH,"{'concept_text': ['flexeril', 'constipation', ...","{'concept_text_1': [], 'start_line_1': [], 'st..."
2,037945397 RWH\n2690633\n194867\n151887\n10/17/...,037945397_RWH,"{'concept_text': ['ivf', 'near syncope', 'recu...","{'concept_text_1': [], 'start_line_1': [], 'st..."
3,044687343 ELMVH\n01719921\n1626859\n3/13/2006 ...,044687343_ELMVH,"{'concept_text': ['lisinopril pump', 'bipap', ...","{'concept_text_1': ['bipap', 'fatigue', 'ekg',..."
4,060376519 DH\n0649031\n323495\n3838556\n4/5/20...,060376519_DH,"{'concept_text': ['dizziness', 'benign positio...","{'concept_text_1': ['fever'], 'start_line_1': ..."


In [15]:
# Flatten the per-file concept dictionaries into one table with one row per concept,
# tagged with the file it came from.
# The per-file frames are concatenated once: DataFrame.append was removed in pandas 2.0
# and appending inside a loop copies the accumulated frame on every iteration.
concept_frames = [
    pd.DataFrame(note["concept"]).assign(filename=note["filename"])
    for _, note in df.iterrows()
]
concept_columns = ["filename"] + list(concept.keys())
if concept_frames:
    # Select the columns explicitly so that "filename" comes first.
    concept_df = pd.concat(concept_frames, ignore_index=True)[concept_columns]
else:
    # pd.concat rejects an empty list; fall back to an empty table with the same columns.
    concept_df = pd.DataFrame(columns=concept_columns)
concept_df

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,018636330_DH,a workup,27,2,27,3,test
1,018636330_DH,pain,55,10,55,10,problem
2,018636330_DH,microscopic anterior cervical diskectomy at c5-6,23,0,23,5,treatment
3,018636330_DH,hyperlipidemia,29,4,29,4,problem
4,018636330_DH,po pain medications,47,7,47,9,treatment
...,...,...,...,...,...,...,...
16520,record-84,nitroglycerin,59,42,59,42,treatment
16521,record-84,gallops,51,10,51,10,problem
16522,record-84,hypertension,12,30,12,30,problem
16523,record-84,auscultation,52,5,52,5,test


In [78]:
# Flatten the per-file relation dictionaries into one table with one row per relation,
# tagged with the file it came from.
# The per-file frames are concatenated once: DataFrame.append was removed in pandas 2.0
# and appending inside a loop copies the accumulated frame on every iteration.
rel_frames = [
    pd.DataFrame(note["rel"]).assign(filename=note["filename"])
    for _, note in df.iterrows()
]
rel_columns = ["filename"] + list(rel.keys())
if rel_frames:
    # Select the columns explicitly so that "filename" comes first.
    rel_df = pd.concat(rel_frames, ignore_index=True)[rel_columns]
else:
    # pd.concat rejects an empty list; fall back to an empty table with the same columns.
    rel_df = pd.DataFrame(columns=rel_columns)
rel_df

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type
0,018636330_DH,po pain medications,47,7,47,9,her pain,47,0,47,1,TrIP
1,018636330_DH,a postoperative ct scan,43,2,43,5,partial decompression of the spinal canal,43,8,43,13,TeRP
2,018636330_DH,percocet,55,1,55,1,pain,55,10,55,10,TrAP
3,018636330_DH,c5-6 disc herniation,21,0,21,2,cord compression,21,4,21,5,PIP
4,018636330_DH,c5-6 disc herniation,21,0,21,2,myelopathy,21,7,21,7,PIP
...,...,...,...,...,...,...,...,...,...,...,...,...
3115,record-84,lad stent,59,34,59,35,an st elevation mi,59,28,59,31,TrAP
3116,record-84,lad stent,59,34,59,35,residuals,59,40,59,40,TrCP
3117,record-84,the first ekg,15,7,15,9,t wave inversions,15,22,15,24,TeRP
3118,record-84,the first ekg,15,7,15,9,st elevations in v1-v3,15,14,15,17,TeRP


In [79]:
# Concept type named by the leading code of a relation label ("TrIP" -> "Tr" -> treatment).
# Defined in this cell so it runs on a fresh kernel without relying on a later cell.
label_dict = {"Tr": "treatment", "Te": "test", "P": "problem"}


def relation_prefix_to_type(relation_type):
    """Return the concept type encoded by the leading code of `relation_type`.

    Only the prefix is inspected, so labels made entirely of capitals (e.g. "PIP")
    resolve to "problem" instead of being looked up as a whole.

    Raises:
        ValueError: if the label starts with none of the known codes.
    """
    match = re.match(r"Tr|Te|P", relation_type)
    if match is None:
        raise ValueError(f"unknown relation_type: {relation_type!r}")
    return label_dict[match.group(0)]


# extract relations from column "relation_type"
rel_df["rel_type1"] = rel_df["relation_type"].apply(relation_prefix_to_type)
rel_df["rel_type2"] = "problem"

# extract relations from concept files
# Each side of a relation is looked up in concept_df by (filename, span). The merge is a
# LEFT merge driven by rel_df, so the result has exactly one row per relation, in rel_df's
# order, and can be copied back positionally. An inner merge driven by concept_df
# guarantees neither the order nor the row count.
span_cols = ["start_line", "start_word_number", "end_line", "end_word_number"]
concept_lookup = concept_df.drop_duplicates(subset=["filename"] + span_cols)[
    ["filename"] + span_cols + ["concept_text", "concept_type"]
]

for side in ("1", "2"):
    side_cols = [f"{col}_{side}" for col in span_cols]
    merge_df = pd.merge(
        rel_df[["filename", f"concept_text_{side}"] + side_cols],
        concept_lookup,
        left_on=["filename"] + side_cols,
        right_on=["filename"] + span_cols,
        how="left",
        validate="many_to_one",
    )
    # Spans with no entry in concept_df keep a missing concept_type; the text check only
    # applies to spans that were found.
    found = merge_df["concept_type"].notna()
    assert (merge_df.loc[found, "concept_text"] == merge_df.loc[found, f"concept_text_{side}"]).all(), \
        f"concept_text is not equal to concept_text_{side}"
    rel_df[f"concept_type_{side}"] = merge_df["concept_type"].to_numpy()

rel_df

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2
0,018636330_DH,po pain medications,47,7,47,9,her pain,47,0,47,1,TrIP,treatment,problem,treatment,problem
1,018636330_DH,a postoperative ct scan,43,2,43,5,partial decompression of the spinal canal,43,8,43,13,TeRP,test,problem,test,problem
2,018636330_DH,percocet,55,1,55,1,pain,55,10,55,10,TrAP,treatment,problem,treatment,problem
3,018636330_DH,c5-6 disc herniation,21,0,21,2,cord compression,21,4,21,5,PIP,problem,problem,problem,problem
4,018636330_DH,c5-6 disc herniation,21,0,21,2,myelopathy,21,7,21,7,PIP,problem,problem,problem,problem
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,record-84,lad stent,59,34,59,35,an st elevation mi,59,28,59,31,TrAP,treatment,problem,treatment,problem
3116,record-84,lad stent,59,34,59,35,residuals,59,40,59,40,TrCP,treatment,problem,treatment,problem
3117,record-84,the first ekg,15,7,15,9,t wave inversions,15,22,15,24,TeRP,test,problem,test,problem
3118,record-84,the first ekg,15,7,15,9,st elevations in v1-v3,15,14,15,17,TeRP,test,problem,test,problem


Dissimilarities between concept files and relation files

In [80]:
# Relations whose first concept type (concept_type_1) differs from rel_type1.
type1_mismatch = rel_df["concept_type_1"].ne(rel_df["rel_type1"])
rel_df.loc[type1_mismatch]

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2


In [81]:
# Relations whose second concept type (concept_type_2) differs from rel_type2.
type2_mismatch = rel_df["concept_type_2"].ne(rel_df["rel_type2"])
rel_df.loc[type2_mismatch]

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2
1747,record-177,levofloxacin,122,18,122,18,uti,122,9,122,9,TrAP,treatment,problem,treatment,test
1820,record-18,anemia,104,17,104,17,high mcv,104,6,104,7,PIP,problem,problem,problem,treatment


In [82]:
# Relations whose two concepts do not start on the same line (expected: none).
different_start_line = rel_df["start_line_2"].ne(rel_df["start_line_1"])
rel_df.loc[different_start_line]

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2


In [84]:
# Discard relations where either concept type differs from the type derived for it.
type_mismatch = (rel_df["concept_type_1"] != rel_df["rel_type1"]) | (
    rel_df["concept_type_2"] != rel_df["rel_type2"]
)

# Keep the columns needed downstream and give them their final names.
kept_columns = [
    "filename", "start_line_1", "concept_text_1", "concept_text_2",
    "relation_type", "concept_type_1", "concept_type_2",
]
rel_df = rel_df.loc[~type_mismatch, kept_columns].rename(
    columns={"start_line_1": "line_num", "relation_type": "rel_type"}
)


def preprocess_text(row):
    """Replace the full document in row["text"] with the single line at row["line_num"].

    Line numbers are 1-based, hence the -1 when indexing the split document.
    """
    document_lines = row["text"].split("\n")
    row["text"] = document_lines[row["line_num"] - 1]
    return row


# Attach each file's full text to its relations, then reduce it to the relevant line.
rel_df = rel_df.merge(df[["filename", "text"]], on="filename", how="inner").apply(preprocess_text, axis=1)
rel_df

Unnamed: 0,filename,line_num,concept_text_1,concept_text_2,rel_type,concept_type_1,concept_type_2,text
0,018636330_DH,47,po pain medications,her pain,TrIP,treatment,problem,Her pain was under good control with PO pain m...
1,018636330_DH,43,a postoperative ct scan,partial decompression of the spinal canal,TeRP,test,problem,She had a postoperative CT scan that revealed ...
2,018636330_DH,55,percocet,pain,TrAP,treatment,problem,"3. Percocet , 5/325 , 1-2 tabs PO q4-6h prn pa..."
3,018636330_DH,21,c5-6 disc herniation,cord compression,PIP,problem,problem,C5-6 disc herniation with cord compression and...
4,018636330_DH,21,c5-6 disc herniation,myelopathy,PIP,problem,problem,C5-6 disc herniation with cord compression and...
...,...,...,...,...,...,...,...,...
3113,record-84,59,lad stent,an st elevation mi,TrAP,treatment,problem,This is a 54 - year-old man with significant c...
3114,record-84,59,lad stent,residuals,TrCP,treatment,problem,This is a 54 - year-old man with significant c...
3115,record-84,15,the first ekg,t wave inversions,TeRP,test,problem,He presented to Deaconess-Nashoba Hospital Hos...
3116,record-84,15,the first ekg,st elevations in v1-v3,TeRP,test,problem,He presented to Deaconess-Nashoba Hospital Hos...


In [87]:
# Write the relation table as JSON Lines (one record per line) into the output folder.
re_jsonl_path = os.path.join(re_data_path, "re_data.jsonl")
rel_df.to_json(re_jsonl_path, orient="records", lines=True)

In [85]:
# Lookup from the leading code of a relation_type label to a concept type.
label_dict = dict(Tr="treatment", Te="test", P="problem")