[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Prepare RE dataset
SciBERT reference data format (ChemProt relation-classification training file): https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/train.txt

scifive: https://github.com/justinphan3110/SciFive

In [None]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [None]:
# Colab only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Move into the project folder on the mounted drive (hard-coded location).
%cd /content/drive/MyDrive/projects/medical_txt_parser

In [165]:
# Enable autoreload in mode 2 (re-import modules before executing each cell).
%reload_ext autoreload
%autoreload 2

# NOTE(review): this silences every warning, including library deprecation notices.
import warnings
warnings.filterwarnings("ignore")

# Walk up the directory tree until "src" no longer appears in the working
# directory path — presumably so the relative "data/..." paths and the
# `src.` imports used below resolve from the project root.
path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import re

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy

# Compare the version numerically: a plain string comparison is lexicographic
# (e.g. "4.9.0" >= "4.11.0" is True), which would let too-old releases pass.
assert tuple(int(part) for part in re.findall(r"\d+", transformers.__version__)[:2]) >= (4, 11)

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [166]:
# Dataset root folders (relative paths).
train_data_path, val_data_path = "data/train", "data/val"

# Sub-folder names, one per kind of file.
ast_folder_name, concept_folder_name = "ast", "concept"
rel_folder_name, txt_folder_name = "rel", "txt"

# Output folder for the exported relation-extraction data; created if missing.
re_data_path = "data/re"
os.makedirs(re_data_path, exist_ok=True)

### Import data

In [167]:
# Read every training note together with its concept (.con) and relation (.rel) annotations.
text_files = glob.glob(os.path.join(train_data_path, txt_folder_name, "*.txt"))
filename = ""
records = []
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
    # Derive the record id with os.path so it does not depend on "/" being the
    # separator and keeps any dots that are part of the file name.
    filename = os.path.splitext(os.path.basename(file))[0]
    concept = parse_concept(os.path.join(train_data_path, concept_folder_name, filename + ".con"))
    rel = parse_relation(os.path.join(train_data_path, rel_folder_name, filename + ".rel"))
    records.append({"text": text, "filename": filename, "concept": concept, "rel": rel})

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every iteration.
df = pd.DataFrame(records, columns=["text", "filename", "concept", "rel"])
df.head()

100%|██████████| 170/170 [00:00<00:00, 392.26it/s]


Unnamed: 0,text,filename,concept,rel
0,143748600 SC\n43243309\n095342\n09/30/2000 12:...,143748600_SC,"{'concept_text': ['htn', 'atypical chest pain'...","{'concept_text_1': ['dm type ii', 'lead placem..."
1,Admission Date :\n2014-03-31\nDischarge Date :...,record-56,"{'concept_text': ['the procedure', 'auscultati...","{'concept_text_1': ['ultrasound', 'ultrasound'..."
2,699905656 SC\n06100044\n921737\n1/17/1993 12:0...,699905656_SC,{'concept_text': ['right sided sciatica proced...,"{'concept_text_1': ['ct myelogram', 'ct myelog..."
3,627258104\nGH\n56900479\n1/8/2001 12:00:00 AM\...,627258104,"{'concept_text': ['bradycardia', 'subsequent f...","{'concept_text_1': ['bradycardia', 'a chest xr..."
4,839999049 YC\n59242403\n431924\n11/01/2000 12:...,839999049_YC,"{'concept_text': ['pauses', 'outpt fitting', '...","{'concept_text_1': ['ett', 'desaturations in t..."


In [168]:
# One row per annotated concept, tagged with the file it came from.
# Collect the per-file frames and concatenate once: DataFrame.append was removed
# in pandas 2.0 and copied the accumulated frame on every iteration.
concept_frames = []
for _, file in df.iterrows():
    per_file = pd.DataFrame(file["concept"])
    per_file["filename"] = file["filename"]
    concept_frames.append(per_file)
concept_df = pd.concat(concept_frames, ignore_index=True)
# Keep "filename" as the leading column.
concept_df = concept_df[["filename"] + [col for col in concept_df.columns if col != "filename"]]
concept_df

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,143748600_SC,htn,52,19,52,19,problem
1,143748600_SC,atypical chest pain,5,0,5,2,problem
2,143748600_SC,wellbutrin ( bupropion hcl ),33,19,33,23,treatment
3,143748600_SC,ett,58,3,58,3,test
4,143748600_SC,dissection,56,7,56,7,problem
...,...,...,...,...,...,...,...
16520,105732749,repeated fleet &apos;s enemas,53,4,53,7,treatment
16521,105732749,biopsy,23,2,23,2,test
16522,105732749,respirations,39,9,39,9,test
16523,105732749,these tests,61,11,61,12,test


In [176]:
# One row per annotated relation, tagged with the file it came from.
# Collect the per-file frames and concatenate once: DataFrame.append was removed
# in pandas 2.0 and copied the accumulated frame on every iteration.
rel_frames = []
for _, file in df.iterrows():
    per_file = pd.DataFrame(file["rel"])
    per_file["filename"] = file["filename"]
    rel_frames.append(per_file)
rel_df = pd.concat(rel_frames, ignore_index=True)
# Keep "filename" as the leading column.
rel_df = rel_df[["filename"] + [col for col in rel_df.columns if col != "filename"]]
rel_df

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type
0,143748600_SC,dm type ii,52,6,52,8,crfs,52,4,52,4,PIP
1,143748600_SC,lead placement,55,20,55,21,pain,55,10,55,10,TeRP
2,143748600_SC,t wave flattening laterally and inferiorly,55,3,55,8,pain,55,10,55,10,PIP
3,143748600_SC,chest ct,56,1,56,2,back pain,56,11,56,12,TeCP
4,143748600_SC,chest ct,56,1,56,2,dissection,56,7,56,7,TeCP
...,...,...,...,...,...,...,...,...,...,...,...,...
3115,105732749,a ct scan,27,18,27,20,a mass,27,24,27,25,TeRP
3116,105732749,percocet,58,0,58,0,pain,58,12,58,12,TrAP
3117,105732749,specimens,48,0,48,0,falciform nodules,48,14,48,15,TeCP
3118,105732749,the procedure,49,9,49,10,complications,49,3,49,3,TrCP


In [177]:
# extract relations from column "relation_type"
# The leading part of the label ("Tr", "Te" or "PIP") names the type of the first
# concept; the second concept of every relation is expected to be a problem.
label_dict = {"Tr":"treatment", "Te":"test", "PIP":"problem"}
rel_df["rel_type1"] = rel_df["relation_type"].apply(lambda x: re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', x)[0]).apply(lambda x: label_dict[x])
rel_df["rel_type2"] = "problem"

# extract relations from concept files
span_cols = ["start_line", "start_word_number", "end_line", "end_word_number"]
concept_keys = ["filename"] + span_cols
# One lookup row per annotated span, so the left merge below can never duplicate relation rows.
concept_lookup = concept_df[concept_keys + ["concept_text", "concept_type"]].drop_duplicates(subset=concept_keys)

for side in ("1", "2"):
    rel_keys = ["filename"] + [col + "_" + side for col in span_cols]
    # rel_df has to be the LEFT frame of a left merge: that keeps exactly one output
    # row per relation, in rel_df's own order. An inner merge with concept_df on the
    # left returns rows in concept_df order, so assigning its "concept_type" back by
    # index attached the types to the wrong relations.
    merge_df = pd.merge(rel_df[rel_keys + ["concept_text_" + side]], concept_lookup, left_on=rel_keys, right_on=concept_keys, how="left")
    assert len(merge_df) == len(rel_df), "merge changed the number of relation rows"
    # Relations without a matching concept keep a missing concept type; only
    # matched rows are required to carry the same text.
    matched = merge_df["concept_type"].notna()
    assert (merge_df.loc[matched, "concept_text"] == merge_df.loc[matched, "concept_text_" + side]).all(), "concept_text is not equal to concept_text_" + side
    # Positional assignment: merge_df is row-aligned with rel_df, whatever rel_df's index is.
    rel_df["concept_type_" + side] = merge_df["concept_type"].to_numpy()

rel_df

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2
0,143748600_SC,dm type ii,52,6,52,8,crfs,52,4,52,4,PIP,problem,problem,problem,problem
1,143748600_SC,lead placement,55,20,55,21,pain,55,10,55,10,TeRP,test,problem,test,problem
2,143748600_SC,t wave flattening laterally and inferiorly,55,3,55,8,pain,55,10,55,10,PIP,problem,problem,problem,problem
3,143748600_SC,chest ct,56,1,56,2,back pain,56,11,56,12,TeCP,test,problem,test,problem
4,143748600_SC,chest ct,56,1,56,2,dissection,56,7,56,7,TeCP,test,problem,test,problem
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,105732749,a ct scan,27,18,27,20,a mass,27,24,27,25,TeRP,test,problem,test,problem
3116,105732749,percocet,58,0,58,0,pain,58,12,58,12,TrAP,treatment,problem,treatment,problem
3117,105732749,specimens,48,0,48,0,falciform nodules,48,14,48,15,TeCP,test,problem,test,problem
3118,105732749,the procedure,49,9,49,10,complications,49,3,49,3,TrCP,treatment,problem,treatment,problem


Dissimilarities between concept files and relation files

In [178]:
# Relations whose first concept's annotated type differs from the type implied by the relation label.
rel_df.loc[rel_df["concept_type_1"].ne(rel_df["rel_type1"])]

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2


In [179]:
# Relations whose second concept's annotated type differs from the expected type.
rel_df.loc[rel_df["concept_type_2"].ne(rel_df["rel_type2"])]

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2
1392,record-177,levofloxacin,122,18,122,18,uti,122,9,122,9,TrAP,treatment,problem,treatment,test
1874,record-18,anemia,104,17,104,17,high mcv,104,6,104,7,PIP,problem,problem,problem,treatment


In [180]:
# List relations whose two concepts do not start on the same line.
rel_df.loc[rel_df["start_line_2"].ne(rel_df["start_line_1"])]

Unnamed: 0,filename,concept_text_1,start_line_1,start_word_number_1,end_line_1,end_word_number_1,concept_text_2,start_line_2,start_word_number_2,end_line_2,end_word_number_2,relation_type,rel_type1,rel_type2,concept_type_1,concept_type_2


In [181]:
# Keep only the relations whose annotated concept types agree with the types
# implied by the relation label, for both concepts.
type_1_agrees = rel_df["concept_type_1"] == rel_df["rel_type1"]
type_2_agrees = rel_df["concept_type_2"] == rel_df["rel_type2"]
rel_df = rel_df[type_1_agrees & type_2_agrees]


def preprocess_text(row):
    """Replace ``row["text"]`` with the relation's line, both concepts marked.

    The line is selected with the 1-based ``start_line_1`` and split on
    whitespace. The tokens from ``start_word_number_1`` to ``end_word_number_1``
    (inclusive, 0-based) are wrapped as ``<< ... >>`` and those from
    ``start_word_number_2`` to ``end_word_number_2`` as ``[[ ... ]]``. The
    result is re-joined with single spaces.

    Args:
        row: mapping with the keys ``text``, ``start_line_1`` and the four
            ``start_word_number_*`` / ``end_word_number_*`` entries.

    Returns:
        The same ``row`` object, with ``row["text"]`` overwritten.
    """
    tokens = row["text"].split("\n")[row["start_line_1"] - 1].split()

    # Each span carries its own markers so they follow it when the spans are reordered.
    earlier = (row["start_word_number_1"], row["end_word_number_1"], "<< ", " >>")
    later = (row["start_word_number_2"], row["end_word_number_2"], "[[ ", " ]]")
    if earlier[0] > later[0]:
        # The span that starts first in the line has to be emitted first.
        earlier, later = later, earlier

    def marked(span):
        first_tok, last_tok, opener, closer = span
        return opener + " ".join(tokens[first_tok:last_tok + 1]) + closer

    pieces = (
        tokens[:earlier[0]]
        + [marked(earlier)]
        + tokens[earlier[1] + 1:later[0]]
        + [marked(later)]
        + tokens[later[1] + 1:]
    )
    row["text"] = " ".join(pieces)
    return row

# Attach each file's full text to its relations, turn that text into the marked
# line, then keep and rename the columns of interest.
rel_df = (
    rel_df
    .merge(df[["filename", "text"]], on="filename", how="inner")
    .apply(preprocess_text, axis=1)
    [["filename", "start_line_1", "text", "concept_text_1", "concept_text_2", "relation_type", "concept_type_1", "concept_type_2"]]
    .rename(columns={"start_line_1": "line_num", "relation_type": "rel_type"})
)
rel_df

Unnamed: 0,filename,line_num,text,concept_text_1,concept_text_2,rel_type,concept_type_1,concept_type_2
0,143748600_SC,52,41 yo man with [[ CRFs ]] of << DM Type II >> ...,dm type ii,crfs,PIP,problem,problem
1,143748600_SC,55,"Here , had T wave flattening laterally and inf...",lead placement,pain,TeRP,test,problem
2,143748600_SC,55,"Here , had << T wave flattening laterally and ...",t wave flattening laterally and inferiorly,pain,PIP,problem,problem
3,143748600_SC,56,Had << Chest CT >> to r / o dissection ( due t...,chest ct,back pain,TeCP,test,problem
4,143748600_SC,56,Had << Chest CT >> to r / o [[ dissection ]] (...,chest ct,dissection,TeCP,test,problem
...,...,...,...,...,...,...,...,...
3113,105732749,27,"She presented to her primary care physician , ...",a ct scan,a mass,TeRP,test,problem
3114,105732749,58,<< Percocet >> one to two whenever necessary q...,percocet,pain,TrAP,treatment,problem
3115,105732749,48,<< Specimens >> sent to pathology included per...,specimens,falciform nodules,TeCP,test,problem
3116,105732749,49,"There were no [[ complications ]] , and the pa...",the procedure,complications,TrCP,treatment,problem


In [182]:
# Only the marked sentence and its relation label are exported.
rel_df = rel_df.loc[:, ["text", "rel_type"]]
rel_df

Unnamed: 0,text,rel_type
0,41 yo man with [[ CRFs ]] of << DM Type II >> ...,PIP
1,"Here , had T wave flattening laterally and inf...",TeRP
2,"Here , had << T wave flattening laterally and ...",PIP
3,Had << Chest CT >> to r / o dissection ( due t...,TeCP
4,Had << Chest CT >> to r / o [[ dissection ]] (...,TeCP
...,...,...
3113,"She presented to her primary care physician , ...",TeRP
3114,<< Percocet >> one to two whenever necessary q...,TrAP
3115,<< Specimens >> sent to pathology included per...,TeCP
3116,"There were no [[ complications ]] , and the pa...",TrCP


In [183]:
# Write the examples as a tab-separated file without header or index column.
tsv_path = os.path.join(re_data_path, "re_data_scibert.tsv")
rel_df.to_csv(tsv_path, sep="\t", index=False, header=False)

In [184]:
# Read back the first ten exported rows as a sanity check.
pd.read_csv(os.path.join(re_data_path, "re_data_scibert.tsv"), sep='\t', header=None, nrows=10)

Unnamed: 0,0,1
0,41 yo man with [[ CRFs ]] of << DM Type II >> ...,PIP
1,"Here , had T wave flattening laterally and inf...",TeRP
2,"Here , had << T wave flattening laterally and ...",PIP
3,Had << Chest CT >> to r / o dissection ( due t...,TeCP
4,Had << Chest CT >> to r / o [[ dissection ]] (...,TeCP
5,<< Ultrasound >> was performed on 2014-04-01 d...,TeRP
6,<< Ultrasound >> was performed on 2014-04-01 d...,TeRP
7,Ultrasound was performed on 2014-04-01 demonst...,PIP
8,Ultrasound was performed on 2014-04-01 demonst...,TrAP
9,Ultrasound was performed on 2014-04-01 demonst...,TrAP


In [19]:
# Write the same examples as JSON Lines (one record per line).
jsonl_path = os.path.join(re_data_path, "re_data.jsonl")
rel_df.to_json(jsonl_path, orient="records", lines=True)

In [20]:
# Read back the first ten JSON Lines records as a sanity check.
pd.read_json(os.path.join(re_data_path, "re_data.jsonl"), lines=True, nrows=10)

Unnamed: 0,text,rel_type
0,41 yo man with ## CRFs ## of ** DM Type II ** ...,PIP
1,"Here , had T wave flattening laterally and inf...",TeRP
2,"Here , had ** T wave flattening laterally and ...",PIP
3,Had ** Chest CT ** to r / o dissection ( due t...,TeCP
4,Had ** Chest CT ** to r / o ## dissection ## (...,TeCP
5,** Ultrasound ** was performed on 2014-04-01 d...,TeRP
6,** Ultrasound ** was performed on 2014-04-01 d...,TeRP
7,Ultrasound was performed on 2014-04-01 demonst...,PIP
8,Ultrasound was performed on 2014-04-01 demonst...,TrAP
9,Ultrasound was performed on 2014-04-01 demonst...,TrAP


In [None]:
# Label distribution of the exported dataset.
# `train_df` was not defined by any earlier cell, so this failed on a fresh kernel;
# load it from the exported TSV, which has no header (column 0 = text, column 1 = label).
train_df = pd.read_csv(re_data_path + os.sep + "re_data_scibert.tsv", sep="\t", header=None)
train_df[1].value_counts()

TeRP     993
TrAP     884
PIP      754
TrCP     184
TeCP     166
TrNAP     62
TrIP      51
TrWP      24
Name: 1, dtype: int64