[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Mustapha-AJEGHRIR/medical_txt_parser/blob/main/src/notebooks/assertions_nli/ast_nli_scibert.ipynb)

# Prepare RE dataset
scibert: https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/train.txt

scifive: https://github.com/justinphan3110/SciFive

In [None]:
%%capture
!pip install seqeval transformers datasets spacy sentence_transformers

In [None]:
# Mount Google Drive at /content/drive. This needs the google.colab package,
# i.e. the cell is meant for a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder on the mounted drive the working directory.
%cd /content/drive/MyDrive/projects/medical_txt_parser

In [4]:
# Reload imported modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

# NOTE(review): this silences every warning, deprecation warnings included.
import warnings
warnings.filterwarnings("ignore")

# Move up one directory at a time until the working directory path no longer
# contains the substring "src" (presumably to land on the repository root so
# that `src....` imports and the relative "data/..." paths resolve — TODO confirm).
# NOTE(review): this is a substring test, so any parent folder whose name
# contains "src" also keeps the loop climbing.
path = %pwd
while "src" in path:
    %cd ..
    path = %pwd

import glob
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from pprint import pprint
import matplotlib.pyplot as plt
import re

import transformers
from datasets import Dataset, ClassLabel, Sequence, load_dataset, load_metric
from spacy import displacy

# Compare numeric version components. A plain string comparison is
# lexicographic: it would accept "4.9.0" (because "9" > "1") and reject "10.0.0".
_tf_version = re.match(r"(\d+)\.(\d+)(?:\.(\d+))?", transformers.__version__)
assert _tf_version is not None and tuple(int(part or 0) for part in _tf_version.groups()) >= (4, 11, 0), \
    f"transformers >= 4.11.0 is required, found {transformers.__version__}"

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

In [5]:
# Roots of the train / validation splits.
train_data_path, val_data_path = "data/train", "data/val"

# Names of the per-annotation-type sub-folders.
txt_folder_name = "txt"
concept_folder_name = "concept"
rel_folder_name = "rel"
ast_folder_name = "ast"

# Folder that receives the generated relation-extraction files; created if missing.
re_data_path = "data/re"
os.makedirs(re_data_path, exist_ok=True)

### Import data

In [6]:
# Read every training text file together with its concept (.con) and relation
# (.rel) annotations, producing one row per file.
text_files = glob.glob(os.path.join(train_data_path, txt_folder_name, "*.txt"))

# Rows are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every call.
records = []
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
    # Portable stem extraction: works with any path separator and keeps stems
    # that themselves contain dots.
    filename = os.path.splitext(os.path.basename(file))[0]
    concept = parse_concept(os.path.join(train_data_path, concept_folder_name, filename + ".con"))
    rel = parse_relation(os.path.join(train_data_path, rel_folder_name, filename + ".rel"))

    records.append({"text": text, "filename": filename, "concept": concept, "rel": rel})

df = pd.DataFrame(records, columns=["text", "filename", "concept", "rel"])
df.head()

100%|██████████| 170/170 [00:00<00:00, 463.57it/s]


Unnamed: 0,text,filename,concept,rel
0,143748600 SC\n43243309\n095342\n09/30/2000 12:...,143748600_SC,"{'concept_text': ['htn', 'atypical chest pain'...","{'concept_text_1': ['dm type ii', 'lead placem..."
1,Admission Date :\n2014-03-31\nDischarge Date :...,record-56,"{'concept_text': ['the procedure', 'auscultati...","{'concept_text_1': ['ultrasound', 'ultrasound'..."
2,699905656 SC\n06100044\n921737\n1/17/1993 12:0...,699905656_SC,{'concept_text': ['right sided sciatica proced...,"{'concept_text_1': ['ct myelogram', 'ct myelog..."
3,627258104\nGH\n56900479\n1/8/2001 12:00:00 AM\...,627258104,"{'concept_text': ['bradycardia', 'subsequent f...","{'concept_text_1': ['bradycardia', 'a chest xr..."
4,839999049 YC\n59242403\n431924\n11/01/2000 12:...,839999049_YC,"{'concept_text': ['pauses', 'outpt fitting', '...","{'concept_text_1': ['ett', 'desaturations in t..."


In [7]:
# Flatten the per-file concept dictionaries into one table with one row per
# annotated concept, tagged with the file it came from.
# The frames are concatenated once: DataFrame.append was removed in pandas 2.0
# and re-copies the accumulated frame on every call.
concept_frames = [
    pd.DataFrame(file["concept"]).assign(filename=file["filename"])
    for _, file in df.iterrows()
]
concept_df = pd.concat(concept_frames, ignore_index=True)

# Keep "filename" as the leading column, followed by the parsed concept fields
# (derived from the data itself instead of a variable left over from the loading loop).
concept_df = concept_df[["filename"] + [col for col in concept_df.columns if col != "filename"]]
concept_df

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,143748600_SC,htn,52,19,52,19,problem
1,143748600_SC,atypical chest pain,5,0,5,2,problem
2,143748600_SC,wellbutrin ( bupropion hcl ),33,19,33,23,treatment
3,143748600_SC,ett,58,3,58,3,test
4,143748600_SC,dissection,56,7,56,7,problem
...,...,...,...,...,...,...,...
16520,105732749,repeated fleet &apos;s enemas,53,4,53,7,treatment
16521,105732749,biopsy,23,2,23,2,test
16522,105732749,respirations,39,9,39,9,test
16523,105732749,these tests,61,11,61,12,test


In [8]:
# Build every candidate concept pair: for each file, pair up concepts that
# start on the same line, for the three supported directions
# (test -> problem, treatment -> problem, problem -> problem).
# Per-file frames are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
pair_frames = []
for fname in tqdm(df["filename"].unique()):
    concept_dict = parse_concept(train_data_path + os.sep + concept_folder_name + os.sep + fname + ".con")

    # Named file_concept_df so the corpus-wide `concept_df` is not overwritten.
    file_concept_df = pd.DataFrame(concept_dict).drop(columns=["end_line"])
    test_concept_df = file_concept_df[file_concept_df["concept_type"] == "test"]
    problem_concept_df = file_concept_df[file_concept_df["concept_type"] == "problem"]
    treatment_concept_df = file_concept_df[file_concept_df["concept_type"] == "treatment"]

    # class test --> problem
    test_problem_df = pd.merge(test_concept_df, problem_concept_df, how="inner", on="start_line", suffixes=("_1", "_2"))

    # class treatment --> problem
    treatment_problem_df = pd.merge(treatment_concept_df, problem_concept_df, how="inner", on="start_line", suffixes=("_1", "_2"))

    # class problem --> problem (both orderings of a pair are kept)
    problem_problem_df = pd.merge(problem_concept_df, problem_concept_df, how="inner", on="start_line", suffixes=("_1", "_2"))
    # Drops self-pairs by comparing the concept text, which also drops pairs of
    # distinct mentions that share the same text. TODO: remove duplicates ?
    problem_problem_df = problem_problem_df[problem_problem_df["concept_text_1"] != problem_problem_df["concept_text_2"]]

    tmp = pd.concat([test_problem_df, treatment_problem_df, problem_problem_df], axis=0)
    tmp["filename"] = fname
    pair_frames.append(tmp)

all_rel_df = (
    pd.concat(pair_frames, ignore_index=True)
    .sort_values(by=["filename", "start_line"])
    .reset_index(drop=True)
)

all_rel_df

100%|██████████| 170/170 [00:01<00:00, 90.12it/s]


Unnamed: 0,concept_text_1,start_line,start_word_number_1,end_word_number_1,concept_type_1,concept_text_2,start_word_number_2,end_word_number_2,concept_type_2,filename
0,c5-6 disc herniation,21.0,0.0,2.0,problem,cord compression,4.0,5.0,problem,018636330_DH
1,c5-6 disc herniation,21.0,0.0,2.0,problem,myelopathy,7.0,7.0,problem,018636330_DH
2,cord compression,21.0,4.0,5.0,problem,c5-6 disc herniation,0.0,2.0,problem,018636330_DH
3,cord compression,21.0,4.0,5.0,problem,myelopathy,7.0,7.0,problem,018636330_DH
4,myelopathy,21.0,7.0,7.0,problem,c5-6 disc herniation,0.0,2.0,problem,018636330_DH
...,...,...,...,...,...,...,...,...,...,...
15338,pressure,99.0,23.0,23.0,problem,shortness of breath,25.0,27.0,problem,record-84
15339,shortness of breath,99.0,25.0,27.0,problem,chest pain,20.0,21.0,problem,record-84
15340,shortness of breath,99.0,25.0,27.0,problem,pressure,23.0,23.0,problem,record-84
15341,a transdermal nicotine patch,100.0,34.0,37.0,treatment,cravings,49.0,49.0,problem,record-84


In [9]:
# Flatten the per-file relation dictionaries into one table of gold relations,
# tagged with the file each relation came from.
# The frames are concatenated once: DataFrame.append was removed in pandas 2.0
# and re-copies the accumulated frame on every call.
rel_frames = [
    pd.DataFrame(file["rel"]).assign(filename=file["filename"])
    for _, file in df.iterrows()
]
rel_df = pd.concat(rel_frames, ignore_index=True)

# "filename" first, then the parsed relation fields. Only the first concept's
# start line is kept (renamed to "start_line"); the other line columns are dropped.
rel_df = (
    rel_df[["filename"] + [col for col in rel_df.columns if col != "filename"]]
    .drop(columns=["end_line_1", "start_line_2", "end_line_2"])
    .rename(columns={"start_line_1": "start_line"})
)
rel_df

Unnamed: 0,filename,concept_text_1,start_line,start_word_number_1,end_word_number_1,concept_text_2,start_word_number_2,end_word_number_2,relation_type
0,143748600_SC,dm type ii,52,6,8,crfs,4,4,PIP
1,143748600_SC,lead placement,55,20,21,pain,10,10,TeRP
2,143748600_SC,t wave flattening laterally and inferiorly,55,3,8,pain,10,10,PIP
3,143748600_SC,chest ct,56,1,2,back pain,11,12,TeCP
4,143748600_SC,chest ct,56,1,2,dissection,7,7,TeCP
...,...,...,...,...,...,...,...,...,...
3115,105732749,a ct scan,27,18,20,a mass,24,25,TeRP
3116,105732749,percocet,58,0,0,pain,12,12,TrAP
3117,105732749,specimens,48,0,0,falciform nodules,14,15,TeCP
3118,105732749,the procedure,49,9,10,complications,3,3,TrCP


In [10]:
# Label every candidate pair: left-join the gold relations onto the candidates,
# matching on file, line, both word spans and both concept texts.
# NOTE(review): the cell rebinds `rel_df`, so it is not safe to re-run on its own.
merge_keys = ["filename", "start_line", "start_word_number_1", "end_word_number_1", "start_word_number_2", "end_word_number_2", "concept_text_1", "concept_text_2"]
rel_df = pd.merge(all_rel_df, rel_df, how="left", on=merge_keys, suffixes=("_1", "_2"))

# Pairs without a gold relation become the negative class "Other". Only the
# label column is filled, so a missing value in any other column is not
# silently replaced by the string "Other".
rel_df["relation_type"] = rel_df["relation_type"].fillna("Other")

# Line / word positions can come back as floats; restore the integer dtype.
position_cols = ["start_line", "start_word_number_1", "end_word_number_1", "start_word_number_2", "end_word_number_2"]
rel_df = rel_df.astype({col: int for col in position_cols})
rel_df

Unnamed: 0,concept_text_1,start_line,start_word_number_1,end_word_number_1,concept_type_1,concept_text_2,start_word_number_2,end_word_number_2,concept_type_2,filename,relation_type
0,c5-6 disc herniation,21,0,2,problem,cord compression,4,5,problem,018636330_DH,PIP
1,c5-6 disc herniation,21,0,2,problem,myelopathy,7,7,problem,018636330_DH,PIP
2,cord compression,21,4,5,problem,c5-6 disc herniation,0,2,problem,018636330_DH,Other
3,cord compression,21,4,5,problem,myelopathy,7,7,problem,018636330_DH,Other
4,myelopathy,21,7,7,problem,c5-6 disc herniation,0,2,problem,018636330_DH,Other
...,...,...,...,...,...,...,...,...,...,...,...
15338,pressure,99,23,23,problem,shortness of breath,25,27,problem,record-84,Other
15339,shortness of breath,99,25,27,problem,chest pain,20,21,problem,record-84,Other
15340,shortness of breath,99,25,27,problem,pressure,23,23,problem,record-84,Other
15341,a transdermal nicotine patch,100,34,37,treatment,cravings,49,49,problem,record-84,TrAP


In [11]:
# Label distribution after the merge; "Other" counts the pairs left without a gold relation.
rel_df["relation_type"].value_counts()

Other    12225
TeRP       992
TrAP       884
PIP        755
TrCP       184
TeCP       166
TrNAP       62
TrIP        51
TrWP        24
Name: relation_type, dtype: int64

Some relations in the `.rel` files refer to concepts that are missing from the `.con` files, so they never match a candidate pair. These relations are ignored.

In [59]:
# find difference between rel_df and new_rel_df
# difference_df = pd.merge(new_rel_df, rel_df, how="right", on=["filename", "start_line", "start_word_number_1", "end_word_number_1", "start_word_number_2", "end_word_number_2", "concept_text_1", "concept_text_2"], suffixes=("_1", "_2"))
# difference_df[difference_df.isnull().any(axis=1)]

Unnamed: 0,concept_text_1,start_line,start_word_number_1,end_word_number_1,concept_type_1,concept_text_2,start_word_number_2,end_word_number_2,concept_type_2,filename,relation_type_1,relation_type_2
1390,asa,74,0,0,,tricyclic,15,15,,record-177,,TeRP
1879,percocet,85,5,5,,pain relief,8,9,,record-18,,TrAP


In [13]:
def preprocess_text(row):
    """Replace the document text in ``row`` with one marked-up line.

    The line at the 1-based index ``row["start_line"]`` is taken from
    ``row["text"]`` and split on whitespace. The tokens of concept 1
    (``start_word_number_1``..``end_word_number_1``, inclusive) are wrapped in
    ``<< >>`` and those of concept 2 in ``[[ ]]``. Whichever concept starts
    first in the line is inserted first.

    The result is stored back in ``row["text"]`` and the same ``row`` object is
    returned.
    """
    tokens = row["text"].split("\n")[row["start_line"] - 1].split()

    first_start, first_end = row["start_word_number_1"], row["end_word_number_1"]
    second_start, second_end = row["start_word_number_2"], row["end_word_number_2"]
    first_marked = "<< " + " ".join(tokens[first_start:first_end + 1]) + " >>"
    second_marked = "[[ " + " ".join(tokens[second_start:second_end + 1]) + " ]]"

    # Put the spans in reading order so the slices below are taken left to right.
    if first_start > second_start:
        first_start, second_start = second_start, first_start
        first_end, second_end = second_end, first_end
        first_marked, second_marked = second_marked, first_marked

    pieces = (
        tokens[:first_start]
        + [first_marked]
        + tokens[first_end + 1:second_start]
        + [second_marked]
        + tokens[second_end + 1:]
    )
    row["text"] = " ".join(pieces)
    return row

# Attach each file's full text to its candidate pairs, then let
# preprocess_text reduce that text to the single marked-up line.
rel_df = pd.merge(rel_df, df[["filename", "text"]], on="filename", how="inner")
rel_df = rel_df.apply(preprocess_text, axis=1)

# Keep the columns used downstream; "start_line" and "relation_type" get shorter names.
rel_df = (
    rel_df[["filename", "start_line", "text", "concept_text_1", "concept_text_2", "relation_type", "concept_type_1", "concept_type_2"]]
    .rename(columns={"start_line": "line_num", "relation_type": "rel_type"})
)
rel_df

Unnamed: 0,filename,line_num,text,concept_text_1,concept_text_2,rel_type,concept_type_1,concept_type_2
0,018636330_DH,21,<< C5-6 disc herniation >> with [[ cord compre...,c5-6 disc herniation,cord compression,PIP,problem,problem
1,018636330_DH,21,<< C5-6 disc herniation >> with cord compressi...,c5-6 disc herniation,myelopathy,PIP,problem,problem
2,018636330_DH,21,[[ C5-6 disc herniation ]] with << cord compre...,cord compression,c5-6 disc herniation,Other,problem,problem
3,018636330_DH,21,C5-6 disc herniation with << cord compression ...,cord compression,myelopathy,Other,problem,problem
4,018636330_DH,21,[[ C5-6 disc herniation ]] with cord compressi...,myelopathy,c5-6 disc herniation,Other,problem,problem
...,...,...,...,...,...,...,...,...
15338,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",pressure,shortness of breath,Other,problem,problem
15339,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",shortness of breath,chest pain,Other,problem,problem
15340,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",shortness of breath,pressure,Other,problem,problem
15341,record-84,100,The patient was told he could return to work a...,a transdermal nicotine patch,cravings,TrAP,treatment,problem


In [14]:
# Write every pair as a headerless two-column TSV (marked text, label),
# then show the label distribution.
rel_df.loc[:, ["text", "rel_type"]].to_csv(
    os.path.join(re_data_path, "re_scibert_data.tsv"),
    sep="\t",
    header=False,
    index=False,
)
rel_df["rel_type"].value_counts()

Other    12225
TeRP       992
TrAP       884
PIP        755
TrCP       184
TeCP       166
TrNAP       62
TrIP        51
TrWP        24
Name: rel_type, dtype: int64

In [88]:
# Candidate pairs whose first concept is a test and whose second is a problem.
test_problem_df = rel_df.loc[
    rel_df["concept_type_1"].eq("test") & rel_df["concept_type_2"].eq("problem")
]
test_problem_df

Unnamed: 0,filename,line_num,text,concept_text_1,concept_text_2,rel_type,concept_type_1,concept_type_2
8,018636330_DH,27,She had << a workup >> by her neurologist and ...,a workup,cord compression,Other,test,problem
9,018636330_DH,27,She had << a workup >> by her neurologist and ...,a workup,a c5-6 disc herniation,Other,test,problem
10,018636330_DH,27,She had << a workup >> by her neurologist and ...,a workup,a t2 signal change,Other,test,problem
11,018636330_DH,27,She had a workup by her neurologist and << an ...,an mri,cord compression,TeRP,test,problem
12,018636330_DH,27,She had a workup by her neurologist and << an ...,an mri,a c5-6 disc herniation,TeRP,test,problem
...,...,...,...,...,...,...,...,...
15211,record-84,70,The patient had << an echocardiogram >> on day...,an echocardiogram,mild symmetric lvh,TeRP,test,problem
15212,record-84,70,The patient had << an echocardiogram >> on day...,an echocardiogram,mild region lv systolic dysfunction,TeRP,test,problem
15213,record-84,70,The patient had << an echocardiogram >> on day...,an echocardiogram,a moderately dilated aortic root,TeRP,test,problem
15214,record-84,70,The patient had << an echocardiogram >> on day...,an echocardiogram,a mildly dilated ascending aorta,TeRP,test,problem


In [89]:
# Reduce to (text, label), write a headerless TSV and show the label counts.
test_problem_df = test_problem_df.loc[:, ["text", "rel_type"]]
test_problem_df.to_csv(
    os.path.join(re_data_path, "re_scibert_data_Te_P.tsv"),
    sep="\t",
    header=False,
    index=False,
)
test_problem_df["rel_type"].value_counts()

Other    992
TeRP     992
TeCP     166
Name: rel_type, dtype: int64

In [84]:
# Candidate pairs whose first concept is a treatment and whose second is a problem.
treatment_problem_df = rel_df.loc[
    rel_df["concept_type_1"].eq("treatment") & rel_df["concept_type_2"].eq("problem")
]
treatment_problem_df

Unnamed: 0,filename,line_num,text,concept_text_1,concept_text_2,rel_type,concept_type_1,concept_type_2
27,018636330_DH,43,She had a postoperative CT scan that revealed ...,her hardware,partial decompression of the spinal canal,Other,treatment,problem
29,018636330_DH,47,[[ Her pain ]] was under good control with << ...,po pain medications,her pain,TrIP,treatment,problem
50,018636330_DH,55,"3. << Percocet >> , 5/325 , 1-2 tabs PO q4-6h ...",percocet,pain,TrAP,treatment,problem
51,026350193_RWH,45,Take << codeine >> prescribed by PCP with food...,codeine,constipation,Other,treatment,problem
52,026350193_RWH,45,Take << codeine >> prescribed by PCP with food...,codeine,nausea,Other,treatment,problem
...,...,...,...,...,...,...,...,...
15332,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",hydrochlorothiazide,chest pain,Other,treatment,problem
15333,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",hydrochlorothiazide,pressure,Other,treatment,problem
15334,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",hydrochlorothiazide,shortness of breath,Other,treatment,problem
15341,record-84,100,The patient was told he could return to work a...,a transdermal nicotine patch,cravings,TrAP,treatment,problem


In [85]:
# Reduce to (text, label), write a headerless TSV and show the label counts.
treatment_problem_df = treatment_problem_df.loc[:, ["text", "rel_type"]]
treatment_problem_df.to_csv(
    os.path.join(re_data_path, "re_scibert_data_Tr_P.tsv"),
    sep="\t",
    header=False,
    index=False,
)
treatment_problem_df["rel_type"].value_counts()

Other    1704
TrAP      884
TrCP      184
TrNAP      62
TrIP       51
TrWP       24
Name: rel_type, dtype: int64

In [86]:
# Candidate pairs in which both concepts are problems.
problem_problem_df = rel_df.loc[
    rel_df["concept_type_1"].eq("problem") & rel_df["concept_type_2"].eq("problem")
]
problem_problem_df

Unnamed: 0,filename,line_num,text,concept_text_1,concept_text_2,rel_type,concept_type_1,concept_type_2
0,018636330_DH,21,<< C5-6 disc herniation >> with [[ cord compre...,c5-6 disc herniation,cord compression,PIP,problem,problem
1,018636330_DH,21,<< C5-6 disc herniation >> with cord compressi...,c5-6 disc herniation,myelopathy,PIP,problem,problem
2,018636330_DH,21,[[ C5-6 disc herniation ]] with << cord compre...,cord compression,c5-6 disc herniation,Other,problem,problem
3,018636330_DH,21,C5-6 disc herniation with << cord compression ...,cord compression,myelopathy,Other,problem,problem
4,018636330_DH,21,[[ C5-6 disc herniation ]] with cord compressi...,myelopathy,c5-6 disc herniation,Other,problem,problem
...,...,...,...,...,...,...,...,...
15336,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",chest pain,shortness of breath,Other,problem,problem
15337,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",pressure,chest pain,Other,problem,problem
15338,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",pressure,shortness of breath,Other,problem,problem
15339,record-84,99,"Aspirin 325 mg q.d. , Plavix 75 mg q.d. , Lipi...",shortness of breath,chest pain,Other,problem,problem


In [87]:
# Reduce to (text, label), write a headerless TSV and show the label counts.
problem_problem_df = problem_problem_df.loc[:, ["text", "rel_type"]]
problem_problem_df.to_csv(
    os.path.join(re_data_path, "re_scibert_data_P_P.tsv"),
    sep="\t",
    header=False,
    index=False,
)
problem_problem_df["rel_type"].value_counts()

Other    9529
PIP       755
Name: rel_type, dtype: int64

## Old

In [183]:
# Legacy export: writes rel_df with all of its current columns as a headerless TSV.
rel_df.to_csv(
    os.path.join(re_data_path, "re_data_scibert.tsv"),
    sep="\t",
    header=False,
    index=False,
)

In [184]:
# Preview the first 10 rows of the legacy TSV export (the file has no header row).
pd.read_csv(
    os.path.join(re_data_path, "re_data_scibert.tsv"),
    sep="\t",
    header=None,
    nrows=10,
)

Unnamed: 0,0,1
0,41 yo man with [[ CRFs ]] of << DM Type II >> ...,PIP
1,"Here , had T wave flattening laterally and inf...",TeRP
2,"Here , had << T wave flattening laterally and ...",PIP
3,Had << Chest CT >> to r / o dissection ( due t...,TeCP
4,Had << Chest CT >> to r / o [[ dissection ]] (...,TeCP
5,<< Ultrasound >> was performed on 2014-04-01 d...,TeRP
6,<< Ultrasound >> was performed on 2014-04-01 d...,TeRP
7,Ultrasound was performed on 2014-04-01 demonst...,PIP
8,Ultrasound was performed on 2014-04-01 demonst...,TrAP
9,Ultrasound was performed on 2014-04-01 demonst...,TrAP


In [19]:
# Legacy export: one JSON object per row of rel_df (JSON Lines).
rel_df.to_json(
    os.path.join(re_data_path, "re_data.jsonl"),
    lines=True,
    orient="records",
)

In [20]:
# Preview the first 10 records of the JSON Lines export.
pd.read_json(
    os.path.join(re_data_path, "re_data.jsonl"),
    nrows=10,
    lines=True,
)

Unnamed: 0,text,rel_type
0,41 yo man with ## CRFs ## of ** DM Type II ** ...,PIP
1,"Here , had T wave flattening laterally and inf...",TeRP
2,"Here , had ** T wave flattening laterally and ...",PIP
3,Had ** Chest CT ** to r / o dissection ( due t...,TeCP
4,Had ** Chest CT ** to r / o ## dissection ## (...,TeCP
5,** Ultrasound ** was performed on 2014-04-01 d...,TeRP
6,** Ultrasound ** was performed on 2014-04-01 d...,TeRP
7,Ultrasound was performed on 2014-04-01 demonst...,PIP
8,Ultrasound was performed on 2014-04-01 demonst...,TrAP
9,Ultrasound was performed on 2014-04-01 demonst...,TrAP


In [None]:
# NOTE(review): `train_df` is not defined in any visible cell, so this cell fails on a fresh kernel.
# Presumably it was a headerless TSV loaded with pandas, column 1 being the label — TODO confirm or remove.
train_df[1].value_counts()

TeRP     993
TrAP     884
PIP      754
TrCP     184
TeCP     166
TrNAP     62
TrIP      51
TrWP      24
Name: 1, dtype: int64