In [1]:
import re
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
def get_concepts_from_file(file_path,file_name,source):
    """Parse one concept-annotation file into a list of concept records.

    A line is turned into a record when it contains the shape
    ``c="<text>" <line>:<begin> <line>:<end>||t="<type>"`` (matched
    case-insensitively); every other line is skipped.

    Args:
        file_path: Path of the annotation file. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        file_name: Identifier stored verbatim under ``'file_name'``.
        source: Label stored verbatim under ``'source'``.

    Returns:
        A list of dicts with the keys ``source``, ``file_name``, ``text``,
        ``line_number``, ``begin_word_num``, ``end_word_num`` (the three
        numbers as ``int``) and ``concept_type``. ``line_number`` comes from
        the first ``<line>``; the second ``<line>`` is not captured.
    """
    # Raw string so that ``\|`` reaches the regex engine untouched, compiled
    # once instead of once per line. ``[0-9]+`` (not ``*``) guarantees that the
    # captured numbers are non-empty, so ``int()`` below cannot fail on ''.
    concept_pattern = re.compile(
        r'c="(.*)" ([0-9]+):([0-9]+) [0-9]+:([0-9]+)\|\|t="(.*)"',
        re.IGNORECASE,
    )

    list_of_concepts = list()

    # The context manager closes the handle even if parsing raises.
    with open(file_path, 'r', encoding="utf8", errors='ignore') as annotation_file:
        for line in annotation_file:
            pattern_search = concept_pattern.search(line.strip())
            if not pattern_search:
                continue

            list_of_concepts.append({
                'source': source,
                'file_name': file_name,
                'text': pattern_search.group(1),
                'line_number': int(pattern_search.group(2)),
                'begin_word_num': int(pattern_search.group(3)),
                'end_word_num': int(pattern_search.group(4)),
                'concept_type': pattern_search.group(5),
            })

    return list_of_concepts

In [3]:
# Corpus layout: <parent of cwd>/Data/concept_assertion_relation_training_data/{beth,partners}.
# os.path.join yields the same backslash paths on Windows as the previous
# string concatenation, but also works on other platforms.
data_file_path = os.path.join(os.path.dirname(os.getcwd()), 'Data', 'concept_assertion_relation_training_data')
beth_file_path = os.path.join(data_file_path, 'beth')
partners_file_path = os.path.join(data_file_path, 'partners')

list_of_all_concepts = list()

# One loop for both sources instead of two copy-pasted loops.
for source, source_root in (('beth', beth_file_path), ('partners', partners_file_path)):
    concept_dir = os.path.join(source_root, 'concept')
    for file in os.listdir(concept_dir):
        file_path = os.path.join(concept_dir, file)
        # str.strip(".con") removes any of the characters '.', 'c', 'o', 'n'
        # from BOTH ends of the name (e.g. "cat-1.con" -> "at-1"), so cut the
        # literal ".con" suffix instead.
        file_name = file[:-len('.con')] if file.endswith('.con') else file
        list_of_all_concepts.extend(get_concepts_from_file(file_path, file_name, source))

In [4]:
concept_df = pd.DataFrame(list_of_all_concepts)

In [5]:
concept_df.head()

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,concept_type
0,beth,record-105,left basilar atelectasis,55,6,8,problem
1,beth,record-105,ventral hernia,143,1,2,problem
2,beth,record-105,htn,26,0,0,problem
3,beth,record-105,spontaneous echo contrast,68,1,3,problem
4,beth,record-105,cath,21,0,0,test


In [6]:
concept_df.groupby(['concept_type']).size()

concept_type
problem      7073
test         4608
treatment    4844
dtype: int64

In [7]:
def get_assertions_from_file(file_path,file_name,source):
    """Parse one assertion-annotation file into a list of assertion records.

    A line is turned into a record when it contains the shape
    ``c="<text>" <line>:<begin> <line>:<end>||t="<type>"||a="<assertion>"``
    (matched case-insensitively); every other line is skipped. The ``t="..."``
    part must be present but its value is not captured.

    Args:
        file_path: Path of the annotation file. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        file_name: Identifier stored verbatim under ``'file_name'``.
        source: Label stored verbatim under ``'source'``.

    Returns:
        A list of dicts with the keys ``source``, ``file_name``, ``text``,
        ``line_number``, ``begin_word_num``, ``end_word_num`` (the three
        numbers as ``int``) and ``assertion_type``.
    """
    # Raw string so that ``\|`` reaches the regex engine untouched, compiled
    # once instead of once per line. ``[0-9]+`` (not ``*``) guarantees that the
    # captured numbers are non-empty, so ``int()`` below cannot fail on ''.
    assertion_pattern = re.compile(
        r'c="(.*)" ([0-9]+):([0-9]+) [0-9]+:([0-9]+)\|\|t=".*"\|\|a="(.*)"',
        re.IGNORECASE,
    )

    list_of_assertions = list()

    # The context manager closes the handle even if parsing raises.
    with open(file_path, 'r', encoding="utf8", errors='ignore') as annotation_file:
        for line in annotation_file:
            pattern_search = assertion_pattern.search(line.strip())
            if not pattern_search:
                continue

            list_of_assertions.append({
                'source': source,
                'file_name': file_name,
                'text': pattern_search.group(1),
                'line_number': int(pattern_search.group(2)),
                'begin_word_num': int(pattern_search.group(3)),
                'end_word_num': int(pattern_search.group(4)),
                'assertion_type': pattern_search.group(5),
            })

    return list_of_assertions

In [8]:
# Collect the assertion records of every file under <source>\ast for both sources.
# NOTE(review): the full file name (including its extension) is stored as
# 'file_name' here, whereas the concept-loading cell removes ".con" first --
# confirm this is intended before joining the two frames on 'file_name'.
list_of_all_assertions = []

for corpus_root, corpus_name in ((beth_file_path, 'beth'), (partners_file_path, 'partners')):
    ast_dir = corpus_root + r'\ast'
    for file in os.listdir(ast_dir):
        file_path = os.path.join(ast_dir, file)
        list_of_all_assertions += get_assertions_from_file(file_path, file, corpus_name)

In [9]:
assertion_df = pd.DataFrame(list_of_all_assertions)

In [10]:
assertion_df.head()

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,assertion_type
0,beth,record-105.ast,left basilar atelectasis,55,6,8,present
1,beth,record-105.ast,ventral hernia,143,1,2,present
2,beth,record-105.ast,htn,26,0,0,present
3,beth,record-105.ast,spontaneous echo contrast,68,1,3,absent
4,beth,record-105.ast,80% lm lesion,21,6,8,present


In [11]:
assertion_df.groupby(['assertion_type']).size()

assertion_type
absent                          1596
associated_with_someone_else      89
conditional                       73
hypothetical                     382
possible                         309
present                         4624
dtype: int64

In [12]:
def get_relations_from_file(file_path,file_name,source):
    """Parse one relation-annotation file into a list of relation records.

    A line is turned into a record when it contains the shape
    ``c="<text>" <line>:<begin> <line>:<end>||r="<relation>"||c="<text>" <line>:<begin> <line>:<end>``
    (matched case-insensitively); every other line is skipped.

    Args:
        file_path: Path of the annotation file. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        file_name: Identifier stored verbatim under ``'file_name'``.
        source: Label stored verbatim under ``'source'``.

    Returns:
        A list of dicts with the keys ``source``, ``file_name``,
        ``relation_type`` and, for both the ``from_`` and the ``to_`` side,
        ``*_text``, ``*_line_number``, ``*_begin_word_num`` and
        ``*_end_word_num`` (numbers as ``int``).
    """
    # Raw string so that ``\|`` reaches the regex engine untouched, compiled
    # once instead of once per line. ``[0-9]+`` (not ``*``) guarantees that the
    # captured numbers are non-empty, so ``int()`` below cannot fail on ''.
    relation_pattern = re.compile(
        r'c="(.*)" ([0-9]+):([0-9]+) [0-9]+:([0-9]+)\|\|r="(.*)"'
        r'\|\|c="(.*)" ([0-9]+):([0-9]+) [0-9]+:([0-9]+)',
        re.IGNORECASE,
    )

    list_of_relations = list()

    # The context manager closes the handle even if parsing raises.
    with open(file_path, 'r', encoding="utf8", errors='ignore') as annotation_file:
        for line in annotation_file:
            pattern_search = relation_pattern.search(line.strip())
            if not pattern_search:
                continue

            list_of_relations.append({
                'source': source,
                'file_name': file_name,
                'from_text': pattern_search.group(1),
                'from_line_number': int(pattern_search.group(2)),
                'from_begin_word_num': int(pattern_search.group(3)),
                'from_end_word_num': int(pattern_search.group(4)),
                'relation_type': pattern_search.group(5),
                'to_text': pattern_search.group(6),
                'to_line_number': int(pattern_search.group(7)),
                'to_begin_word_num': int(pattern_search.group(8)),
                'to_end_word_num': int(pattern_search.group(9)),
            })

    return list_of_relations

In [13]:
# Collect the relation records of every file under <source>\rel for both sources.
# NOTE(review): the full file name (including its extension) is stored as
# 'file_name' here, whereas the concept-loading cell removes ".con" first --
# confirm this is intended before joining the frames on 'file_name'.
list_of_all_relations = []

for corpus_root, corpus_name in ((beth_file_path, 'beth'), (partners_file_path, 'partners')):
    rel_dir = corpus_root + r'\rel'
    for file in os.listdir(rel_dir):
        file_path = os.path.join(rel_dir, file)
        list_of_all_relations += get_relations_from_file(file_path, file, corpus_name)

In [14]:
relation_df = pd.DataFrame(list_of_all_relations)

In [15]:
relation_df.head()

Unnamed: 0,source,file_name,from_text,from_line_number,from_begin_word_num,from_end_word_num,relation_type,to_text,to_line_number,to_begin_word_num,to_end_word_num
0,beth,record-105.rel,cath,21,0,0,TeRP,80% lm lesion,21,6,8
1,beth,record-105.rel,pefusion imaging,19,6,7,TeRP,perfusion defects,19,12,13
2,beth,record-105.rel,drugs,12,8,8,TrCP,known allergies,12,5,6
3,beth,record-105.rel,metal plate,26,7,8,TrAP,gsw,26,11,11
4,beth,record-105.rel,creams,145,14,14,TrNAP,any incisions,145,20,21


In [16]:
relation_df.groupby(['relation_type']).size()

relation_type
PIP      755
TeCP     166
TeRP     993
TrAP     885
TrCP     184
TrIP      51
TrNAP     62
TrWP      24
dtype: int64

In [None]:
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# original author's machine. Consider deriving it from a configurable data directory.
text = r"C:/Users/itsma/Documents/CS 6120 Project/Data/concept_assertion_relation_training_data/beth/txt/record-105.txt"

# Read the whole record into `line`; the context manager closes the handle,
# which the previous bare open() never did.
with open(text, 'r') as oFile:
    line = oFile.read()

In [None]:
line

In [None]:
line.split("\n")[25].split()

In [None]:
# Sanity check: for every beth record, re-extract each annotated span from the
# raw text (1-based line number, 0-based inclusive word offsets after
# whitespace splitting) and print the annotation rows whose text does not match.
txt_dir = beth_file_path+r'\txt'
for file in tqdm(os.listdir(txt_dir)):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(txt_dir, file)
    # str.strip(".txt") would remove any of the characters '.', 't', 'x' from
    # BOTH ends of the name; cut the literal suffix instead (guaranteed present
    # by the endswith check above).
    file_name = file[:-len(".txt")]
    # Context manager so that one handle per record is not leaked.
    with open(file_path, 'r') as oFile:
        line = oFile.read()
    all_lines = line.split("\n")

    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='beth')]

    for index,row in file_concepts.iterrows():
        words = all_lines[int(row['line_number'])-1].split()[int(row['begin_word_num']):int(row['end_word_num'])+1]
        if(" ".join(words).lower()!=row['text'].lower()):
            print(row)

In [None]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# imports cell at the top so that dependencies are visible up front.
from transformers import BertTokenizer, BertModel
import torch

# Pretrained 'bert-base-uncased' tokenizer and model; `tokenizer` and `model`
# are reused by later cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

# Smoke test: tokenize one sentence into PyTorch tensors and run it through the model.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

In [None]:
inputs

In [None]:
encodings = tokenizer.encode("Hello, my dog is cute",add_special_tokens = True)

In [None]:
encodings

In [None]:
# Turn the token-id list from tokenizer.encode into a long tensor and add a
# leading dimension with unsqueeze(0) before feeding it to the model.
input_ids = torch.tensor(encodings).long().unsqueeze(0)
        
# Overwrites `outputs` from the earlier smoke-test cell.
outputs = model(input_ids,token_type_ids=None)

In [None]:
len(outputs[0][0][5].data.numpy())

In [24]:
# Import the local helper module, reload it so edits to BERT_utility.py are
# picked up without restarting the kernel, then bind the (fresh) class.
#
# These steps used to be four separate cells whose top-to-bottom order left the
# name `BERT_utility` bound to the *module* (the plain `import` ran after the
# class import), so `BERT_utility()` in the following cells raised TypeError on
# a fresh "Run All". `imp` is also deprecated and removed in Python 3.12;
# importlib.reload is its replacement.
import importlib

import BERT_utility as BERT_utility_module

BERT_utility_module = importlib.reload(BERT_utility_module)
# Take the class from the reloaded module so that instances use the new code.
BERT_utility = BERT_utility_module.BERT_utility

# Display the module (and the file it was loaded from), as the reload cell did.
BERT_utility_module

<module 'BERT_utility' from 'C:\\Users\\itsma\\Documents\\CS 6120 Project\\CS6120\\Code\\BERT_utility.py'>

In [25]:
# Instantiate the helper class imported from the local BERT_utility module.
# Its implementation is not part of this notebook.
utility = BERT_utility()
# Disabled one-off call, kept for reference:
#word_list = utility.process_string_finetune(line,0)

In [19]:
def create_pos_dict(concept):
    """Map every annotated word position to the concept type covering it.

    Args:
        concept: DataFrame whose rows provide ``line_number``,
            ``begin_word_num``, ``end_word_num`` and ``concept_type``.

    Returns:
        dict keyed by ``"<line_number>:<word_index>"`` for every word index
        from ``begin_word_num`` through ``end_word_num`` (inclusive) of each
        row, valued with that row's ``concept_type``. When two rows cover the
        same position the later row wins.
    """
    label_by_position = {}

    for _, annotation in concept.iterrows():
        covered_words = range(annotation['begin_word_num'], annotation['end_word_num'] + 1)
        label_by_position.update({
            str(annotation['line_number']) + ":" + str(word_idx): annotation['concept_type']
            for word_idx in covered_words
        })

    return label_by_position

In [26]:
utility = BERT_utility()

# Build one entry per word of every beth record and tag it with the concept
# type annotated at its "<sentence_index>:<word_index>" position, or "blank".
# NOTE(review): assumes process_string_finetune returns a list of dicts with
# 'sentence_index' and 'word_index' keys (as the printed output below
# suggests); a later cell unpacks two return values from the same call -- confirm
# which BERT_utility version this cell targets.
all_words_list = list()
txt_dir = beth_file_path+r'\txt'
for file in tqdm(os.listdir(txt_dir)):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(txt_dir, file)
    # str.strip(".txt") would remove any of the characters '.', 't', 'x' from
    # BOTH ends of the name; cut the literal suffix instead.
    file_name = file[:-len(".txt")]
    # Context manager so that one handle per record is not leaked.
    with open(file_path, 'r') as oFile:
        line = oFile.read()
    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='beth')]
    positions = create_pos_dict(file_concepts)

    word_list = utility.process_string_finetune(line,0)

    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        entry.update({"concept": positions.get(key, "blank")})

    all_words_list.extend(word_list)

  0%|                                                                                           | 0/74 [00:00<?, ?it/s]

{'word': 'Admission', 'keyword_vector': array([ 8.359914 , -2.136409 , -3.1871119, -2.7117975], dtype=float32), 'sentence_index': 1, 'word_index': 0}
{'word': 'Date', 'keyword_vector': array([ 8.403783 , -2.1000547, -3.126127 , -2.758141 ], dtype=float32), 'sentence_index': 1, 'word_index': 1}
{'word': ':', 'keyword_vector': array([ 8.456588 , -2.1012568, -3.163484 , -2.7142665], dtype=float32), 'sentence_index': 1, 'word_index': 2}
{'word': '2017-06-13', 'keyword_vector': array([ 8.409215 , -2.4124577, -3.0627542, -2.6660285], dtype=float32), 'sentence_index': 2, 'word_index': 0}
{'word': 'Discharge', 'keyword_vector': array([ 8.428832 , -2.3091574, -2.9810236, -2.7452831], dtype=float32), 'sentence_index': 3, 'word_index': 0}
{'word': 'Date', 'keyword_vector': array([ 8.411113 , -2.1695185, -3.0106492, -2.7994552], dtype=float32), 'sentence_index': 3, 'word_index': 1}
{'word': ':', 'keyword_vector': array([ 8.47194  , -2.146113 , -3.0365047, -2.7956421], dtype=float32), 'sentence_ind

  3%|██▏                                                                                | 2/74 [00:05<03:34,  2.98s/it]

{'word': ')*', 'keyword_vector': array([ 8.321417 , -2.2840736, -3.1504157, -2.2205102], dtype=float32), 'sentence_index': 119, 'word_index': 8}
{'word': 'Refills', 'keyword_vector': array([ 8.358825 , -2.2521574, -2.7439053, -2.8066409], dtype=float32), 'sentence_index': 120, 'word_index': 0}
{'word': ':', 'keyword_vector': array([ 8.382341 , -2.2559013, -2.7592168, -2.8554552], dtype=float32), 'sentence_index': 120, 'word_index': 1}
{'word': '*0*', 'keyword_vector': array([ 8.424569 , -2.1600502, -2.954429 , -2.757305 ], dtype=float32), 'sentence_index': 120, 'word_index': 2}
{'word': '3.', 'keyword_vector': array([ 8.368449 , -2.5521855, -2.9694316, -2.3371568], dtype=float32), 'sentence_index': 121, 'word_index': 0}
{'word': 'Docusate', 'keyword_vector': array([-2.6455605, -2.3385496, -2.1828146,  7.8187394], dtype=float32), 'sentence_index': 121, 'word_index': 1}
{'word': 'Sodium', 'keyword_vector': array([-2.31407  , -2.4000537, -2.4623303,  7.761673 ], dtype=float32), 'sentence_

  3%|██▏                                                                                | 2/74 [00:10<06:26,  5.37s/it]


KeyboardInterrupt: 

In [None]:
# Same tagging as for the beth records, appended to the same all_words_list:
# every word gets the concept type annotated at its
# "<sentence_index>:<word_index>" position, or "blank".
txt_dir = partners_file_path+r'\txt'
for file in tqdm(os.listdir(txt_dir)):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(txt_dir, file)
    # str.strip(".txt") would remove any of the characters '.', 't', 'x' from
    # BOTH ends of the name; cut the literal suffix instead.
    file_name = file[:-len(".txt")]
    # Context manager so that one handle per record is not leaked.
    with open(file_path, 'r') as oFile:
        line = oFile.read()
    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='partners')]
    positions = create_pos_dict(file_concepts)

    word_list = utility.process_string_finetune(line,0)

    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        entry.update({"concept": positions.get(key, "blank")})

    all_words_list.extend(word_list)

In [None]:
len(all_words_list)

In [None]:
words_df = pd.DataFrame(all_words_list)

In [None]:
set(list(words_df['concept']))

In [None]:
test_dict = {"a":"1", "b":"2"}

In [None]:
if("a" in test_dict):
    print(test_dict["a"])

In [None]:
X = np.vstack(list(words_df["keyword_vector"]))                                
y = words_df["concept"]  

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0,solver="lbfgs").fit(X, y)

In [None]:
clf.score(X,y)

In [None]:
# Reference annotations for the test set live under
# <parent of cwd>/Data/reference_standard_for_test_data/concepts.
# os.path.join yields the same backslash paths on Windows as the previous
# string concatenation, but also works on other platforms.
test_data_file_path = os.path.join(os.path.dirname(os.getcwd()), 'Data', 'reference_standard_for_test_data')

list_of_all_test_concepts = list()

test_concept_dir = os.path.join(test_data_file_path, 'concepts')
for file in os.listdir(test_concept_dir):
    file_path = os.path.join(test_concept_dir, file)
    # str.strip(".con") removes any of the characters '.', 'c', 'o', 'n' from
    # BOTH ends of the name; cut the literal ".con" suffix instead.
    file_name = file[:-len('.con')] if file.endswith('.con') else file
    list_of_all_test_concepts.extend(get_concepts_from_file(file_path, file_name, 'test_data'))

In [None]:
test_concept_df = pd.DataFrame(list_of_all_test_concepts)

In [None]:
test_concept_df.head()

In [None]:
# Build the word-level test set: one entry per word of every test record,
# tagged with the reference concept type at its
# "<sentence_index>:<word_index>" position, or "blank".
all_words_list_test = list()
test_data_texts_path =  os.path.dirname(os.getcwd()) + r'\Data\test_data'
for file in tqdm(os.listdir(test_data_texts_path)):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(test_data_texts_path, file)

    # str.strip(".txt") would remove any of the characters '.', 't', 'x' from
    # BOTH ends of the name; cut the literal suffix instead.
    file_name = file[:-len(".txt")]

    # Context manager so that one handle per record is not leaked.
    with open(file_path, 'r') as oFile:
        line = oFile.read()

    all_lines = line.split("\n")

    file_concepts = test_concept_df[(test_concept_df['file_name']==file_name)&(test_concept_df['source']=='test_data')]

    positions = create_pos_dict(file_concepts)

    # process_string_finetune is a method of the BERT_utility instance (as in
    # the training cells); the bare name is not defined anywhere in this
    # notebook and raised NameError.
    word_list = utility.process_string_finetune(line,0)

    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        entry.update({"concept": positions.get(key, "blank")})

    all_words_list_test.extend(word_list)

In [None]:
test_word_df = pd.DataFrame(all_words_list_test)

In [None]:
X_test = np.vstack(list(test_word_df["keyword_vector"]))                                
y_test = test_word_df["concept"]  

In [None]:
clf.score(X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Ground truth and predictions on the test set. These definitions used to live
# in cells *below* the confusion_matrix call, so a top-to-bottom run raised
# NameError; they are now defined before use.
y_true = y_test
y_predict = clf.predict(X_test)

# Fixes the row/column order of the confusion matrix.
labels = ['blank', 'problem', 'test', 'treatment']

# `labels` must be passed by keyword: it is keyword-only in scikit-learn's
# confusion_matrix signature.
confusion_matrix(y_true, y_predict, labels=labels)

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_true, y_predict, average='macro')

In [None]:
class_map = {"blank":0,"problem":1,"test":2,"treatment":3}

In [None]:
# Build the fine-tuning data set: for every line of every training record,
# `encoding_list` receives the line's encoding and `label_list` a parallel list
# of class ids (class_map values; 0 == "blank") with one slot per encoded token.
# NOTE(review): assumes process_string_finetune returns (word_list, encoding)
# where encoding[i] belongs to all_lines[i] and each word dict carries
# 'sentence_index' (1-based), 'word_index' and 'bert_token_positions' --
# BERT_utility is not visible here, confirm. Earlier cells use a single return
# value from the same method.
encoding_list = list()
label_list = list()
utility = BERT_utility()

# One loop for both sources instead of two copy-pasted loops.
for source, source_root in (('beth', beth_file_path), ('partners', partners_file_path)):
    txt_dir = source_root + r'\txt'
    for file in tqdm(os.listdir(txt_dir)):
        if(not file.endswith(".txt")):
            continue
        file_path = os.path.join(txt_dir, file)
        # str.strip(".txt") would remove any of the characters '.', 't', 'x'
        # from BOTH ends of the name; cut the literal suffix instead.
        file_name = file[:-len(".txt")]
        # Context manager so that one handle per record is not leaked.
        with open(file_path, 'r') as oFile:
            line = oFile.read()
        all_lines = line.split("\n")
        file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']==source)]
        positions = create_pos_dict(file_concepts)

        word_list, encoding = utility.process_string_finetune(line,0)

        # Group the words by sentence once, instead of rescanning the whole
        # word list for every line of the record.
        words_by_sentence = {}
        for entry in word_list:
            words_by_sentence.setdefault(entry["sentence_index"], []).append(entry)

        for i in range(len(all_lines)):
            # Named token_labels (not `labels`) so that the class-name list
            # used by the evaluation cells is not overwritten.
            token_labels = [0] * len(encoding[i])
            for entry in words_by_sentence.get(i+1, []):
                key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
                if(key in positions):
                    for token_position in entry["bert_token_positions"]:
                        token_labels[token_position] = class_map[positions[key]]
            label_list.append(token_labels)

        encoding_list.extend(encoding)

In [None]:
[0]*5

In [None]:
tokenizer.convert_ids_to_tokens([101, 2381, 1997, 2556, 7355, 1024, 102])

In [None]:
label_list[87]

In [None]:
encoding_list[87]

In [None]:
len(encoding_list)

In [None]:
import pickle

In [None]:
# Persist the encodings and their labels. The context managers flush and close
# the files deterministically; the previous bare open() handles were never
# closed, so the data could stay buffered until garbage collection.
# NOTE(review): absolute, machine-specific output paths.
with open("C:/Users/itsma/Documents/CS 6120 Project/input_ids.pkl","wb") as input_ids_file:
    pickle.dump(encoding_list,input_ids_file)
with open("C:/Users/itsma/Documents/CS 6120 Project/label.pkl","wb") as label_file:
    pickle.dump(label_list,label_file)

In [None]:
# Load the pickled fine-tuned model; the context manager closes the handle.
# SECURITY: pickle.load executes arbitrary code embedded in the file -- only
# load model files that come from a trusted source.
# NOTE(review): absolute, machine-specific path.
with open("C:/Users/itsma/Documents/CS 6120 Project/CS6120/Model/finetuned_model.pkl","rb") as model_file:
    finetuned_model = pickle.load(model_file)

In [None]:
encodings = tokenizer.encode("Hello, my dog is cute",add_special_tokens = True)

In [None]:
finetuned_model.cpu()

In [None]:
# Turn the token-id list from tokenizer.encode into a long tensor and add a
# leading dimension with unsqueeze(0) before feeding it to the model.
input_ids = torch.tensor(encodings).long().unsqueeze(0)
        
# Run the unpickled fine-tuned model on the sample sentence; overwrites `outputs`.
outputs = finetuned_model(input_ids,token_type_ids=None)

In [None]:
filt = [entry for entry in label_list if 3 in entry]