# Concept detection

In [1]:
# Reload edited project modules automatically before each cell execution.
%reload_ext autoreload
%autoreload 2

# Silence DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# change directory to project root
# NOTE(review): `%cd ../..` is relative to the current working directory, so
# re-running this cell on a live kernel starts from wherever the previous run
# ended; the absolute `%cd` below then overrides it.
%cd ../..
# NOTE(review): absolute, machine-specific path -- this cell fails on any
# machine where this directory does not exist.
%cd /mnt/d/Google Drive/projects/medical_txt_parser

import glob
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
from pprint import pprint

import matplotlib.pyplot as plt

from src.utils.parse_data import parse_ast, parse_concept, parse_relation

/mnt/d/Google Drive/projects/medical_txt_parser
/mnt/d/Google Drive/projects/medical_txt_parser


In [2]:
# Dataset split directories, given relative to the current working directory
# (the setup cell changes into the project root first).
train_data_path, val_data_path = "data/train", "data/val"

# Sub-folder names inside a split directory; the loading cell joins the
# "txt" and "concept" names onto `train_data_path`.
# NOTE(review): "ast" and "rel" are presumably the assertion / relation
# annotation folders -- they are not used in the cells visible here.
txt_folder_name = "txt"
concept_folder_name = "concept"
ast_folder_name = "ast"
rel_folder_name = "rel"

### Import data

In [3]:
# Load every training note and pair it with its parsed concept annotations.
#
# One record is collected per file and the DataFrame is built once at the end:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration (quadratic in the number of files).
text_files = glob.glob(os.path.join(train_data_path, txt_folder_name, "*.txt"))
filename = ""
records = []
for file in tqdm(text_files):
    with open(file, 'r') as f:
        text = f.read()
    # Take the file stem via os.path instead of splitting on "/" so the lookup
    # works with any path separator and keeps stems that contain dots intact.
    filename = os.path.splitext(os.path.basename(file))[0]
    # The matching annotation file lives in the concept folder as <stem>.con.
    concept = parse_concept(
        os.path.join(train_data_path, concept_folder_name, filename + ".con")
    )
    records.append({"text": text, "filename": filename, "concept": concept})

# Explicit columns keep the schema stable even when no text file was found.
df = pd.DataFrame(records, columns=["text", "filename", "concept"])
df.head()

  0%|          | 0/170 [00:00<?, ?it/s]

Unnamed: 0,text,filename,concept
0,018636330 DH\n5425710\n123524\n0144918\n6/2/20...,018636330_DH,"{'concept_text': ['a workup', 'pain', 'microsc..."
1,026350193 RWH\n7093319\n549304\n8417371\n6/5/2...,026350193_RWH,"{'concept_text': ['flexeril', 'constipation', ..."
2,037945397 RWH\n2690633\n194867\n151887\n10/17/...,037945397_RWH,"{'concept_text': ['ivf', 'near syncope', 'recu..."
3,044687343 ELMVH\n01719921\n1626859\n3/13/2006 ...,044687343_ELMVH,"{'concept_text': ['lisinopril pump', 'bipap', ..."
4,060376519 DH\n0649031\n323495\n3838556\n4/5/20...,060376519_DH,"{'concept_text': ['dizziness', 'benign positio..."


In [4]:
# Flatten the per-file concept dictionaries into one long table with one row
# per annotated concept, tagged with the file it came from.
#
# The per-file frames are collected in a list and concatenated once:
# `DataFrame.append` was removed in pandas 2.0 and appending inside the loop
# re-copies the accumulated frame on every iteration.
concept_frames = []
for _, file in df.iterrows():
    # Each `concept` value is a dict that pd.DataFrame expands into columns
    # (presumably equal-length lists, one entry per concept -- verify against
    # parse_concept).
    file_concepts = pd.DataFrame(file["concept"]).assign(filename=file["filename"])
    concept_frames.append(file_concepts)

if concept_frames:
    concept_df = pd.concat(concept_frames, ignore_index=True)
    # Keep "filename" as the first column, followed by the concept fields.
    concept_df = concept_df[
        ["filename"] + [col for col in concept_df.columns if col != "filename"]
    ]
else:
    # No documents were loaded: return an empty table instead of failing.
    concept_df = pd.DataFrame(columns=["filename"])
concept_df.head()

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type
0,018636330_DH,a workup,27,2,27,3,test
1,018636330_DH,pain,55,10,55,10,problem
2,018636330_DH,microscopic anterior cervical diskectomy at c5-6,23,0,23,5,treatment
3,018636330_DH,hyperlipidemia,29,4,29,4,problem
4,018636330_DH,po pain medications,47,7,47,9,treatment


### Dataset Consolidation

In [6]:
# Sanity check: display every concept row whose start_line and end_line
# differ. An empty result means each concept stays on a single line.
concept_df.loc[concept_df["start_line"].ne(concept_df["end_line"])]

Unnamed: 0,filename,concept_text,start_line,start_word_number,end_line,end_word_number,concept_type


In [5]:
# print a random text
consolidated_df = {}

for i, row in concept_df.iterrows():
    filename = row["filename"]
    text = df[df["filename"] == filename]["text"].values[0]
    line = text.split("\n")[row["start_line"]] # NOTE: we assume that start_line == end_line
    # find wo
    if filename in consolidated_df:
        consolidated_df[filename]["prob_indices_start"] = row
    else:
        consolidated_dataset[filename] = {
            "text": line,
            "problem" : [],
            "test": [],
            "treatment": [],
            # use sets because the indices can repeat for various reasons
            "drug_indices_start": set(row["indexes"]["drug"]["start_char"]),
            "drug_indices_end": set(row["indexes"]["drug"]["end_char"]),
            "problem_indices_start": set(),
            "problem_indices_end": set(),
            "test_indices_start": set(),
            "test_indices_end": set(),
            "treatment_indices_start": set(),
            "treatment_indices_end": set(),
        }
        if row["concept"] == "problem":
            consolidated_dataset[filename]["problem"].append(row["concept_text"])
            consolidated_dataset[filename]["problem_indices_start"].add(
