# n2c2 EDA
### 2006 & 2014 Deidentification Challenge  
>* The 2014 data is a single XML file containing many records. In each record the PHI labels are embedded inline within the text.
>* The 2006 data has a separate XML file for each record. Each record keeps the text and the labels in separate sections.

In [1]:
import os
from bs4 import BeautifulSoup as bs
import lxml
import re
from tqdm.notebook import tqdm

### 2014

In [2]:
# Project root lives in the user's home directory.
proj_dir = os.path.join(os.path.expanduser('~'), 'redact')

# Raw input files: one XML document for the training split, one for the test split.
train_data_path, test_data_path = (
    os.path.join(proj_dir, relative_path)
    for relative_path in (
        'data/n2c2_train/deid_surrogate_train_all_version2.xml',
        'data/n2c2_test/deid_surrogate_test_all_version2.xml',
    )
)

# Destination folders for the processed records of each split.
train_output_dir, test_output_dir = (
    os.path.join(proj_dir, relative_path)
    for relative_path in (
        'data/n2c2_processed/2014/train/',
        'data/n2c2_processed/2014/test/',
    )
)

##### Train

In [3]:
# Read the whole training file into one string, then hand it to
# BeautifulSoup using the lxml parser.
with open(train_data_path, 'r') as train_file:
    content = train_file.read()
bs_content = bs(content, 'lxml')

In [4]:
# Collect every <record> element of the parsed training document.
records = bs_content.find_all('record')

Parse one record

In [5]:
# Use one record (index 5) to prototype the label extraction.
record = records[5]
text = record.find('text')
# All inline <phi> annotations of this record, in document order.
phis = text.find_all('phi')
# Plain text with the markup stripped, and the serialized markup itself.
text_wout_labels = str(text.get_text())
text_w_labels = str(text)

In [6]:
# Turn the inline <phi> annotations of the selected record into one BIO label
# per whitespace token of its plain text.
#
# Step 1 builds `phi_array`, a list of [label, token] pairs in document order:
# every PHI span contributes one pair per whitespace token, the first prefixed
# with 'B-' and the following ones with 'I-'.
# Step 2 walks the plain-text tokens and hands out those labels in order.

# A DATE token that starts like "MM/DD" (leading zeros optional, same pattern
# as the all-records cell) may have its year sitting just outside the
# annotation, so the year is glued back on below.
date_prefix_re = re.compile(r'(0?[1-9]|1[0-2])\/(0?[1-9]|1\d|2\d|3[01])')
# (pattern, number of characters) for the year suffix; the 4-digit form is
# tried before the 2-digit form.
year_suffixes = (
    (re.compile(r'\/\d\d\d\d'), 5),
    (re.compile(r'\/\d\d'), 3),
)
# Markup characters skipped between the DATE token and the year suffix.
# NOTE(review): 6 == len('</phi>'), presumably the closing tag - confirm.
closing_tag_len = 6

start_idx = 0  # search cursor inside the serialized markup
phi_array = []
for phi in phis:
    phi_type = phi['type']
    label_prefix = 'B-'
    for phi_text_token in phi.text.split():
        phi_start = text_w_labels.find(phi_text_token, start_idx)
        # If the token cannot be located in the markup, keep the cursor where
        # it is instead of letting the -1 from find() rewind the search.
        if phi_start != -1:
            phi_end = phi_start + len(phi_text_token)
            if phi_type == 'DATE' and date_prefix_re.match(phi_text_token):
                after_tag = phi_end + closing_tag_len
                for year_re, width in year_suffixes:
                    suffix = text_w_labels[after_tag:after_tag + width]
                    if year_re.match(suffix):
                        # Extend the token so it equals the plain-text token
                        # (e.g. "12/13" + "/1998").
                        phi_text_token += suffix
                        phi_end = after_tag + width
                        break
            start_idx = phi_end
        phi_array.append([label_prefix + phi_type, phi_text_token])
        label_prefix = 'I-'

# NOTE: alignment is by exact token equality, in order.  A PHI token that
# never appears verbatim among the plain-text tokens stops phi_idx from
# advancing, and every later token is then labelled 'O'.
tokens = text_wout_labels.split()
token_labels = []
phi_idx = 0
for token in tokens:
    if phi_idx < len(phi_array) and token == phi_array[phi_idx][1]:
        token_labels.append(phi_array[phi_idx][0])
        phi_idx += 1
    else:
        token_labels.append('O')

# Space-terminated strings, one entry per token (kept in this shape for the
# cells below that inspect them).
record_out = ''.join(f'{token} ' for token in tokens)
label_out = ''.join(f'{label} ' for label in token_labels)

In [7]:
# Sanity check: there must be exactly one label per token.
assert len(record_out.split()) == len(label_out.split())

In [8]:
# Display the space-separated tokens of the parsed record.
record_out

"220792313 HLGMC 1885323 085995 12/13/1998 12:00:00 AM FAILED LEFT TOTAL HIP REPLACEMENT . Unsigned DIS Report Status : Unsigned DISCHARGE SUMMARY NAME : LYSSFUST , NY UNIT NUMBER : 096-50-15 ADMISSION DATE : 12/13/98 DISCHARGE DATE : 12/17/98 PRINCIPAL DIAGNOSIS : Failed left total hip replacement . PRINCIPAL PROCEDURE : Left total hip replacement revision . ATTENDING PHYSICIAN : Li Rhalttland , M.D. HISTORY OF PRESENT ILLNESS : This is a 66-year-old Caucasian woman with a history of recurrent left hip dislocations . She 's had multiple medical problems including a severe chronic obstructive pulmonary disease and insulin dependent diabetes mellitus . She fell in October , 1998 , which may have been secondary to a left hip dislocation . She has sustained a right inter-trochanteric hip fracture both treated at Hoseocon Medical Center and transferred to the Heaonboburg Linpack Grant Medical Center for further care by Dr. Stable . She has a right hip dynamic hip screw in place for the int

In [9]:
# Display the space-separated BIO labels of the parsed record.
label_out

'B-ID B-HOSPITAL B-ID B-ID B-DATE O O O O O O O O O O O O O O O O O O B-PATIENT I-PATIENT I-PATIENT O O O B-ID O O O B-DATE O O O B-DATE O O O O O O O O O O O O O O O O O O O O O B-DOCTOR I-DOCTOR O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-DATE O O O O O O O O O O O O O O O O O O O O O O O O O B-HOSPITAL I-HOSPITAL I-HOSPITAL O O O O B-HOSPITAL I-HOSPITAL I-HOSPITAL I-HOSPITAL I-HOSPITAL O O O O O B-DOCTOR O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-DATE O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-DATE O O O O O O O O O O O O O O O O O B-LOC

Parse all records

In [10]:
# Convert every record in `records` into two parallel files:
#   <train_output_dir>/text/<record id>.txt    - whitespace tokens, space-joined
#   <train_output_dir>/labels/<record id>.txt  - one BIO label per token
text_out_dir = os.path.join(train_output_dir, 'text')
labels_out_dir = os.path.join(train_output_dir, 'labels')
# Create the output folders up front so the first open() cannot fail on a
# missing directory.
os.makedirs(text_out_dir, exist_ok=True)
os.makedirs(labels_out_dir, exist_ok=True)

# A DATE token that starts like "MM/DD" may have its year sitting just outside
# the annotation, so the year is glued back on below.
date_prefix_re = re.compile(r'(0?[1-9]|1[0-2])\/(0?[1-9]|1\d|2\d|3[01])')
# (pattern, number of characters) for the year suffix; the 4-digit form is
# tried before the 2-digit form.
year_suffixes = (
    (re.compile(r'\/\d\d\d\d'), 5),
    (re.compile(r'\/\d\d'), 3),
)
# Markup characters skipped between the DATE token and the year suffix.
# NOTE(review): 6 == len('</phi>'), presumably the closing tag - confirm.
closing_tag_len = 6

for record in tqdm(records):
    record_id = record['id']
    text = record.find('text')
    text_w_labels = str(text)         # serialized markup, searched below
    text_wout_labels = str(text.text)  # plain text, tokenized below
    phis = text.find_all('phi')

    # Step 1: [label, token] pairs for every PHI token, in document order.
    start_idx = 0  # search cursor inside the serialized markup
    phi_array = []
    for phi in phis:
        phi_type = phi['type']
        label_prefix = 'B-'
        for phi_text_token in phi.text.split():
            phi_start = text_w_labels.find(phi_text_token, start_idx)
            # If the token cannot be located in the markup, keep the cursor
            # where it is instead of letting the -1 from find() rewind it.
            if phi_start != -1:
                phi_end = phi_start + len(phi_text_token)
                if phi_type == 'DATE' and date_prefix_re.match(phi_text_token):
                    after_tag = phi_end + closing_tag_len
                    for year_re, width in year_suffixes:
                        suffix = text_w_labels[after_tag:after_tag + width]
                        if year_re.match(suffix):
                            # Extend the token so it equals the plain-text
                            # token (e.g. "12/13" + "/1998").
                            phi_text_token += suffix
                            phi_end = after_tag + width
                            break
                start_idx = phi_end
            phi_array.append([label_prefix + phi_type, phi_text_token])
            label_prefix = 'I-'

    # Step 2: hand the labels out to the plain-text tokens in order.
    # NOTE: alignment is by exact token equality.  A PHI token that never
    # appears verbatim among the tokens stops phi_idx from advancing, and
    # every later token of the record is then labelled 'O'.
    tokens = text_wout_labels.split()
    token_labels = []
    phi_idx = 0
    for token in tokens:
        if phi_idx < len(phi_array) and token == phi_array[phi_idx][1]:
            token_labels.append(phi_array[phi_idx][0])
            phi_idx += 1
        else:
            token_labels.append('O')

    record_out = ' '.join(tokens)
    label_out = ' '.join(token_labels)
    with open(os.path.join(text_out_dir, f'{record_id}.txt'), 'w') as text_file:
        text_file.write(record_out)
    with open(os.path.join(labels_out_dir, f'{record_id}.txt'), 'w') as label_file:
        label_file.write(label_out)

  0%|          | 0/669 [00:00<?, ?it/s]

##### Test  
The test data is unlabeled, so only the text files are written.

In [11]:
# Export the test split: one space-joined token file per record under
# <test_output_dir>/text/.  No label files are produced here.
with open(test_data_path, 'r') as file:
    content = file.read()
bs_content = bs(content, 'lxml')

records = bs_content.find_all('record')

test_text_dir = os.path.join(test_output_dir, 'text')
# Create the output folder up front so open() cannot fail on a missing
# directory.
os.makedirs(test_text_dir, exist_ok=True)

for record in tqdm(records):
    record_id = record['id']
    text = record.find('text')
    text_wout_labels = str(text.text)
    # Collapse all whitespace runs so tokens are separated by single spaces.
    record_out = ' '.join(text_wout_labels.split())
    with open(os.path.join(test_text_dir, f'{record_id}.txt'), 'w') as text_file:
        text_file.write(record_out)

  0%|          | 0/220 [00:00<?, ?it/s]

### 2006

In [12]:
# Folder holding one XML file per record.
train_data_dir = os.path.join(proj_dir, 'data/training-PHI-Gold-Set1/')
# NOTE: this rebinds train_output_dir, replacing the value assigned earlier in
# the notebook; cells run after this point write to the 2006 folder.
train_output_dir = os.path.join(proj_dir, 'data/n2c2_processed/2006/train/')

Parse one record

In [13]:
# Prototype the label extraction on a single per-record XML file.
def _space_out_punctuation(raw):
    """Return *raw* with . , ! ? ( ) set off by spaces.

    Runs of two or more whitespace characters are then collapsed to a single
    space, so that a later ``split()`` yields punctuation as separate tokens.
    """
    spaced = re.sub(r'([.,!?()])', r' \1 ', raw)
    return re.sub(r'\s{2,}', ' ', spaced)


full_path = os.path.join(train_data_dir, '278-02.xml')
with open(full_path, 'r') as file:
    content = file.read()
record = bs(content, 'lxml')

# Drop the first 9 and the last 3 characters of the text element.
# NOTE(review): presumably a '<![CDATA[' ... ']]>' wrapper - confirm.
text = record.find('text').text[9:-3]
# Add spaces around punctuation
text = _space_out_punctuation(text)

# Children of <tags>, skipping the newline strings between them.
tags = record.find('tags')
labels = [label for label in tags if label != '\n']

# [label, token] pairs for every annotated token, in the order the tags
# appear: the first token of a tag gets 'B-', the rest 'I-'.  The tag text is
# spaced the same way as the record text so the tokens can be compared.
label_array = []
for label in labels:
    label_type = label['type']
    label_prefix = 'B-'
    for label_token in _space_out_punctuation(label['text']).split():
        label_array.append([label_prefix + label_type, label_token])
        label_prefix = 'I-'

# NOTE: alignment is by exact token equality, in order.  An annotated token
# that never appears verbatim among the text tokens stops label_idx from
# advancing, and every later token is then labelled 'O'.
tokens = text.split()
token_labels = []
label_idx = 0
for token in tokens:
    if label_idx < len(label_array) and token == label_array[label_idx][1]:
        token_labels.append(label_array[label_idx][0])
        label_idx += 1
    else:
        token_labels.append('O')

# Space-terminated strings, one entry per token (kept in this shape for the
# cells below that display them).
record_out = ''.join(f'{token} ' for token in tokens)
label_out = ''.join(f'{token_label} ' for token_label in token_labels)

In [14]:
# Display the space-separated tokens of the parsed record.
record_out

"Record date: 2095-07-13 Team 1b Medical Intern Admission Note Pt: Jasso , Stephen MR#: 7990698 Date of Admission: 7/05/95 Attending: I . Ellsworth CC: Vertigo , EKG changes HPI: 73 yo with h/o of DM , CAD s/p anteroseptal MI ( 2091 ) and stent placement presents with 2 days of vertigo , most prominent upon wakening in the a . m . He reports feeling as if he is spinning while the room is stable; this sensation is worsened by changes in position or head movement . It diminishes only slightly as the day progresses . He denies any numbness , weakness , tingling , changes in vision or memory . He denies any recent URI . He does report tinnitus which has been present intermittently for years as well as bilateral sensorineural hearing secondary to gunshot exposure . No NV . The sensation of vertigo is superimposed upon one week of gradually progressive loss of balance , described as not knowing where to put his limbs in relation to space . He also reports ~7 days of mild diarrhea in a . m on

In [15]:
# Display the space-separated BIO labels of the parsed record.
label_out

'O O B-DATE O O O O O O O B-PATIENT I-PATIENT I-PATIENT O B-MEDICALRECORD O O O B-DATE O B-DOCTOR I-DOCTOR I-DOCTOR O O O O O O B-AGE O O O O O O O O O O O B-DATE O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-DATE O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-DATE O O O O O O O O O B-DATE O O O O O B-DATE O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Parse all records

In [16]:
# Convert every per-record XML file in train_data_dir into two parallel files:
#   <train_output_dir>/text/<record id>.txt    - tokens, space-joined
#   <train_output_dir>/labels/<record id>.txt  - one BIO label per token
def _space_out_punctuation(raw):
    """Return *raw* with . , ! ? ( ) set off by spaces.

    Runs of two or more whitespace characters are then collapsed to a single
    space, so that a later ``split()`` yields punctuation as separate tokens.
    """
    spaced = re.sub(r'([.,!?()])', r' \1 ', raw)
    return re.sub(r'\s{2,}', ' ', spaced)


text_out_dir = os.path.join(train_output_dir, 'text')
labels_out_dir = os.path.join(train_output_dir, 'labels')
# Create the output folders up front so the first open() cannot fail on a
# missing directory.
os.makedirs(text_out_dir, exist_ok=True)
os.makedirs(labels_out_dir, exist_ok=True)

# Only XML files are records; anything else in the folder is skipped rather
# than parsed.
xml_names = sorted(
    name for name in os.listdir(train_data_dir)
    if name.lower().endswith('.xml')
)

for path in tqdm(xml_names):
    # File name without its '.xml' extension.
    record_id = path[:-4]
    # Open and read file
    full_path = os.path.join(train_data_dir, path)
    with open(full_path, 'r') as file:
        content = file.read()
    record = bs(content, 'lxml')
    # Get text element, dropping its first 9 and last 3 characters.
    # NOTE(review): presumably a '<![CDATA[' ... ']]>' wrapper - confirm.
    text = record.find('text').text[9:-3]
    # Add spaces around punctuation
    text = _space_out_punctuation(text)
    # Get tags element; keep its children except the newline strings.
    tags = record.find('tags')
    labels = [label for label in tags if label != '\n']
    # Create label matrix: [label, token] per annotated token, 'B-' for the
    # first token of a tag and 'I-' for the rest.
    label_array = []
    for label in labels:
        label_type = label['type']
        label_prefix = 'B-'
        for label_token in _space_out_punctuation(label['text']).split():
            label_array.append([label_prefix + label_type, label_token])
            label_prefix = 'I-'
    # Create output: text and labels.
    # NOTE: alignment is by exact token equality, in order.  An annotated
    # token that never appears verbatim among the text tokens stops label_idx
    # from advancing, and every later token is then labelled 'O'.
    tokens = text.split()
    token_labels = []
    label_idx = 0
    for token in tokens:
        if label_idx < len(label_array) and token == label_array[label_idx][1]:
            token_labels.append(label_array[label_idx][0])
            label_idx += 1
        else:
            token_labels.append('O')
    record_out = ' '.join(tokens)
    label_out = ' '.join(token_labels)
    # Write output files
    with open(os.path.join(text_out_dir, f'{record_id}.txt'), 'w') as text_file:
        text_file.write(record_out)
    with open(os.path.join(labels_out_dir, f'{record_id}.txt'), 'w') as label_file:
        label_file.write(label_out)

  0%|          | 0/521 [00:00<?, ?it/s]