## About the data: 

* Extracted from: https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/
* Each ID has a text document (discharge summary)
* Each text document was annotated with the presence of one or more disease (16 types of diseases)
* Here we focus on the presence/absence of one disease condition at a time as a binary classification problem

### 1. Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

# For importing .xml files
import xml.etree.ElementTree as ET

# For handeling dataframes
import pandas as pd
import numpy as np
import re                                  # library for regular expression operations
import string                              # for string operations

### 2. Import text data
The datasets on the Harvard website come as multiple datasets in .xml format, and the text data and labels data are stored in separate files. After importing all the text data and the label data, we combine them into one single dataset.

In [2]:
# Read the 1st set of text data
file = 'data/obesity_patient_records_training.xml'
root = ET.parse(file).getroot()

text = []
ids = []
for i, child in enumerate(root):
    for _, subchild in enumerate(child):
        temp = [element.text for element in subchild]
        text.append(temp[0])
        ids.append(subchild.attrib["id"])

# Create a dataframe 
text_1 = pd.DataFrame(list(zip(ids, text)), columns=["id", "text"])

In [3]:
# Read the 2nd set of text data
file = 'data/obesity_patient_records_training2.xml'
root = ET.parse(file).getroot()

text = []
ids = []
for i, child in enumerate(root):
    for _, subchild in enumerate(child):
        temp = [element.text for element in subchild]
        text.append(temp[0])
        ids.append(subchild.attrib["id"])

# Create a dataframe 
text_2 = pd.DataFrame(list(zip(ids, text)), columns=["id", "text"])

In [4]:
# Read the 3rd set of text data
file = 'data/obesity_patient_records_test.xml'
root_test = ET.parse(file).getroot()

text = []
ids = []
for i, child in enumerate(root_test):
    for _, subchild in enumerate(child):
        temp = [element.text for element in subchild]
        text.append(temp[0])
        ids.append(subchild.attrib["id"])

# Create a dataframe 
text_3 = pd.DataFrame(list(zip(ids, text)), columns=["id", "text"])

In [5]:
# Combine all three text datasets
text_all = text_1.append(text_2.append(text_3))

In [7]:
# Check for duplicates
# np.sum(text_all.duplicated('id'))

### 3. Import labels data

In [6]:
# Read the 1st set of the labels data
file = 'data/obesity_standoff_intuitive_annotations_training.xml'
root_truth = ET.parse(file).getroot()

disease = []
label = []
ids = []
for i, child in enumerate(root_truth):
    for _, subchild in enumerate(child):       
        for _, element in enumerate(subchild):     
            disease.append(subchild.attrib['name'])
            label.append(element.attrib['judgment'])
            ids.append(element.attrib["id"])

# Create a dataframe 
labels_1 = pd.DataFrame(list(zip(ids, label, disease)), columns=["id", "label", "disease"])

In [7]:
# Read the 2nd set of the labels data
file = 'data/obesity_standoff_annotations_training_addendum3.xml'
root_test = ET.parse(file).getroot()

disease = []
label = []
ids = []
for i, child in enumerate(root_truth):
    for _, subchild in enumerate(child):       
        for _, element in enumerate(subchild):     
            disease.append(subchild.attrib['name'])
            label.append(element.attrib['judgment'])
            ids.append(element.attrib["id"])

# Create a dataframe 
labels_2 = pd.DataFrame(list(zip(ids, label, disease)), columns=["id", "label", "disease"])

In [8]:
# Read the 3rd set of the labels data
file = 'data/obesity_standoff_annotations_test_intuitive.xml'
root_truth = ET.parse(file).getroot()

disease = []
label = []
ids = []
for i, child in enumerate(root_truth):
    for _, subchild in enumerate(child):       
        for _, element in enumerate(subchild):     
            disease.append(subchild.attrib['name'])
            label.append(element.attrib['judgment'])
            ids.append(element.attrib["id"])

# Create a dataframe 
labels_3 = pd.DataFrame(list(zip(ids, label, disease)), columns=["id", "label", "disease"])

In [9]:
# Combine the 3 labels datasets
labels_all = labels_3.append(labels_1.append(labels_2))

In [10]:
# One patient can have multiple diseases
labels_all["disease"].value_counts()

Gout                    1696
OSA                     1689
Gallstones              1677
Hypertriglyceridemia    1660
Depression              1641
Diabetes                1623
Asthma                  1615
OA                      1594
PVD                     1579
CAD                     1562
Obesity                 1555
Hypertension            1508
Venous Insufficiency    1479
Hypercholesterolemia    1437
GERD                    1402
CHF                      924
Name: disease, dtype: int64

In [12]:
text_all.shape

(1237, 2)

In [13]:
# Save the data as csv files
text_all.to_csv("data/text_all.csv")
labels_all.to_csv("data/labels_all.csv")