## Dataset

In [67]:
import re
import string
import xml.etree.ElementTree as ET
from os import listdir, makedirs
from os.path import dirname, isfile, join

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm


In [62]:
# Directory containing the input records; every regular file in it is
# parsed as XML by the extraction loop further down.
path = "Dataset/n2c2/part1/"
# listdir() returns names in arbitrary order, so sort them to make the row
# order of the resulting dataset reproducible across machines and re-runs.
# isfile() drops sub-directories.
files = sorted(f for f in listdir(path=path) if isfile(join(path, f)))
print(len(files))

202


In [68]:
def preprocess_text(text: str, lowercase=True, stopword_removal=True, stopwords_domain=None, min_length=2,  punctuation_removal=True,
                    does_stem=False, does_lemm=False):
    """Normalise a raw text into a single space-separated string of tokens.

    The steps run in this fixed order, each one only if enabled:
    lowercase -> delete punctuation -> tokenize -> drop stop words ->
    stem -> lemmatize -> drop short tokens.

    Args:
        text: The raw text. ``None`` or an empty string yields ``""``.
        lowercase: Lowercase the text before any other step.
        stopword_removal: Drop tokens found in the NLTK English stop-word
            list (plus ``stopwords_domain``). Matching is exact, i.e.
            case-sensitive.
        stopwords_domain: Optional extra stop words. Any iterable of strings
            is accepted; a single string is treated as one word. ``None``
            (the default) adds nothing.
        min_length: Minimum number of characters a token must have to be
            kept. Applied last, so it sees stemmed/lemmatized tokens.
        punctuation_removal: Delete every character of ``string.punctuation``.
        does_stem: Apply ``PorterStemmer`` to each token.
        does_lemm: Apply ``WordNetLemmatizer`` to each token (after stemming
            when both flags are set).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Nothing to process; also avoids touching the tokenizer for empty input.
    if not text:
        return ""
    if lowercase:
        text = text.lower()
    if punctuation_removal:
        # Punctuation is deleted, not replaced by a space, so words that are
        # separated only by punctuation (e.g. "a/b") end up fused ("ab").
        text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    if stopword_removal:
        if stopwords_domain is None:
            stopwords_domain = ()
        elif isinstance(stopwords_domain, str):
            # A bare string would otherwise be split into single characters.
            stopwords_domain = (stopwords_domain,)
        stop_words = set(stopwords.words('english'))
        stop_words.update(stopwords_domain)
        tokens = [word for word in tokens if word not in stop_words]
    if does_stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    if does_lemm:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if len(word) >= min_length]

    return " ".join(tokens)


In [71]:
# Output column name -> name of the child element of <TAGS> whose `met`
# attribute holds the label for that criterion.
CRITERIA_TAGS = {
    "major_diabetes": "MAJOR-DIABETES",
    "abdominal": "ABDOMINAL",
    "creatinine": "CREATININE",
}

# One dict per input file: the preprocessed text plus one label per criterion.
dataset = []
for f_name in tqdm(files, desc="Extracting Data"):
    # join() works whether or not `path` ends with a separator.
    root = ET.parse(join(path, f_name)).getroot()

    text = root.find("TEXT")
    tags = root.find("TAGS")
    # find() returns None for a missing child; report which file is malformed
    # instead of failing later with an AttributeError on None.
    if text is None or tags is None:
        raise ValueError("{}: missing <TEXT> or <TAGS> element".format(f_name))

    entry = {}
    entry['text'] = preprocess_text(text=text.text, lowercase=True, stopword_removal=True, min_length=2, punctuation_removal=True,
                                    does_lemm=False, does_stem=False)
    for column, tag_name in CRITERIA_TAGS.items():
        criterion = tags.find(tag_name)
        if criterion is None:
            raise ValueError("{}: missing <{}> element under <TAGS>".format(f_name, tag_name))
        # .get() yields None when the element carries no `met` attribute.
        entry[column] = criterion.attrib.get("met")

    dataset.append(entry)


###  just for checking ####
# print(dataset[1]["abdominal"])
# print(dataset[1]["major_diabetes"])
# print(dataset[1]["creatinine"])



Extracting Data: 100%|██████████| 202/202 [00:00<00:00, 205.53it/s]


In [72]:
# Turn the list of per-file dicts into a table: one row per file, one column
# per dict key.
df = pd.DataFrame(data=dataset)
# Preview the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,text,major_diabetes,abdominal,creatinine
0,record date 21430403 pl neurology promptcarele...,met,not met,not met
1,record date 21450813 august 12 2145 dr mabel d...,met,not met,met
2,record date 20880626 personal data overall hea...,met,met,not met
3,record date 20690917 office note andrew conner...,not met,not met,not met
4,record date 20910702 cardiology beauregard mem...,not met,not met,not met


In [73]:
# to_csv() does not create missing parent directories, so make sure the
# output folder exists first (needed when running on a fresh checkout).
output_csv = "Dataset/preprocessed/part1.csv"
makedirs(dirname(output_csv), exist_ok=True)
# The default index=True is kept, so the row index is written as the first,
# unnamed column of the file.
df.to_csv(output_csv)