## Dataset

In [9]:
import xml.etree.ElementTree as ET
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re, string



In [10]:
path = "Dataset/n2c2/part1/"
files = [f for f in listdir(path=path) if isfile(join(path, f))]
print(len(files))

202


In [11]:
def preprocess_text(text: str, lowercase=True, stopword_removal=True, stopwords_domain=[], min_length=2,  punctuation_removal=True,
                    does_stem=False, does_lemm=False):
    if text is None:
        return ""
    if lowercase:
        text = text.lower()
    if punctuation_removal:
        text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    if stopword_removal:
        stop_words = set(stopwords.words('english') + stopwords_domain)
        tokens = [word for word in tokens if word not in stop_words]
    if does_stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    if does_lemm:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if len(word) >= min_length]

    return " ".join(tokens)


In [12]:
dataset = []
for f_name in tqdm(files, desc="Extracting Data"):
    tree = ET.parse(path + f_name)
    root = tree.getroot()
    entry = {}
    text = root.find("TEXT")
    entry['text'] = preprocess_text(text=text.text, lowercase=True, stopword_removal=True, min_length=2, punctuation_removal=True,
                                    does_lemm=False, does_stem=False)
    tags = root.find("TAGS")   
    major_diabetes = tags.find("MAJOR-DIABETES")
    entry["major_diabetes"] = major_diabetes.attrib.get("met")
    abdominal = tags.find("ABDOMINAL")
    entry["abdominal"] = abdominal.attrib.get("met")
    creatinine = tags.find("CREATININE")
    entry["creatinine"] = creatinine.attrib.get("met")

    dataset.append(entry)


###  just for checking ####
# print(dataset[1]["abdominal"])
# print(dataset[1]["major_diabetes"])
# print(dataset[1]["creatinine"])



Extracting Data: 100%|██████████| 202/202 [00:01<00:00, 140.79it/s]


In [13]:
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,text,major_diabetes,abdominal,creatinine
0,record date 20631128 hpi 51 yo mmp comes estab...,met,met,not met
1,record date 20940129 internal medicine associa...,met,not met,met
2,record date 21210331 kern medical center pread...,met,met,met
3,record date 20691118 mr gallegos comes clinic ...,met,not met,not met
4,record date 20870221 dameron emergency dept vi...,met,met,not met


In [14]:
df.to_csv("Dataset/preprocessed/part1.csv", index=False)

In [15]:
df = pd.read_csv("Dataset/preprocessed/part1.csv")

In [16]:
df.head()

Unnamed: 0,text,major_diabetes,abdominal,creatinine
0,record date 20631128 hpi 51 yo mmp comes estab...,met,met,not met
1,record date 20940129 internal medicine associa...,met,not met,met
2,record date 21210331 kern medical center pread...,met,met,met
3,record date 20691118 mr gallegos comes clinic ...,met,not met,not met
4,record date 20870221 dameron emergency dept vi...,met,met,not met


In [4]:
from sklearn.model_selection import train_test_split


In [17]:
train_size = 0.8
test_size = 0.1
validation_size = 0.1

train_df, temp_df = train_test_split(df, test_size= 1 - train_size, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size= test_size / (test_size + validation_size), random_state=42, shuffle=True)


In [18]:
len_total = len(train_df) + len(val_df ) + len(test_df)
print(f"train size : {len(train_df)}/{len_total} = {len(train_df)/len_total}")
print(f"test size : {len(test_df)}/{len_total} = {len(test_df)/len_total}")
print(f"val size : {len(val_df)}/{len_total} = {len(test_df)/len_total}")

train size : 161/202 = 0.7970297029702971
test size : 21/202 = 0.10396039603960396
val size : 20/202 = 0.10396039603960396


In [19]:
train_df.to_csv("Dataset/preprocessed/part1/train.csv", index=False)
val_df.to_csv("Dataset/preprocessed/part1/val.csv", index=False)
test_df.to_csv("Dataset/preprocessed/part1/test.csv", index=False)