## Dataset

In [37]:
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re, string
from pathlib import Path
import json

In [7]:
def preprocess_text(text: str, lowercase=True, stopword_removal=True, stopwords_domain=None, min_length=2,  punctuation_removal=True,
                    does_stem=False, does_lemm=False):
    """Normalise ``text`` and return the surviving tokens joined by single spaces.

    The steps run in this fixed order: lowercase, strip ASCII punctuation,
    tokenize with ``word_tokenize``, drop stopwords, stem, lemmatize, and
    finally drop tokens shorter than ``min_length``.

    Args:
        text: Input string. ``None`` is accepted and yields ``""``.
        lowercase: Lowercase the text before any other step.
        stopword_removal: Remove NLTK English stopwords plus ``stopwords_domain``.
            Matching is exact (case-sensitive) on the tokens as they are at
            that point in the pipeline.
        stopwords_domain: Extra stopwords; any iterable of strings, a single
            string, or ``None`` for no extras.
        min_length: Minimum number of characters a token needs to be kept.
        punctuation_removal: Delete every character in ``string.punctuation``.
        does_stem: Apply ``PorterStemmer`` to each token.
        does_lemm: Apply ``WordNetLemmatizer`` to each token.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    if text is None:
        return ""
    if lowercase:
        text = text.lower()
    if punctuation_removal:
        text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    if stopword_removal:
        # A bare string would otherwise be split into characters by union().
        if isinstance(stopwords_domain, str):
            stopwords_domain = [stopwords_domain]
        # union() takes any iterable, so tuples and sets work as well as lists.
        stop_words = set(stopwords.words('english')).union(stopwords_domain or ())
        tokens = [word for word in tokens if word not in stop_words]
    if does_stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    if does_lemm:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # The length filter runs last so it sees the stemmed/lemmatized form.
    tokens = [word for word in tokens if len(word) >= min_length]

    return " ".join(tokens)


In [12]:
path = "Dataset/n2c2/part2/"
# Each file stem is collected once: the set comprehension removes duplicate stems
# (e.g. "x.txt" and "x.ann" both give "x"). Sorting makes the processing order, and
# therefore the row order of the saved dataset, identical on every run; neither
# listdir() nor set iteration guarantees a stable order.
files = sorted({Path(f).stem for f in listdir(path=path) if isfile(join(path, f))})
print(len(files))

303


In [26]:
def extract_text(txt_file, encoding="utf-8"):
    """Return the full contents of ``txt_file`` as one string.

    Args:
        txt_file: Path of the text file to read.
        encoding: Text encoding used to decode the file. It is set explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        The decoded file contents.
    """
    with open(txt_file, 'r', encoding=encoding) as f:
        return f.read()

# One text-bound annotation line: "<T-id> <type> <start> <end> <text>".
# The optional "(?:\d+;\d+\s+)*" part also accepts discontinuous spans written as
# "<start> <end>;<start> <end> ...". For those, group 3 is the first fragment's
# start and group 4 is the last fragment's end.
pattern = r'^(T\d+)\s+(\w+)\s+(\d+)\s+(?:\d+;\d+\s+)*(\d+)\s+(.*)$'
def extract_ann(ann_file, encoding="utf-8"):
    """Parse the text-bound (``T...``) annotations of an ``.ann`` file.

    Lines that do not match ``pattern`` (any line not starting with ``T<digits>``,
    for example) are skipped.

    Args:
        ann_file: Path of the annotation file.
        encoding: Text encoding used to decode the file.

    Returns:
        A list of dicts in file order, each with the keys ``entity_id``,
        ``entity_type``, ``start_span`` (int), ``end_span`` (int) and
        ``name_or_dosage`` (the annotated text). For a discontinuous annotation
        ``start_span``/``end_span`` are the outermost offsets.
    """
    all_anns = []
    with open(ann_file, 'r', encoding=encoding) as f:
        for line in f:
            match = re.match(pattern, line)
            if match is None:
                continue
            entity_id, entity_type, start, end, surface = match.groups()
            all_anns.append({
                'entity_id': entity_id,
                'entity_type': entity_type,
                'start_span': int(start),
                'end_span': int(end),
                'name_or_dosage': surface,
            })
    return all_anns

In [35]:
### Assemble the dataset: one {'text', 'anns'} record per file stem
# NOTE(review): start_span/end_span are stored exactly as parsed from the .ann file,
# while the stored text has been preprocessed -- presumably the offsets refer to the
# raw note text rather than to data['text']; confirm before using them downstream.
dataset = []

for file in tqdm(files, desc="Extracting and Cleaning Dataset"):
    # Note body: lowercase, strip punctuation, drop stopwords and 1-character tokens.
    txt = preprocess_text(
        text=extract_text(f"{path}{file}.txt"),
        lowercase=True,
        stopword_removal=True,
        min_length=2,
        punctuation_removal=True,
        does_stem=False,
        does_lemm=False,
    )

    # Annotations: clean each entity's text in place, keeping 1-character tokens.
    anns = extract_ann(f"{path}{file}.ann")
    for ann in anns:
        ann.update(name_or_dosage=preprocess_text(text=ann['name_or_dosage'], lowercase=True, min_length=1))

    data = {'text': txt, 'anns': anns}
    dataset.append(data)


Extracting and Cleaning Dataset: 100%|██████████| 303/303 [00:06<00:00, 46.84it/s]


In [36]:
# Sanity check: number of records collected in `dataset`.
print(len(dataset))

303


In [39]:

# Create the output folder first so both writes below succeed on a fresh checkout
# where it does not exist yet.
Path("Dataset/preprocessed").mkdir(parents=True, exist_ok=True)

# CSV copy: one row per record with the columns 'text' and 'anns' (plus the index).
# NOTE(review): each row's 'anns' list ends up in the CSV as its string form; the
# JSON file below is the structured copy.
df = pd.DataFrame(dataset)
df.to_csv("Dataset/preprocessed/part2.csv")

# JSON copy: the list of {'text', 'anns'} records, pretty-printed.
output_json = "Dataset/preprocessed/part2.json"
with open(output_json, 'w') as f:
    json.dump(dataset, f, indent=4)

print("Dataset saved")

Dataset saved
