## Dataset

In [110]:
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re, string
from pathlib import Path
import json

In [111]:
def preprocess_text(text: str, lowercase=True, stopword_removal=True, stopwords_domain=[], min_length=2,  punctuation_removal=True,
                    does_stem=False, does_lemm=False):
    if text is None:
        return ""
    if lowercase:
        text = text.lower()
        return text
    if punctuation_removal:
        text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    if stopword_removal:
        stop_words = set(stopwords.words('english') + stopwords_domain)
        tokens = [word for word in tokens if word not in stop_words]
    if does_stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    if does_lemm:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if len(word) >= min_length]

    return " ".join(tokens)


In [112]:
path = "Dataset/n2c2/part2/"
files = [Path(f).stem for f in listdir(path=path) if isfile(join(path, f))]
files = list(set(files))
print(len(files))

303


In [113]:
def extract_text(txt_file):
    with open(txt_file, 'r') as f:
        text = f.read()
        return text

pattern = r'^(T\d+)\s+(\w+)\s+(\d+)\s+(\d+)\s+(.*)$'
def extract_ann(ann_file):
    with open(ann_file, 'r') as f:
        lines = f.readlines()
        all_anns = []
        for line in lines:
            match = re.match(pattern, line)
            if match:
                entry = {}
                entry['entity_id'] = match.group(1)
                entry['entity_type'] = match.group(2)
                entry['start_span'] = int(match.group(3))
                entry['end_span'] = int(match.group(4))
                entry['name_or_dosage'] = match.group(5)
                all_anns.append(entry)
        return all_anns


In [115]:
### now extracting data 
dataset = []

for file in tqdm(files, desc="Extracting and Cleaning Dataset"):
    data = {}
    txt = extract_text(path + file + ".txt")
    #TODO doing the desired preprocessing on txt
    txt = preprocess_text(text=txt, lowercase=True, stopword_removal=False, min_length=2,
                           punctuation_removal=False, does_stem=False, does_lemm=False)
    data['text'] = txt
    anns = extract_ann(path + file + ".ann")
    #TODO doing the desired preprocessing on anns
    for ann in anns:
        ann['name_or_dosage'] = preprocess_text(text=ann['name_or_dosage'], lowercase=True, min_length=1, punctuation_removal=False, stopword_removal=False)

    data['anns'] = anns
    
    dataset.append(data)


Extracting and Cleaning Dataset: 100%|██████████| 303/303 [00:00<00:00, 2328.25it/s]


In [116]:
print(len(dataset))
df = pd.DataFrame(dataset)

303


In [117]:
df.head()

Unnamed: 0,text,anns
0,admission date: [**2200-5-18**] ...,"[{'entity_id': 'T1', 'entity_type': 'Drug', 's..."
1,admission date: [**2147-10-12**] ...,"[{'entity_id': 'T3', 'entity_type': 'Drug', 's..."
2,admission date: [**2170-10-5**] ...,"[{'entity_id': 'T1', 'entity_type': 'Reason', ..."
3,admission date: [**2136-7-18**] ...,"[{'entity_id': 'T1', 'entity_type': 'Drug', 's..."
4,admission date: [**2199-5-1**] d...,"[{'entity_id': 'T1', 'entity_type': 'Drug', 's..."


In [118]:
df.to_csv("Dataset/preprocessed/part2.csv", index=False)

output_json = "Dataset/preprocessed/part2.json"
with open(output_json, 'w') as f:
    json.dump(dataset, f, indent=4)

print("Dataset saved")

Dataset saved


In [119]:
df = pd.read_csv("Dataset/preprocessed/part2.csv")

In [120]:
df.head()

Unnamed: 0,text,anns
0,admission date: [**2200-5-18**] ...,"[{'entity_id': 'T1', 'entity_type': 'Drug', 's..."
1,admission date: [**2147-10-12**] ...,"[{'entity_id': 'T3', 'entity_type': 'Drug', 's..."
2,admission date: [**2170-10-5**] ...,"[{'entity_id': 'T1', 'entity_type': 'Reason', ..."
3,admission date: [**2136-7-18**] ...,"[{'entity_id': 'T1', 'entity_type': 'Drug', 's..."
4,admission date: [**2199-5-1**] d...,"[{'entity_id': 'T1', 'entity_type': 'Drug', 's..."


In [128]:
r = list(eval(df["anns"].iloc[0]))
for k in r[0]:
    print(k, "-->>", r[0][k])

entity_id -->> T1
entity_type -->> Drug
start_span -->> 1794
end_span -->> 1802
name_or_dosage -->> percocet


In [129]:
print((df["anns"].iloc[0]))

[{'entity_id': 'T1', 'entity_type': 'Drug', 'start_span': 1794, 'end_span': 1802, 'name_or_dosage': 'percocet'}, {'entity_id': 'T3', 'entity_type': 'Strength', 'start_span': 8213, 'end_span': 8218, 'name_or_dosage': '10 mg'}, {'entity_id': 'T4', 'entity_type': 'Form', 'start_span': 8239, 'end_span': 8245, 'name_or_dosage': 'tablet'}, {'entity_id': 'T6', 'entity_type': 'Route', 'start_span': 8246, 'end_span': 8248, 'name_or_dosage': 'po'}, {'entity_id': 'T8', 'entity_type': 'Drug', 'start_span': 6615, 'end_span': 6623, 'name_or_dosage': 'percocet'}, {'entity_id': 'T10', 'entity_type': 'Form', 'start_span': 6822, 'end_span': 6828, 'name_or_dosage': 'liquid'}, {'entity_id': 'T11', 'entity_type': 'Dosage', 'start_span': 6834, 'end_span': 6842, 'name_or_dosage': '5-10 mls'}, {'entity_id': 'T13', 'entity_type': 'Route', 'start_span': 6843, 'end_span': 6845, 'name_or_dosage': 'po'}, {'entity_id': 'T16', 'entity_type': 'Strength', 'start_span': 7971, 'end_span': 7977, 'name_or_dosage': '100 mg

In [130]:
print(df["text"].iloc[0][8213:8218])

10 mg


In [145]:
from sklearn.model_selection import train_test_split

train, temp = train_test_split(df, test_size=0.21, random_state=42)
val, test = train_test_split(temp, test_size=0.50, random_state=42)

In [146]:
print(f"len train : {len(train)}")
print(f"len val : {len(val)}")
print(f"len test : {len(test)}")
print(f"train ratio = {len(train)/(len(train) + len(test) + len(val))}")
print(f"val ratio = {len(val)/(len(train) + len(test) + len(val))}")
print(f"test ratio = {len(test)/(len(train) + len(test) + len(val))}")

len train : 239
len val : 32
len test : 32
train ratio = 0.7887788778877888
val ratio = 0.10561056105610561
test ratio = 0.10561056105610561


In [147]:
train.to_csv("Dataset/preprocessed/part2/train.csv", index=False)
val.to_csv("Dataset/preprocessed/part2/val.csv", index=False)
test.to_csv("Dataset/preprocessed/part2/test.csv", index=False)


train_json = "Dataset/preprocessed/part2/train.json"
val_json = "Dataset/preprocessed/part2/val.json"
test_json = "Dataset/preprocessed/part2/test.json"

with open(train_json, 'w') as f:
    json.dump(train.to_dict(orient='records'), f, indent=4)

with open(val_json, 'w') as f:
    json.dump(val.to_dict(orient='records'), f, indent=4)

with open(test_json, 'w') as f:
    json.dump(test.to_dict(orient='records'), f, indent=4)

print("Datasets saved")


Datasets saved
