In [18]:

import os
import re
from typing import Dict
import csv
import requests

import argparse
import json
from functools import partial
from typing import Any

import nemo_curator as nc
from nemo_curator import ScoreFilter, Sequential
from nemo_curator.filters import RepeatingTopNGramsFilter, WordCountFilter
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
from nemo_curator.modules.modify import Modify
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import add_distributed_args

from nemo_curator.download.doc_builder import (
    DocumentDownloader,
    DocumentExtractor,
    DocumentIterator,
)

In [9]:
class MedicalTextDownloader(DocumentDownloader):
    def __init__(self, download_dir: str):
        super().__init__()

        if not os.path.isdir(download_dir):
            os.makedirs(download_dir)

        self._download_dir = download_dir
        print("Download directory: ", self._download_dir)

    def download(self, url: str) -> str:
        filename = os.path.basename(url)
        filename = re.search(".+\.csv",filename).group()
        output_file = os.path.join(self._download_dir, filename)

        if os.path.exists(output_file):
            print(f"File '{output_file}' already exists, skipping download.")
            return output_file

        print(f"Downloading medical text dataset from '{url}'...")
        response = requests.get(url)

        with open(output_file, "wb") as file:
            file.write(response.content)

        return output_file

In [10]:
class MedicalTextIterator(DocumentIterator):

    def __init__(self):
        super().__init__()
        self._counter = -1


    def iterate(self, file_path):
        self._counter = -1
        file_name = os.path.basename(file_path)

        with open(file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()

        # Ignore the first line which contains the header.
        file_content = "".join(lines[1:])

        for line in file_content.split("\n"):
            self._counter += 1
            meta = {
                "filename": file_name,
                "id": f"medical_text-{self._counter}",
            }
            content = {"condition label":line[0:1],
                       "medical abstract":line[3:-1]}
            
            #print(content)
            
            
            # Skip if no content extracted
            if not content:
                continue

            record = {**meta, **content}
            yield record



In [11]:
DATA_DIR = "./download_text"
DATASET_URL = "https://huggingface.co/datasets/123rc/medical_text/resolve/main/test.csv?download=true"

def download_and_convert_to_jsonl() -> str:
    """
    Downloads the emails dataset and converts it to JSONL format.

    Returns:
        str: The path to the JSONL file.
    """

    # Download the dataset in raw format and convert it to JSONL.
    downloader = MedicalTextDownloader(DATA_DIR)
    output_path = os.path.join(DATA_DIR, "medical_texts.jsonl")
    raw_fp = downloader.download(DATASET_URL)

    iterator = MedicalTextIterator()

    # Parse the raw data and write it to a JSONL file.
    with open(output_path, "w") as f:
        for record in iterator.iterate(raw_fp):
            json_record = json.dumps(record, ensure_ascii=False)
            f.write(json_record + "\n")

    return output_path

In [5]:
path = download_and_convert_to_jsonl()

Download directory:  ./download_text
File './download_text/test.csv' already exists, skipping download.


In [21]:

def redact_pii(dataset: DocumentDataset, text_field) -> DocumentDataset:
    """
    Redacts personally identifiable information (PII) from a given dataset.

    Args:
        dataset (DocumentDataset): The dataset containing documents with PII.

    Returns:
        DocumentDataset: The redacted dataset with PII replaced by a generic value.
    """
    redactor = Modify(
        PiiModifier(
            supported_entities=[
                "PERSON",
            ],
            anonymize_action="replace",
            device="gpu",
        ),
        text_field=text_field,
    )
    return redactor(dataset)


def run_curation_pipeline(args:Any, jsonl_fp: str) -> str:
    """
    Run the curation pipeline on the dataset.

    Args:
        args (Any): Command-line arguments.
        jsonl_fp (str): The path to the uncurated JSONL file.

    Returns:
        str: The path to the curated JSONL file.
    """
    client = get_client(args, args.device)
    print(f"    Running the curation pipeline on '{jsonl_fp}'...")
    orig_dataset = DocumentDataset.read_json(jsonl_fp, add_filename=True)
    dataset = orig_dataset

    redact_pii_MA = partial(redact_pii, text_field="medical abstract")

    curation_steps = Sequential(
        [
            #
            # Unify the text encoding to Unicode.
            #
            Modify(UnicodeReformatter(), text_field="medical abstract"),
            #
            # Filtering
            #
            # Filter out empty emails.
            #
            # Redact personally identifiable information (PII).
            #
            redact_pii_MA,
            nc.Score(
                WordCountFilter(min_words=80).score_document,
                text_field="medical abstract",
                score_field="word_count",
                score_type=int,
            )
            
        ]
    )

    dataset = curation_steps(dataset)
    dataset = dataset.persist()

    print(f"    Original dataset length: {len(orig_dataset.df)}")
    print(f"    After running the curation pipeline: {len(dataset.df)}")
    print(f"    Writing to '{jsonl_fp}'...")
    out_path = os.path.join(
        os.path.dirname(jsonl_fp),
        "curated",
    )
    os.makedirs(out_path, exist_ok=True)
    dataset.to_json(out_path, write_to_filename=True)
    client.close()
    return os.path.join(out_path, os.path.basename(jsonl_fp))


def main():
    parser = argparse.ArgumentParser()
    parser = add_distributed_args(parser)
    args = parser.parse_args(args=[])
    # Limit the total number of workers to ensure we don't run out of memory.
    args.n_workers = min(args.n_workers, 1)

    # Prepare the download and JSONL directories.
    if not os.path.isdir(DATA_DIR):
        os.makedirs(DATA_DIR)

    jsonl_fp = download_and_convert_to_jsonl()
    run_curation_pipeline(args, jsonl_fp)


In [22]:
main()

Download directory:  ./download_text
File './download_text/test.csv' already exists, skipping download.


Perhaps you already have a cluster running?
Hosting the HTTP server on port 35989 instead


    Running the curation pipeline on './download_text/medical_texts.jsonl'...
Reading 1 files


2024-05-19 03:47:01 INFO:Loaded recognizer: EmailRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: PhoneRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: EmailRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: SpacyRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: PhoneRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: UsSsnRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: SpacyRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: CreditCardRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: UsSsnRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: IpRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: CreditCardRecognizer
2024-05-19 03:47:01 INFO:Loaded recognizer: IpRecognizer


    Original dataset length: 2889
    After running the curation pipeline: 2889
    Writing to './download_text/medical_texts.jsonl'...
Writing to disk complete for 1 partitions
