In [2]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes


In [3]:
!pip install --upgrade --quiet langchain neo4j langchain

In [4]:
!pip install -q langchain-community



In [5]:
import pandas as pd

# Define the bucket and file names
bucket_name = 'mimicivliza'  # Replace with your bucket name
mimic_iv_bhc = f's3://{bucket_name}/sample_data_100.csv'

# Load the files
mimic_iv_bhc_100 = pd.read_csv(mimic_iv_bhc)

# Display the data
mimic_iv_bhc_100.head(20)

Unnamed: 0,note_id,input,target,input_tokens,target_tokens
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,This is a ___ yo F admitted to the hospital af...,1195,75
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,Mr. ___ is a ___ yo man with CAD with prior MI...,3496,1143
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,Mr. ___ is a ___ w/ Ph+ve ALL on dasatanib and...,5591,1098
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,"On ___, Ms. ___ was admitted to the gynecology...",1119,221
4,17967161-DS-29,<SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...,Mr. ___ underwent an angiogram on ___ which sh...,3307,439
5,16956007-DS-20,<SEX> M <SERVICE> SURGERY <ALLERGIES> Codeine ...,Mr. ___ is a ___ who underwent an exploratory ...,4168,1209
6,16919911-DS-15,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Penicil...,This is a ___ year old female with a recent di...,2059,208
7,15682570-DS-25,<SEX> M <SERVICE> MEDICINE <ALLERGIES> No Know...,"___ w/ h/o CAD ___ CABG LIMA to LAD, SVG to D1...",2215,451
8,12135369-DS-24,<SEX> F <SERVICE> MEDICINE <ALLERGIES> Compazi...,Ms ___ is a ___ year old woman with a history ...,2132,416
9,11906321-DS-20,<SEX> M <SERVICE> NEUROSURGERY <ALLERGIES> Pat...,The patient was admitted to the neurosurgery s...,2347,316


In [6]:
print(mimic_iv_bhc_100['input'].iloc[0])

<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine / Thallium-201 / Blue Dye / Iodine-Iodine Containing <ATTENDING> ___. <CHIEF COMPLAINT> Morbid obesity, BMI of 51 <MAJOR SURGICAL OR INVASIVE PROCEDURE> lap gastric bypass <HISTORY OF PRESENT ILLNESS> The patient is a ___ woman with history of obesity, multiple medical problems with a history of 7 pound weight loss and regain. Comorbid conditions include sleep apnea, hypothyroidism, back pain, iron deficiency anemia and headaches. The patient has significant allergies particularly to the blue dye and iodine. The patient was evaluated at ___ ___ ___ Program deemed a good candidate for surgical weight loss. She understands the risks, benefits and alternatives of weight loss surgery. She agrees to diet, exercise, support group and lifelong medical follow-up particularly for B12, calcium and folate levels. <PAST MEDICAL HISTORY> Past medical history includes sleep apnea, hypothyroidism, back pain, urticaria for which she is on chronic steroids, 

In [7]:
# Extract 'note_id' and 'input' columns and save to a new dataframe
NER = mimic_iv_bhc_100[['note_id', 'input']]

# Display the first few rows of the new dataframe to verify
NER.head(2)
print(NER[['note_id', 'input']].iloc[0])

note_id                                       16002318-DS-17
input      <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...
Name: 0, dtype: object


In [9]:
print(NER['input'].iloc[9])

<SEX> M <SERVICE> NEUROSURGERY <ALLERGIES> Patient recorded as having No Known Allergies to Drugs <ATTENDING> ___. <CHIEF COMPLAINT> headache past ten days <MAJOR SURGICAL OR INVASIVE PROCEDURE> Image guided right craniotomy for tumor resection <HISTORY OF PRESENT ILLNESS> Mr. ___ is a ___ y/o male who was in good health until 10 days ago, when he began having gradually worsening headaches. These were right-sided and throbbing and initially controlled with over the counter analgesics, but are now refractory to these. He has had no visual changes in association, no N/V or drowsiness. He does not appreciate worsening with cough/strain or sneezing. The patient denies other difficulties, such as weakness, numbness/tingling, visual loss or diplopia, speech abnormalities, gait difficulties, vertigo, dysarthria or dysphagia. He was taken to an outside hospital today, where head CT revealed an intracranial mass lesion. He was given decadron and sent to ___ ER. <PAST MEDICAL HISTORY> renal cell

In [10]:
from huggingface_hub import login

# Use your Hugging Face token
login("")

In [11]:
NER.nunique()

note_id    100
input      100
dtype: int64

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
from typing import Dict, List
import torch
from tqdm import tqdm


class MimicNERProcessor:
    def __init__(self, confidence_threshold: float = 0.75, max_token_length: int = 512, overlap: int = 50):
        """
        Initialize the MimicNERProcessor with a specified confidence threshold.
        Uses the samrawal/bert-base-uncased_clinical-ner model for NER.
        """
        print("\nInitializing MimicNERProcessor...")
        self.model_name = "samrawal/bert-base-uncased_clinical-ner"
        print(f"Loading tokenizer from {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Loading model...")
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.confidence_threshold = confidence_threshold
        self.max_token_length = max_token_length
        self.overlap = overlap

        device = 0 if torch.cuda.is_available() else -1
        print(f"Using device: {'cuda' if device == 0 else 'cpu'}")
        print("Initializing NER pipeline...")
        self.ner_pipeline = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
            device=device
        )
        print("NER pipeline initialized successfully")

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess the input text by ensuring it's a string and removing NaN values.
        """
        if pd.isna(text):
            print("Found NaN text")
            return ""
        return str(text).strip()

    def split_text_into_chunks(self, text: str) -> List[str]:
        """
        Split text into overlapping chunks that fit the model's token limit.
        """
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_token_length,
            truncation=False,
            return_offsets_mapping=False,
        )
        input_ids = encoded['input_ids']
        chunks = []

        for i in range(0, len(input_ids), self.max_token_length - self.overlap):
            chunk = input_ids[i:i + self.max_token_length]
            chunks.append(self.tokenizer.decode(chunk, skip_special_tokens=True))
        return chunks

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract entities from the clinical note using the NER pipeline.
        Handles subword tokens to merge into meaningful entities.
        """
        text = self.preprocess_text(text)
        if not text:
            return {"PROBLEM": [], "TREATMENT": [], "TEST": []}

        try:
            chunks = self.split_text_into_chunks(text)
            entities = {"PROBLEM": [], "TREATMENT": [], "TEST": []}
            current_entity = ""
            last_entity_type = None

            for chunk in chunks:
                ner_results = self.ner_pipeline(chunk)

                for entity in ner_results:
                    if entity['score'] > self.confidence_threshold:
                        entity_text = entity['word']
                        entity_type = entity['entity_group'].upper()

                        # Merge subwords
                        if entity_text.startswith("##"):
                            current_entity += entity_text[2:]  # Append subword
                        else:
                            # Save the last entity if complete
                            if current_entity and last_entity_type == entity_type:
                                entities[last_entity_type].append(current_entity.strip())
                            current_entity = entity_text
                            last_entity_type = entity_type

            # Add the last buffered entity
            if current_entity and last_entity_type:
                entities[last_entity_type].append(current_entity.strip())

            # Remove duplicates and sort entities
            return {k: sorted(list(set(v))) for k, v in entities.items()}

        except Exception as e:
            print(f"Error in extract_entities: {str(e)}")
            return {"PROBLEM": [], "TREATMENT": [], "TEST": []}


def process_dataframe(
    df: pd.DataFrame,
    text_column: str = 'input',
    confidence_threshold: float = 0.75
) -> pd.DataFrame:
    """
    Process clinical notes DataFrame to extract entities using MimicNERProcessor.
    """
    print("\n=== Starting DataFrame Processing ===")
    processor = MimicNERProcessor(confidence_threshold=confidence_threshold)
    result_df = df.copy()

    tqdm.pandas(desc="Extracting entities")
    result_df['entities'] = result_df[text_column].progress_apply(processor.extract_entities)

    # Expand entities into separate columns
    result_df['problems'] = result_df['entities'].apply(lambda x: x.get("PROBLEM", []))
    result_df['treatments'] = result_df['entities'].apply(lambda x: x.get("TREATMENT", []))
    result_df['tests'] = result_df['entities'].apply(lambda x: x.get("TEST", []))

    # Print extraction statistics
    print("\nExtraction Statistics:")
    print(f"Notes with problems: {sum(len(p) > 0 for p in result_df['problems'])}")
    print(f"Notes with treatments: {sum(len(p) > 0 for p in result_df['treatments'])}")
    print(f"Notes with tests: {sum(len(p) > 0 for p in result_df['tests'])}")

    return result_df


if __name__ == "__main__":
    print("\n=== Starting NER Processing ===")
    if 'NER' not in globals():
        raise ValueError("The NER DataFrame is not defined!")
    
    processed_df = process_dataframe(
        df=NER,
        text_column='input',
        confidence_threshold=0.2
    )

    print("\nResults preview:")
    print(processed_df[['problems', 'treatments', 'tests']].head())



=== Starting NER Processing ===

=== Starting DataFrame Processing ===

Initializing MimicNERProcessor...
Loading tokenizer from samrawal/bert-base-uncased_clinical-ner


tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Loading model...


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cuda:0


Using device: cuda
Initializing NER pipeline...
NER pipeline initialized successfully


Extracting entities:   2%|▏         | 2/100 [00:10<08:26,  5.17s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Extracting entities: 100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Extraction Statistics:
Notes with problems: 100
Notes with treatments: 99
Notes with tests: 98

Results preview:
                                            problems  \
0  [101, 7 pound weight loss, a fever, a lower ab...   
1  [+, -, 1 cm area, a " cyst ", a 2cm distal cbd...   
2  [a 0. 7 x 0. 7 x 0. 7 cm simple cyst, a 2. 1 x...   
3  [a third - degree uterine prolapse, abnormal v...   
4  [101, abuse, acute pancreatitis, angina, anter...   

                                          treatments  \
0  [abdominal exercises, albuterol sulfate, all, ...   
1  [a bankart repair, a nicotine patch, a stent, ...   
2  [2, a prolonged course, ampicillin, anti -, an...   
3  [a stool softener, acetaminophen, admissionmis...   
4  [a, a 3 mm x 40 mm balloon percutaneous transl...   

                                               tests  
0                 [b12, bmi, calcium, physical exam]  
1  [., _, a, a ct scan, absbaso, abseos, abslymp,...  
2  [16s rdna primer set, aa, abl, acid fast cul




In [13]:
print(processed_df['tests'].iloc[9])

['- to - shin', 'alternating movements', 'anion gap', 'basos', 'br bi tri pat ach toes l', 'chloride', 'creat', 'ct of the head', 'double simultaneous stimulation', 'eos', 'estgfr', 'finger - to - nose', 'glucose', 'hct', 'hgb', 'joint position sense', 'light touch', 'lymphs', 'mch', 'mchc', 'mcv', 'monos', 'multiplanar t1 - and t2 - weighted imaging of the head', 'neuts', 'our', 'p resection', 'pinprick', 'plt count', 'potassium', 'prior studies', 'ptt', 'r', 'rbc', 'rdw', 'reflexes', 'resection', 'results', 'sodium', 'the', 'the ct angiogram', 'the post - contrast images', 'total co2', 'urea n', 'wbc']


In [14]:
processed_df.head(5)

Unnamed: 0,note_id,input,entities,problems,treatments,tests
0,16002318-DS-17,<SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...,"{'PROBLEM': ['101', '7 pound weight loss', 'a ...","[101, 7 pound weight loss, a fever, a lower ab...","[abdominal exercises, albuterol sulfate, all, ...","[b12, bmi, calcium, physical exam]"
1,15638884-DS-4,<SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...,"{'PROBLEM': ['+', '-', '1 cm area', 'a "" cyst ...","[+, -, 1 cm area, a "" cyst "", a 2cm distal cbd...","[a bankart repair, a nicotine patch, a stent, ...","[., _, a, a ct scan, absbaso, abseos, abslymp,..."
2,12435705-DS-14,<SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...,{'PROBLEM': ['a 0. 7 x 0. 7 x 0. 7 cm simple c...,"[a 0. 7 x 0. 7 x 0. 7 cm simple cyst, a 2. 1 x...","[2, a prolonged course, ampicillin, anti -, an...","[16s rdna primer set, aa, abl, acid fast cultu..."
3,12413577-DS-4,<SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...,{'PROBLEM': ['a third - degree uterine prolaps...,"[a third - degree uterine prolapse, abnormal v...","[a stool softener, acetaminophen, admissionmis...","[hct, hgb, mch, mchc, mcv, nadr, physical exam..."
4,17967161-DS-29,<SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...,"{'PROBLEM': ['101', 'abuse', 'acute pancreatit...","[101, abuse, acute pancreatitis, angina, anter...","[a, a 3 mm x 40 mm balloon percutaneous transl...","[angap, blood, blood calcium, blood ck (, bloo..."


In [15]:
# Save processed_df as a CSV file
processed_df.to_csv("processed_df.csv", index=False)


In [11]:


# Load the saved CSV file into a new DataFrame
new_df = pd.read_csv("processed_df.csv")

# Display the new DataFrame to verify
print(new_df.head())


          note_id                                              input  \
0  16002318-DS-17  <SEX> F <SERVICE> SURGERY <ALLERGIES> Iodine /...   
1   15638884-DS-4  <SEX> M <SERVICE> MEDICINE <ALLERGIES> Augment...   
2  12435705-DS-14  <SEX> M <SERVICE> MEDICINE <ALLERGIES> ibuprof...   
3   12413577-DS-4  <SEX> F <SERVICE> OBSTETRICS/GYNECOLOGY <ALLER...   
4  17967161-DS-29  <SEX> M <SERVICE> SURGERY <ALLERGIES> lisinopr...   

                                            entities  \
0  {'PROBLEM': ['101', '7 pound weight loss', 'a ...   
1  {'PROBLEM': ['+', '-', '1 cm area', 'a " cyst ...   
2  {'PROBLEM': ['a 0. 7 x 0. 7 x 0. 7 cm simple c...   
3  {'PROBLEM': ['a third - degree uterine prolaps...   
4  {'PROBLEM': ['101', 'abuse', 'acute pancreatit...   

                                            problems  \
0  ['101', '7 pound weight loss', 'a fever', 'a l...   
1  ['+', '-', '1 cm area', 'a " cyst "', 'a 2cm d...   
2  ['a 0. 7 x 0. 7 x 0. 7 cm simple cyst', 'a 2. ...   
3  ['a

In [12]:
new_df['input'].iloc[6]



In [14]:
new_df['problems'].iloc[6]

'[\'a\', \'abdominal distention\', \'abdominal pain\', \'adenocarcinoma\', \'all\', \'anicteric sclera\', \'anxiety\', \'any other symptoms\', \'aortic dissection\', \'ascites\', \'both lower lobes\', \'breast cancer\', \'bs\', \'caking of the omentum\', \'chest pain\', \'clubbing\', \'compression atelectasis\', \'crackles\', \'cyanosis\', \'decreased breath sounds at bases\', \'decreased sounds to midback\', \'depression\', \'diarrhea\', \'difficulty getting around\', \'disease\', \'distended\', \'edema\', \'elevateda\', \'excoriations\', \'fibroid ut\', \'guarding\', \'her ascites\', \'her effusion\', \'huge " amount\', \'hyperlipidemia\', \'increase\', \'increasingly\', \'interval increase\', \'large portion\', \'lesions\', \'llq\', \'low back pain\', \'malignant cells\', \'moderate - to - large bilateral pleural effusions\', \'morbid obesity\', \'murmur\', \'nad\', \'nephrolithiasis\', \'new ascites\', \'nontender\', \'now\', \'osa\', \'ovarian cancer\', \'patent nares\', \'perimen