In [32]:
import pandas as pd
df_sample = pd.read_csv("sample_summary.csv")
# Display the first few rows
print(df_sample.head())

# Check DataFrame info
print(df_sample.info())

                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                              output  
0  Subjective:\n- Symptoms: Lower back pain, radi...  
1  Subjective:\n- Presenting with dry cough for 1...  
2  Subjective:\n- No known allergies to medicatio...  
3  Subjective:\n- Fever and dry cough started 4 d...  
4  Subjective:\n- Presenting with chest pain for ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   100 non-null    object
 1   output  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB
None


In [33]:
print(df_sample['output'].iloc[9])

Subjective:
- Presenting with cough that started yesterday, wet cough with yellowish-green sputum that smells bad and leaves a bad taste in mouth
- Shortness of breath started within last 12 hours, has been worsening
- Shortness of breath worse with smoking cigarettes and going out to get mail, had to sit down due to inability to get mail which is unusual
- Feels heart has been beating a little faster
- Felt a little nauseous with the bad taste in mouth
- No headache, runny nose, congestion, chest pain, abdominal pain, nausea, vomiting, diarrhea, constipation, or sore throat

PMHx:
- Hypertension
- GERD - still gets heartburn and acid in mouth at times despite PPI, worse with certain foods
- Hyperlipidemia
- Anxiety - no panic attacks
- Left knee replacement a few years ago

Medications: ACE inhibitor, statin, pantoprazole

SocHx:
- Retired
- Drinks 4-6 beers daily, had more than usual yesterday and friends reported he passed out drunk
- Smokes 0.5 PPD cigarettes, down from 1 PPD previ

In [34]:
import pandas as pd
import uuid

# Assuming df_sample is your DataFrame
# Generate a new column 'id' with a UUID for each row
df_sample['id'] = df_sample.apply(lambda x: str(uuid.uuid4()), axis=1)


In [4]:
df_sample.head(5)

Unnamed: 0,input,output,id
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi...",d3f29f4c-d851-4cfb-8845-5d6aa68c49fb
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...,37627204-129c-4588-a852-900458f4de08
2,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...,47c797bf-332d-4809-a259-44c2e064adf4
3,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...,bef04717-df21-4be7-aa7e-d7c1f80ea9f0
4,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...,11348c49-156c-48a8-8f54-5a480d0aa8da


In [9]:
!pip install -q huggingface_hub
from huggingface_hub import login
import os

# Use token from environment variable (safer)
login(os.getenv("HF_TOKEN"))


In [11]:
!pip install -q transformers

In [35]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd
import torch
from tqdm import tqdm

class SoapNERProcessor:
    def __init__(self, confidence_threshold: float = 0.75, max_token_length: int = 512, overlap: int = 50):
        """
        Initialize the SoapNERProcessor with a specified confidence threshold.
        Uses the samrawal/bert-base-uncased_clinical-ner model for NER.
        """
        print("\nInitializing SOAP NER Processor...")
        self.model_name = "samrawal/bert-base-uncased_clinical-ner"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.confidence_threshold = confidence_threshold
        self.max_token_length = max_token_length
        self.overlap = overlap

        device = 0 if torch.cuda.is_available() else -1
        self.ner_pipeline = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
            device=device
        )
        print("NER pipeline initialized successfully")

    def preprocess_text(self, text: str):
        return str(text).strip() if text else ""

    def split_text_into_chunks(self, text: str):
        """
        Split text into manageable chunks for NER processing to avoid exceeding the model's token limit.
        """
        chunks = []
        tokens = self.tokenizer.tokenize(text)
        max_chunk_size = self.max_token_length - self.overlap
        for i in range(0, len(tokens), max_chunk_size):
            chunk_tokens = tokens[i:i + self.max_token_length]
            chunks.append(self.tokenizer.convert_tokens_to_string(chunk_tokens))
        return chunks

    def extract_entities(self, text: str):
        """
        Extract entities from the clinical summary using the NER pipeline.
        """
        text = self.preprocess_text(text)
        if not text:
            return {"PROBLEM": [], "TREATMENT": [], "TEST": []}

        chunks = self.split_text_into_chunks(text)
        entities = {"PROBLEM": [], "TREATMENT": [], "TEST": []}
        for chunk in chunks:
            ner_results = self.ner_pipeline(chunk)
            for entity in ner_results:
                if entity['score'] > self.confidence_threshold:
                    entity_type = entity['entity_group'].upper()
                    entities[entity_type].append(entity['word'].replace("##", ""))

        return {k: list(set(v)) for k, v in entities.items()}

if __name__ == "__main__":
    print("\n=== Starting NER Processing for SOAP Data ===")
    processed_df = process_dataframe(
        df=df_sample,  
        text_column='output',  
        confidence_threshold=0.75  # Adjust as necessary
    )

    print("\nResults preview:")
    print(processed_df[['problems', 'treatments', 'tests']].head())



=== Starting NER Processing for SOAP Data ===

Initializing SOAP NER Processor...


Device set to use cuda:0


NER pipeline initialized successfully

Results preview:
                                            problems  \
0  [radiculopathy, the pain, weakness in the legs...   
1  [known sick contacts, flu - like illness, ning...   
2                                  [known allergies]   
3  [hee, sp, blood, symptoms, w, um, viral infect...   
4  [significant stress, chronic conditions, curre...   

                                          treatments  \
0  [anti - inflammatory medications, treatments, ...   
1  [appendect, omy, infection control measures, o...   
2                                      [medications]   
3                                     [multivitamin]   
4                [advil, tu, ms, tylenol, treatment]   

                                               tests  
0  [differential diagnoses, x - rays of the lower...  
1                         [sw, covid, swab, testing]  
2                                                 []  
3                         [covid, swab, temperatur

In [37]:
processed_df = processed_df.drop(['lifestyle', 'family_history', 'medications'], axis=1)


In [38]:
processed_df.head(40)

Unnamed: 0,input,output,id,entities,problems,treatments,tests
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi...",39a26c55-f710-4272-8609-2a725ef6d068,"{'PROBLEM': ['radiculopathy', 'the pain', 'wea...","[radiculopathy, the pain, weakness in the legs...","[anti - inflammatory medications, treatments, ...","[differential diagnoses, x - rays of the lower..."
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...,b1448089-3c41-423a-9737-0ed25e0e99c8,"{'PROBLEM': ['known sick contacts', 'flu - lik...","[known sick contacts, flu - like illness, ning...","[appendect, omy, infection control measures, o...","[sw, covid, swab, testing]"
2,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...,78cf9b57-3a1f-41df-ba55-af3bbe843228,"{'PROBLEM': ['known allergies'], 'TREATMENT': ...",[known allergies],[medications],[]
3,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...,44be7957-be23-4cc9-a5e2-cb7139dc2079,"{'PROBLEM': ['hee', 'sp', 'blood', 'symptoms',...","[hee, sp, blood, symptoms, w, um, viral infect...",[multivitamin],"[covid, swab, temperature]"
4,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...,3324cd95-1729-4b0b-8916-dba010ac811e,"{'PROBLEM': ['significant stress', 'chronic co...","[significant stress, chronic conditions, curre...","[advil, tu, ms, tylenol, treatment]","[physical examination, s signs, vital, assessm..."
5,Hi there! What brings you in today? Guest_fami...,Subjective:\n- Concern that the baby may have ...,78fbdd2e-ed8e-45f9-a8c1-ae3c82861f83,"{'PROBLEM': [], 'TREATMENT': ['the ant bait', ...",[],"[the ant bait, borax, ant bait, medications]",[]
6,"Are you allergic to anything? No, I am not all...",Subjective:\n- No allergies reported (includin...,2aebdcda-bb2f-4967-8426-9e216c02cd14,"{'PROBLEM': ['allergies', 'drug allergies'], '...","[allergies, drug allergies]",[],[]
7,"What brought you in today? Sure, I'm I'm just ...",Subjective:\n- Chest pain\n- Duration: Started...,d1f2e65d-dcdc-41a2-8208-0e0130b72489,"{'PROBLEM': ['lightheadedness', 'lesterol prob...","[lightheadedness, lesterol problems, cancers, ...","[recent hospitalisations, medications, prior s...","[physical exam, assessment, vital]"
8,"What brings you in? Yeah, I'm just coming in w...",Subjective:\n- 4 yo M presents with right ear ...,14d8c634-8731-4d2f-ba4d-d9703ea2ff3e,"{'PROBLEM': ['ha', 'red', 'systemic symptoms',...","[ha, red, systemic symptoms, w, ope, ty, pcn a...","[abx, ce, amoxicillin, furoxime]","[ear exam, vital]"
9,"What brings you in today? Yeah, so I've had th...",Subjective:\n- Presenting with cough that star...,2baf3abb-bc1b-419f-8a45-0003d6f914c4,"{'PROBLEM': ['previous knee injury', 'wet coug...","[previous knee injury, wet cough, pati, as, he...","[left knee replacement, antibiotics, statin, a...","[ups, age - appropriate screenings, ray, vital..."


In [39]:
# Saving the processed DataFrame to a CSV file
csv_file_path = 'processed_output.csv'  # Specify your desired file path
processed_df.to_csv(csv_file_path, index=False)  # Set index=False if you don't want to include row indices in the file
print(f"Data saved to {csv_file_path}")

Data saved to processed_output.csv
