In [1]:
#Step 1: Prerequisites:
#Python 3.10.*
#pip install torch, jupyter, transformers, ipywidgets, pandas, re, medspacy

#Step 2: Import packages
import re
import string
import math
import pandas as pd
import numpy as np
import medspacy
from medspacy.visualization import visualize_ent


In [2]:
#Step 3: Read Notes and Annotations Data
df_notes = pd.read_csv("../data-old/NOTEEVENTS.csv")
df_annotations = pd.read_csv('../data-old/MIMIC-SBDH-main/MIMIC-SBDH.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data-old/NOTEEVENTS.csv'

In [6]:
#Step 4: Filter out data to only include annotated notes and their SBDH values
print(df_notes["TEXT"].tolist()[901])

# Columns will be the annotation table columns + TEXT
columns = df_annotations.columns.tolist()
columns.append("TEXT")
annotated_rows = []

#Iterate over annotations of MIMIC-III notes and store concatenation of input and outputs
for index, row in df_annotations.iterrows():
    #get row ID and note text for each entry
    rowId = row["row_id"]
    note = df_notes.loc[df_notes["ROW_ID"] == rowId]
    note = note.iloc[0] #access the first (only) row A.K.A the discharge detail note itself
    note = str(note["TEXT"])

    #Combine the annotations data with the note
    row_list = row.tolist()
    row_list.append(note)

    #Add to the list of annotated notes
    annotated_rows.append(row_list)

#Store the list of rows as a dataframe with the described columns, and save
df_annotated_notes = pd.DataFrame(annotated_rows, columns=columns)
df_annotated_notes.to_csv('../data/ANNOTATEDNOTES.csv')
    

Admission Date:  [**2180-6-12**]       Discharge Date:  [**2180-7-5**]


Service:  VASCULAR SURGERY

CHIEF COMPLAINT:  Ischemic right fifth toe ulcer.

HISTORY OF PRESENT ILLNESS:  This is a 79-year-old white
female with coronary artery disease, status post myocardial
infarction with coronary artery bypass grafting in [**2170**],
myocardial infarction with congestive heart failure in
[**2179-11-9**], with diabetes, end-stage renal disease on
hemodialysis, status post left above-knee amputation in [**2175**],
who complained of an eight-month history of right forefoot
ulceration.  In spite of treatment, the patient's left fifth
toe ulceration has not healed.

Over the previous week prior to admission, the patient noted
changes in the color of her right toes.  She denied rest
pain.  She complained of prior symptoms of right lower
extremity claudication, although currently she is wheelchair
bound.  She has a left lower extremity prosthesis, which she
does not use.

The patient was seen in 

KeyboardInterrupt: 

In [3]:
#Step 5 Setup Medspacy

# Use the medspacy library
nlp = medspacy.load()
sectionizer = nlp.add_pipe("medspacy_sectionizer")
df_annotated_notes = pd.read_csv("../data/ANNOTATEDNOTES.csv")

In [4]:
#Step 5 Pre-processing

#Step 5a Select the desired parts of the clinical note (Social History)

# Extract the social history and past medical history from each note
# NOTE: This takes a long time to run
note_social_histories = []
current_note_count = 0

for index, row in df_annotated_notes.iterrows():
    social_found = False
    past_med_found = False
    note = str(row["TEXT"])
    doc = nlp(note)
    note_text = ""

    relevant_sections = (section for section in doc._.sections if section.category == 'social_history' or section.category == 'past_medical_history')

    for section in relevant_sections:
        if (section.category == 'social_history'):
            social_found = True
            social_history = section.body_span
            social_history_text = 'SOCIAL HISTORY:\n' + str(doc[social_history[0]:social_history[1]])
            note_text += social_history_text + ' '
        elif (section.category == 'past_medical_history' and social_found and not past_med_found): #need the past medical history because for example, row 339, it is inside social history
            past_med_found = True
            past_history = section.body_span
            past_history_text = str(doc[past_history[0]:past_history[1]])
            note_text += past_history_text + ' '
        else:
            if (social_found): break

    current_note_count += 1
    
    if (social_found):
        note_social_histories.append(note_text)
        print("Processed:", current_note_count, note_text)    #log progress
    else:
        print(f"No social history found in note {current_note_count}")
   



Processed: 1 SOCIAL HISTORY:
 She smokes a pack per day.

 
Processed: 2 SOCIAL HISTORY:

Social history is significant for the absence of current tobacco

use; she smoked [**12-15**] PPD from age 18 to age 60. There is no
history of alcohol abuse; she occasionally has wine. Uses a
walker; no recent falls.


 
Processed: 3 SOCIAL HISTORY:

Right inguinal hernia repair in childhood
Cervical discectomy 3 years ago
Umbilical hernia repair [**2137**]

 SOCIAL HISTORY:

SHx: Retired schoolteacher, now substitutes. Lives with wife in

[**Location (un) 1439**]. Has a 27 yo son and a 25 yo daughter. [**Name (NI) **] past or present
smoking hx, no EtOH

 
Processed: 4 SOCIAL HISTORY:

- Tobacco: smokes 1-1.5ppd x 30yrs
- Alcohol: none per mother
- [**Name (NI) 3264**]: hx of cocaine and marijuana abuse, none for past
20yrs
- Divorced 20yrs ago, has 2 adult children, lives with mother in
[**Name (NI) 3494**]


 Catatonia and worsening rhonchi. Suspect
pneumonia.
 
Processed: 5 SOCIAL HISTORY:

M

In [19]:
# Step 6: Save the results of the extraction as SOCIAL_TEXT column
# Add our shortened notes as column "SOCIAL_TEXT" into a new csv that expands on "ANNOTATEDNOTES.csv"
df_annotated_notes["SOCIAL_TEXT"] = note_social_histories
df_annotated_notes.to_csv('../data/clean/ANNOTATEDNOTESSOCIALS-NEW.csv')



In [10]:
# Clean social history text
df = pd.read_csv('../data/clean/ANNOTATEDNOTESSOCIALS-NEW.csv', index_col=[0,1])
socials = df["SOCIAL_TEXT"].tolist()

# remove all characters except alphanumeric, spaces, and punctuation
cleaned_socials = [re.sub(r'[^a-zA-Z0-9+\s' + re.escape(string.punctuation) + ']+', '', str(social)).lower().strip() for social in socials]
cleaned_socials = [re.sub(r'[\n]+', ' ', str(social)) for social in cleaned_socials]

nan
64
nan
38
2379


In [74]:
# Save cleaned social histories to csv
df = df.drop(['TEXT', 'SOCIAL_TEXT'], axis=1)
df["text"] = cleaned_socials
df.head(10)
df.to_csv('../data/clean/PREPROCESSED-NOTES-NEW.csv')


Unnamed: 0_level_0,Unnamed: 1_level_0,sdoh_community_present,sdoh_community_absent,sdoh_education,sdoh_economics,sdoh_environment,behavior_alcohol,behavior_tobacco,behavior_drug,text,overflow
Unnamed: 0_level_1,row_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,5,0,0,0,0,0,0,1,0,she smokes a pack per day.,0
1,42,0,0,0,0,0,0,2,0,social history is significant for the absence ...,0
2,136,1,0,0,2,1,3,4,0,right inguinal hernia repair in childhood cerv...,0
3,442,1,1,0,0,1,3,1,2,- tobacco: smokes 1-1.5ppd x 30yrs - alcohol: ...,0
4,328,1,0,0,2,1,3,3,3,"married with three children, born in [**2184**...",0
5,762,0,0,0,0,1,3,1,3,she lives alone at the [**location (un) 4398**...,0
6,564,0,0,0,1,0,1,2,0,the patient quit smoking 20 years ago; ethanol...,0
7,281,1,0,0,0,1,3,2,0,"former smoker, no etoh. lives with his wife.",0
8,410,0,0,0,2,1,3,2,3,the patient has a sixty-pack-year history of t...,0
9,416,0,0,0,2,1,0,0,3,"formerly worked in insurance, not working curr...",0


In [None]:
# Dictionaries to convert output to SBDH presence string
#Substance (drug, alcohol, tobacco) classifications
sbdh_substance = {
    0: 'None',
    1: 'Present',
    2: 'Past',
    3: 'Never',
    4: 'Unsure'
}

#Economics (employed) classifications
sbdh_econ_env = {
    0: 'None',
    1: 'True',
    2: 'False',
}

#Community or Education classifications
sbdh_community_ed = {
    0: 'False',
    1: 'True',
}