In [6]:
#Step 1: Prerequisites:
#Python 3.10.*
#pip install torch, jupyter, transformers, ipywidgets, pandas, re, medspacy

#Step 2: Import packages
import re
import pandas as pd
import torch
from transformers import AdamW,BertForSequenceClassification,BertTokenizer
import medspacy
from medspacy.target_matcher import TargetRule
from medspacy.visualization import visualize_ent


In [2]:
#Step 3: Read Notes and Annotations Data
df_notes = pd.read_csv("../data/NOTEEVENTS.csv")
df_annotations = pd.read_csv('../data/NIHMS1767978-supplement-MIMIC_SBDH.csv')

In [44]:
#Step 4: Filter out data to only include annotated notes and their SBDH values

# Columns will be the annotation table columns + TEXT
columns = df_annotations.columns.tolist()
columns.append("TEXT")
annotated_rows = []

#Iterate over annotations of MIMIC-III notes and store concatenation of input and outputs
for index, row in df_annotations.iterrows():
    #get row ID and note text for each entry
    rowId = row["row_id"]
    note = df_notes.loc[df_notes["ROW_ID"] == rowId]
    note = note.iloc[0] #access the first (only) row A.K.A the discharge detail note itself
    note = str(note["TEXT"])

    #Combine the annotations data with the note
    row_list = row.tolist()
    row_list.append(note)

    #Add to the list of annotated notes
    annotated_rows.append(row_list)

#Store the list of rows as a dataframe with the described columns, and save
df_annotated_notes = pd.DataFrame(annotated_rows, columns=columns)
df_annotated_notes.to_csv('../data/ANNOTATEDNOTES.csv')
    

In [52]:
#Step 5 Pre-processing

#Step 5a Select the desired parts of the clinical note (Social History)
# Use the medspacy library
nlp = medspacy.load()

# Add the sectionizer object to our pipeline, as this is the feature we will use
sectionizer = nlp.add_pipe("medspacy_sectionizer")

# Validate sectionizer was added to the pipelines
nlp.pipe_names

# Extract the social history from each note
# NOTE: This takes a long time to run
note_social_histories = []

for index, row in df_annotated_notes.iterrows():
    note = str(row["TEXT"])
    print('start nlp')
    doc = nlp(note)
    print('end nlp, start section')
    for section in doc._.sections:
        if (section.category == 'social_history'):
            social_history = section.body_span
            social_history_text = doc[social_history[0]:social_history[1]]
            note_social_histories.append(social_history_text)
            print('end section')
            break

print(note_social_histories[:5])

start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp
end nlp, start section
end section
start nlp


KeyboardInterrupt: 

In [47]:
print(len(note_social_histories))

589


In [None]:
# Dictionaries to convert output to SBDH presence string
#Substance (drug, alcohol, tobacco) classifications
sbdh_substance = {
    0: 'None',
    1: 'Present',
    2: 'Past',
    3: 'Never',
    4: 'Unsure'
}

#Economics (employed) classifications
sbdh_econ_env = {
    0: 'None',
    1: 'True',
    2: 'False',
}

#Community or Education classifications
sbdh_community_ed = {
    0: 'False',
    1: 'True',
}

In [5]:
# Drop the "CHARTTIME" and "STORETIME" columns
df = df.drop(["CHARTTIME", "STORETIME", "CGID", "CHARTDATE"], axis=1)

# Drop rows where 'ISERROR' is equal to 1
df = df[df['ISERROR'] != 1]

# Drop the "ISERROR" column
df = df.drop(["ISERROR"], axis=1)

  df = pd.read_csv("../data/NOTEEVENTS.csv")


[5, 0, 0, 0, 0, 0, 0, 1, 0, 'Admission Date:  [**2190-5-16**]     Discharge Date:  [**2190-5-22**]\n\nDate of Birth:   [**2139-4-22**]     Sex:  F\n\nService:  CARDIOTHORACIC\n\nHISTORY OF PRESENT ILLNESS:  This 51 year-old female was\nadmitted to an outside hospital with chest pain and ruled in\nfor myocardial infarction.  She was transferred here for a\ncardiac catheterization.\n\nPAST MEDICAL HISTORY:  Hypertension, fibromyalgia,\nhypothyroidism, NASH and noninsulin dependent diabetes.\n\nPAST SURGICAL HISTORY:  Hysterectomy and cholecystectomy.\n\nSOCIAL HISTORY:  She smokes a pack per day.\n\nMEDICATIONS ON ADMISSION:  Hydrochlorothiazide, Alprazolam,\nUrsodiol and Levoxyl.\n\nShe was hospitalized with Aggrastat, nitroglycerin and\nheparin as she ruled in for myocardial infarction.\n\nALLERGIES:  No known drug allergies.\n\nCardiac catheterization showed left anterior descending\ncoronary artery diagonal 80% lesion, circumflex 90% lesion\nand 90% lesion of the right coronary arter

In [None]:

noteText = str(note["TEXT"])
category = str(note["CATEGORY"])

if category.lower() != "discharge summary":
    print(category)

#get start, end indices and sbdh name from keywords table
keywords_data = keywords.loc[keywords["row_id"] == rowId]
keyword_entries = keywords_data[["start","end","sbdh"]]

#Iterate over each keyword entry
for sindex, srow in keyword_entries.iterrows():
    #Get the sbdh referenced, as well as its classified value, and indices of keyword
    sbdh = srow["sbdh"]

    if "community" in sbdh:
        sbdh = "sdoh_community_present" #the keywords table does not specify absence or presence, so assume presence
    
    sbdh_value = row[sbdh]
    
    sbdh_term = ""
    start, end = srow["start"], srow["end"]
    
    #Map the classified value to corresponding string (Present, Past, etc.)
    if "behavior" in sbdh:
        sbdh_term = sbdh_substance[sbdh_value]
    elif "community" in sbdh or "education" in sbdh:
        sbdh_term = sbdh_community_ed[sbdh_value]
    else:
        sbdh_term = sbdh_econ_env[sbdh_value]

    #Print each sbdh found, and the keyword
    #print(rowId, sbdh, sbdh_term, noteText[start:end])

    #Print context surrounding keyword
    print('Context: ', noteText[start-20:end+20])

In [5]:
#
# pip install transformers
! pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118







Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.16.0%2Bcu118-cp310-cp310-linux_x86_64.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pillow!=8.3.*,>=5.3.0
  Downloading https://download.pytorch.org/whl/Pillow-9.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pillow, torchvision, torchaudio
Successfully installed

In [6]:
#Initialize tokenizer and model from pretrained Bio_ClinicalBert
from transformers import BertForSequenceClassification, BertTokenizer
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

#Put the model into training mode
model.train() 

ImportError: 
BertForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFBertForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [None]:
# Initialize Adam optimizer 

optimizer = AdamW(model.parameters(), lr=1e-5)