In [1]:
!!pip install psycopg2-binary pandas medspacy gensim sklearn matplotlib spacy scispacy plotly https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz

['Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz',
 '  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz (119.8 MB)',
 '  Preparing metadata (setup.py): started',
 "  Preparing metadata (setup.py): finished with status 'done'",
 '',
 '[notice] A new release of pip is available: 24.1.2 -> 25.0.1',
 '[notice] To update, run: python.exe -m pip install --upgrade pip']

### Loading Relevant Notes from the Postgres MIMIC-III DB

In [2]:
import getpass

# Ask for the database password interactively so it is never written into the notebook source.
pg_pass = getpass.getpass("Postgres DB Password: ")

In [36]:
%%time
import psycopg2
import pandas as pd

# Replace with your actual database credentials
conn = psycopg2.connect(
    dbname="mimic",
    user="postgres",
    password=pg_pass,
    host="localhost"
)

query = """
SELECT ne.*
FROM mimiciii.NOTEEVENTS ne
JOIN mimiciii.DIAGNOSES_ICD di ON ne.hadm_id = di.hadm_id
WHERE di.icd9_code LIKE '346%'
"""

df_notes = pd.read_sql(query, conn)
conn.close()

print(f"Retrieved {len(df_notes)} migrane-related notes")


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Retrieved 11942 migrane-related notes
CPU times: total: 250 ms
Wall time: 2.69 s


In [10]:
# Sample a subset of the retrieved notes.
# The sample size is capped at the number of available rows: sampling without
# replacement raises ValueError when n is larger than the frame.
# random_state is fixed so the same subset is drawn on every run.
df_notes = df_notes.sample(n=min(5200, len(df_notes)), random_state=42)
print(f"Final df has {len(df_notes)} entries")


Final df has 5200 entries


In [11]:
# Print the first few notes (Series.head() default), each followed by a row of asterisks.
separator = '*****************************************'
for note_text in df_notes['text'].head():
    print(note_text, separator, sep='\n')

65 yo F w/ hx of 4mm L MCA aneurysm s/p failed stent ([**3-18**]) now s/p
   left craniotomy for clipping of left MCA aneurysm w/ expressive aphasia
   starting 2 days post-op.
   PMHx:
   PMH: migraines, HTN, gerd, DM2 (diet control), hx of MI ([**2182**]), high
   cholesterol
   PSH: cholecystectomy, kidney fistula repair, hysterectomy, trigger
   finger surgery
   Aneurysm, other
   Assessment:
   A&OX1 Able to state name. When asked birthdate states name w/ jumbled
   numbers and numbers when asked where she is. Aware she has difficulty
   word finding saying, Oh I can
t say it.
   MAE no drift noted. Tongue midline no facial droop noted. When states
   name speech clear.
   C/O headache this afternoon.
   Staple D&I. old bruising around eyes.
   Difficulty this am swallowing pills. Able to tolerate thin liquids.
   Action:
   Sicu team notified of swallowing diff. Meds held and changed to IV.
   Eval ordered and completed.
   Q4hr neuro checks.
   CTA/CTP of the head ordered and c

### Using spaCy/scispaCy to Extract Entities

In [None]:
# scispacy is imported but never referenced by name in this cell.
# NOTE(review): presumably kept for import-time side effects the model package relies on — confirm before removing.
import scispacy
import en_ner_bc5cdr_md

# Load the en_ner_bc5cdr_md pipeline; `nlp` is the object the entity-extraction
# and displacy cells call on each note.
nlp = en_ner_bc5cdr_md.load()

CPU times: total: 5.84 s
Wall time: 5.91 s


In [18]:
%%time

def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df_notes['entities'] = df_notes['text'].apply(extract_entities)

print(df_notes[['text', 'entities']].head())

                                                    text  \
6932   65 yo F w/ hx of 4mm L MCA aneurysm s/p failed...   
8548   Chief Complaint:\n   24 Hour Events:\n   - Sed...   
11195  42 YO F with headache for 16 days. Pt was put ...   
357    [**2152-12-20**] 7:47 PM\n SKULL (AP, [**Last ...   
3753   TITLE:\n   Chief Complaint:\n   24 Hour Events...   

                                                entities  
6932   [(aneurysm, DISEASE), (aphasia, DISEASE), (mig...  
8548   [(Allergies, DISEASE), (Ciprofloxacin, CHEMICA...  
11195  [(headache, DISEASE), (sinus infection, DISEAS...  
357    [(pneumocephaly, DISEASE), (pneumocephaly, DIS...  
3753   [(Allergies, DISEASE), (Penicillins, CHEMICAL)...  
CPU times: total: 5min 34s
Wall time: 5min 39s


In [20]:
from spacy import displacy
# IPython.display is the supported import location; importing display from
# IPython.core.display emits a deprecation warning.
from IPython.display import display, HTML

sample_notes = df_notes['text'].head(3)

for note in sample_notes:
    doc = nlp(note)
    # jupyter=False makes render() return the markup as a string. With
    # jupyter=True it displays the markup itself and returns None, so wrapping
    # the result in HTML(...) only showed an empty placeholder object.
    html = displacy.render(doc, style="ent", jupyter=False)
    display(HTML(html))


  from IPython.core.display import display, HTML


<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

### Word2Vec and t-SNE

In [None]:
import string

def process_entities(entities):
    """Turn a note's extracted entities into normalized Word2Vec tokens.

    Each entity text is lower-cased, stripped of ASCII punctuation, and has its
    internal whitespace collapsed to single spaces, so an entity that spans a
    line break in the note maps to the same token as its single-line form.
    Entities that end up empty or purely numeric are dropped.

    Args:
        entities: Iterable of (entity text, entity label) pairs; the label is
            not used.

    Returns:
        A list of cleaned token strings in the original entity order.
    """
    translator = str.maketrans('', '', string.punctuation)
    tokens = []
    for ent_text, _label in entities:
        cleaned = ent_text.lower().translate(translator)
        # split()/join removes leading/trailing whitespace and collapses runs
        # of spaces and newlines, which also rejects whitespace-only leftovers.
        token = " ".join(cleaned.split())
        if token and not token.isdigit():
            tokens.append(token)
    return tokens

# One token list per note, in the row order of df_notes; each list is used as a
# "sentence" when training Word2Vec.
entity_sentences = [
    process_entities(note_entities)
    for note_entities in df_notes['entities']
]
entity_sentences[:3]

[['aneurysm',
  'aphasia',
  'migraines',
  'htn',
  'cholesterol',
  'psh',
  'kidney fistula',
  'aneurysm',
  'headache',
  'bruising',
  'aphasia',
  'thin',
  'allergies',
  'hypertension',
  'gerd',
  'arthritis',
  'migraines',
  'coagulopathy',
  'sb',
  'sinus bradycardia',
  'o2',
  'o2 saturation',
  'o2 flow',
  'fio2',
  'sodium',
  'potassium',
  'chloride',
  'co2',
  'creatinine',
  'glucose',
  'glucose'],
 ['allergies',
  'ciprofloxacin',
  'propofol',
  'pantoprazole',
  'protonix',
  'metoprolol',
  'sinus tachycardia',
  'ng',
  'o2',
  'peep',
  'fio2',
  'fio2',
  'ctab',
  'edema',
  'cr',
  'tco2',
  'glucose',
  'etoh',
  'diazepam',
  'propofol',
  'propofol',
  'hypertensive',
  'diazepam',
  'propofol',
  'hyponatremia',
  'fen',
  'dvt',
  'stress ulcer',
  'vap'],
 ['headache',
  'sinus infection',
  'pain',
  'aneurysm',
  'pts left eye',
  'ptosis',
  'aneurysm',
  'pt',
  'pt',
  'ptosis',
  'angio']]

In [23]:
from gensim.models import Word2Vec

# Train entity embeddings, treating each note's entity list as one sentence.
# seed=42 together with workers=1 makes training repeatable: with several
# worker threads the order in which training jobs are scheduled varies, so the
# vectors differ between runs even when a seed is given.
# NOTE(review): gensim also documents setting PYTHONHASHSEED for fully
# deterministic runs on Python 3 — confirm whether that is needed here.
w2v_model = Word2Vec(
    entity_sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every entity token, including those seen only once
    workers=1,
    seed=42
)

# Vocabulary and the matching embedding matrix (one row per word, same order).
words = list(w2v_model.wv.index_to_key)
word_vectors = w2v_model.wv[words]

In [24]:
from sklearn.manifold import TSNE

# Reduce the Word2Vec vectors to two dimensions for plotting; random_state is
# fixed so repeated runs use the same seed.
tsne = TSNE(random_state=42, n_components=2)
word_vecs_2d = tsne.fit_transform(word_vectors)




In [None]:
import plotly.express as px
import pandas as pd

# One row per vocabulary word with its two t-SNE coordinates.
df_tsne = pd.DataFrame({'word': words}).assign(
    x=word_vecs_2d[:, 0],
    y=word_vecs_2d[:, 1],
)

# Scatter of the whole vocabulary; the word is shown on hover only.
fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    hover_data=['word'],
    title="tSNE Plot",
)
fig.update_traces(textposition='top center')
fig.show()


In [34]:
# Draw the labelled subset from the full t-SNE frame instead of rebuilding an
# identical frame. random_state keeps the subset (and therefore the figure)
# the same on every run, and the size is capped so a vocabulary with fewer
# than 200 words does not make sample() raise.
df_tsne_small = df_tsne.sample(n=min(200, len(df_tsne)), random_state=42)

# text='word' draws each label next to its point, which is only legible on a
# small subset.
fig = px.scatter(df_tsne_small, x='x', y='y', text='word',
                 hover_data=['word'],
                 title="Small Subset of tSNE Plot with visible labels")
fig.update_traces(textposition='top center')
fig.show()


### Extra Credit: Using medspaCy for NER

In [None]:
import sys
# NOTE(review): this puts the parent directory first on the import path; nothing
# in this cell visibly needs it — confirm whether it can be dropped.
sys.path.insert(0, "..")

import spacy
from spacy.tokens import Span

import medspacy
from medspacy.preprocess import PreprocessingRule, Preprocessor
from medspacy.ner import TargetRule
from medspacy.context import ConTextRule
from medspacy.section_detection import Sectionizer, SectionRule
from medspacy.postprocess import PostprocessingRule, PostprocessingPattern, postprocessing_functions
from medspacy.visualization import visualize_ent
import re

# Rebinds `nlp`: from this cell onward the name refers to the medspacy pipeline
# rather than the en_ner_bc5cdr_md model loaded earlier, so re-running the
# earlier entity-extraction cells after this one would use the medspacy pipeline.
nlp = medspacy.load()


In [None]:
# I got these preprocessing rules from the medspacy github repo (They are in this example notebook: https://github.com/medspacy/medspacy/blob/master/notebooks/05-Full-Pipeline.ipynb)
# The Preprocessor wraps the pipeline's tokenizer and is installed in its place,
# so the rules below are applied as part of nlp(text).
# NOTE(review): re-running this cell wraps the already-installed Preprocessor
# again — confirm that is harmless or reload the pipeline first.
preprocessor = Preprocessor(nlp.tokenizer)
nlp.tokenizer = preprocessor

# Rules are listed from most specific to most general; the catch-all bracket
# rule comes last.
preprocess_rules = [
    PreprocessingRule(
        r"\[\*\*[\d]{1,4}-[\d]{1,2}(-[\d]{1,2})?\*\*\]",
        repl="01-01-2010",
        desc="Replace MIMIC date brackets with a generic date."
    ),
    PreprocessingRule(
        r"\[\*\*[\d]{4}\*\*\]",
        repl="2010",
        desc="Replace MIMIC year brackets with a generic year."
    ),
    PreprocessingRule(
        "dx'd",
        repl="Diagnosed",
        desc="Replace abbreviation"
    ),
    PreprocessingRule(
        "tx'd",
        repl="Treated",
        desc="Replace abbreviation"
    ),
    PreprocessingRule(
        # Raw string: the pattern text is unchanged, but backslash escapes such
        # as \[ and \* in a normal string literal trigger invalid-escape warnings.
        r"\[\*\*[^\]]+\]",
        desc="Remove all other bracketed placeholder text from MIMIC"
    )
]
preprocessor.add(preprocess_rules)


In [None]:
# Register the custom `icd9` span attribute only once: set_extension raises if
# the name is already registered, which would break re-running this cell.
if not Span.has_extension("icd9"):
    Span.set_extension("icd9", default="")

# Target rule: tag the literal phrase as a PROBLEM and attach its ICD-9 category.
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = [
    TargetRule(literal="migraine headache", category="PROBLEM", attributes={"icd9": "346"}),
]
target_matcher.add(target_rules)

# Context rule: "diagnosed in" followed by a four-digit token is categorized as
# HISTORICAL.
context = nlp.get_pipe("medspacy_context")
context_rules = [
    ConTextRule("diagnosed in <YEAR>", "HISTORICAL", 
                pattern=[
                    {"LOWER": "diagnosed"},
                    {"LOWER": "in"},
                    # Raw string so \d is not treated as an (invalid) string escape.
                    {"LOWER": {"REGEX": r"^[\d]{4}$"}}
                ])
]
context.add(context_rules)

In [38]:
print("Pipeline components:", nlp.pipe_names)

def process_note(text):
    """Run one note through the current `nlp` pipeline and return the resulting doc."""
    return nlp(text)

# Visualize the entities of the first note; skipped when there are no notes.
if len(df_notes) > 0:
    first_note_text = df_notes['text'].iloc[0]
    doc_sample = process_note(first_note_text)
    visualize_ent(doc_sample)


Pipeline components: ['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context', 'medspacy_sectionizer', 'medspacy_postprocessor']
