# Overview
In this notebook, we'll look at some specific clinical NLP tasks, and you'll write rules to extract concepts from the example texts.

## TODO
- Make this notebook more interactive and add more explanation for each step.

In [None]:
import spacy
import medspacy

from medspacy.ner import TargetRule
from medspacy.context import ConTextItem
from medspacy.visualization import visualize_ent, visualize_dep

There are two ways to set up the pipeline:

- **Option #1**: Load the default model and add target rules to extract concepts.
- **Option #2**: Load the pretrained model (`en_info_3700_i2b2_2012`).

In [None]:
# Option #1: load the default medspacy pipeline; concepts are extracted by the
# target rules added in the cells below.
nlp = medspacy.load()
# Option #2: uncomment the next line to load the pretrained model instead.
# nlp = medspacy.load("en_info_3700_i2b2_2012")

In [None]:
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

# I. Surgical site infection
Extract evidence of surgical site infections (SSIs) and the body locations where they occur.

In [None]:
# Example sentences for the surgical site infection (SSI) task.
texts = [
    # SSI evidence together with a body location
    "Fluid collection is seen in the abdomen.",
    # SSI evidence mentioned inside a negated statement
    "There is no evidence of fluid collection.",
    # SSI term appearing in an exam-purpose header
    "PURPOSE OF EXAM: Rule out abscess.",
    # plural SSI term plus a multi-word body location
    "Hematomas are seen around in the right lower quadrant.",
]

In [None]:
# Fetch the pipeline component registered as "target_matcher" so that
# concept-extraction rules can be added to it.
target_matcher = nlp.get_pipe("target_matcher")

In [None]:
# Rules that map surface forms to concept labels:
#   "SSI"      - evidence of a surgical site infection
#   "BODY_LOC" - anatomic location
# Rules without a `pattern` match their literal string; rules with a `pattern`
# use a spaCy token pattern and the literal serves only as a readable name.
target_rules = [
    TargetRule("fluid collection", "SSI"),
    # REGEX matches anywhere inside the token, so leaving this unanchored also
    # picks up the plural "hematomas".
    TargetRule("hematoma", "SSI", pattern=[{"LOWER": {"REGEX": "hematoma"}}]),
    TargetRule("abscess", "SSI"),
    # Anchored with ^...$ so only the tokens "abd" and "abdomen" match. The
    # unanchored form "abd(omen)?" matched any token merely containing "abd",
    # which made the optional group meaningless.
    TargetRule("abdomen", "BODY_LOC", pattern=[{"LOWER": {"REGEX": "^abd(omen)?$"}}]),
    # Three-token pattern: side, then vertical position, then "quadrant".
    TargetRule(
        "<LEFT/RIGHT> <UPPER/LOWER> quadrant",
        "BODY_LOC",
        pattern=[
            {"LOWER": {"IN": ["left", "right"]}},
            {"LOWER": {"IN": ["upper", "lower"]}},
            {"LOWER": "quadrant"},
        ],
    ),
]

In [None]:
# Register the SSI / body-location rules with the target matcher.
# NOTE(review): re-running this cell calls add() again with the same rules;
# reload the pipeline first if you need a clean rule set.
target_matcher.add(target_rules)

In [None]:
# Process every example text with the pipeline and collect the results in a list.
docs = list(nlp.pipe(texts))

In [None]:
# Visualize the entities found in the second example (index 1).
visualize_ent(docs[1])

In [None]:
# Show the same example in medspacy's dependency-style view.
# NOTE(review): presumably draws arcs between modifiers and the entities they
# modify - confirm against the medspacy.visualization documentation.
visualize_dep(docs[1])

# II. COVID-19
Background reading for this section: <https://openreview.net/pdf?id=ZQ_HvBxcdCv>

In [None]:
# Load a new default pipeline for the COVID-19 task.
# NOTE(review): presumably done so the SSI rules added earlier do not carry over.
nlp = medspacy.load()

In [None]:
# Example sentences for the COVID-19 task; each one places the concept in a
# different clinical context.
texts = [
    # reason for admission
    "Patient admitted to hospital for respiratory failure secondary to COVID-19.",
    # patient-reported diagnosis
    "The patient reports that they have been diagnosed with COVID-19",
    # screening request only
    "Requested that patient be screened for novel coronavirus via telephone",
    # lab result
    "Lab Results: SARS-COV-2 DETECTED",
    # explicit negation
    "Patient does not have COVID-19",
    # precautionary mention
    "This encounter is done over the telephone secondary to COVID-19 precautions.",
]

In [None]:
# Fetch the "target_matcher" component of the newly loaded pipeline.
target_matcher = nlp.get_pipe("target_matcher")

In [None]:
# Every surface form below is extracted under the single label "COVID-19".
covid_terms = ["COVID-19", "SARS-COV-2", "novel coronavirus"]
target_rules = [TargetRule(term, "COVID-19") for term in covid_terms]

In [None]:
# Register the COVID-19 rules with the target matcher.
target_matcher.add(target_rules)

In [None]:
# Fetch the pipeline's "context" component so custom modifier rules can be added to it.
context = nlp.get_pipe("context")

In [None]:
# Context modifiers as (literal phrase, category, rule) triples; `rule` is the
# direction value handed to ConTextItem.
modifier_specs = [
    ("admitted to hospital for", "POSITIVE_EXISTENCE", "FORWARD"),
    ("diagnosed with", "POSITIVE_EXISTENCE", "FORWARD"),
    ("secondary to", "POSITIVE_EXISTENCE", "FORWARD"),
    ("screened for", "UNCERTAIN", "FORWARD"),
    ("DETECTED", "POSITIVE_EXISTENCE", "BACKWARD"),
    ("precautions", "HYPOTHETICAL", "BIDIRECTIONAL"),
]
item_data = [
    ConTextItem(literal, category, rule=direction)
    for literal, category, direction in modifier_specs
]

In [None]:
# Add the custom modifier items to the context component.
context.add(item_data)

In [None]:
# Process the COVID-19 example texts and collect the results in a list.
docs = list(nlp.pipe(texts))

In [None]:
# Index of the example to inspect; change it and re-run the two visualization
# cells below to look at a different text.
idx = 1

In [None]:
# Visualize the entities found in the selected example.
visualize_ent(docs[idx])

In [None]:
# Show the selected example in medspacy's dependency-style view.
# NOTE(review): presumably draws arcs between modifiers and the entities they
# modify - confirm against the medspacy.visualization documentation.
visualize_dep(docs[idx])

# III. Process full documents from MIMIC

In [None]:
# Load the pretrained model by name instead of the default pipeline.
nlp = medspacy.load("en_info_3700_i2b2_2012")

In [None]:
# Display the names of the components in the pretrained pipeline.
nlp.pipe_names

In [None]:
# Fetch the pipeline component registered as "ner".
ner = nlp.get_pipe("ner")

In [None]:
# Display the entity labels exposed by the NER component.
ner.labels

In [None]:
import pymysql
import getpass

In [None]:
# Open a connection to the remote MIMIC2 MySQL database. The password is
# requested interactively via getpass, so it is never stored in the notebook.
# `password` and `database` are PyMySQL's canonical keyword names; the
# previously used `passwd` and `db` are deprecated aliases.
conn = pymysql.connect(
    host="35.233.174.193",
    port=3306,
    user="jovyan",
    password=getpass.getpass("Enter password for MIMIC2 database"),
    database="mimic2",
)

In [None]:
import pandas as pd

In [None]:
# Pull the subject id and note text of ten discharge summaries (LIMIT 10 keeps
# the demo small) and load the result set into a DataFrame.
query = """

SELECT subject_id, text
FROM noteevents
WHERE category = 'DISCHARGE_SUMMARY'
LIMIT 10;

"""
# NOTE(review): `conn` is a raw PyMySQL connection; recent pandas versions may
# warn that only SQLAlchemy connectables are officially supported - confirm.
df = pd.read_sql(query, conn)

In [None]:
# Preview the first rows of the query result.
# NOTE(review): the output contains clinical note text - consider clearing it
# before sharing the notebook.
df.head()

In [None]:
%%time
# Run the pretrained pipeline over every note in the "text" column and collect
# the results in a list (the cell is timed by the %%time magic above).
docs = list(nlp.pipe(df["text"]))

In [None]:
# Pick the first processed note for a closer look.
doc = docs[0]

In [None]:
# Visualize the entities found in the selected note.
visualize_ent(doc)

## Analyze results

In [None]:
# Flatten every entity of every processed note into one record (dict) per
# entity, ready to be loaded into a DataFrame.
# A comprehension is used instead of a `for doc in docs` loop so that the
# notebook-level `doc` variable chosen in an earlier cell is not overwritten
# and no loop variables leak into the kernel namespace.
ents_data = [
    {
        "ent": ent,                              # the entity span object itself
        "text": ent.lower_,                      # lowercased entity text
        "label": ent.label_,                     # entity label
        "is_negated": ent._.is_negated,          # custom extension attributes
        "is_family": ent._.is_family,
        "section_title": ent._.section_title,
    }
    for note_doc in docs
    for ent in note_doc.ents
]

In [None]:
# Build one DataFrame row per extracted entity from the list of records.
ents_df = pd.DataFrame(ents_data)

In [None]:
# Preview the first rows of the entity table.
ents_df.head()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import seaborn as sns
# Apply seaborn's default styling to the plots that follow.
sns.set()

In [None]:
# Bar chart of how many entities were extracted per label. The title and axis
# labels let the figure stand on its own when the notebook is skimmed; the
# trailing semicolon suppresses the return-value repr.
ax = ents_df.groupby("label").size().plot.bar()
ax.set(title="Extracted entities by label", xlabel="Entity label", ylabel="Number of entities");

Which problems occur most often in the past medical history (PMH) section?

In [None]:
# Keep only PROBLEM entities that fall in the past medical history section.
in_pmh_section = ents_df["section_title"] == "past_medical_history"
is_problem = ents_df["label"] == "PROBLEM"
pmh = ents_df[in_pmh_section & is_problem]

In [None]:
# Horizontal bar chart of the ten most frequent problem strings in the PMH.
ax = pmh["text"].value_counts().iloc[:10].plot.barh()
ax.set(
    title="Most frequent problems in past medical history",
    xlabel="Number of mentions",
    ylabel="Problem (lowercased entity text)",
)
# value_counts() sorts descending and barh() draws the first row at the bottom,
# so flip the y-axis to put the most frequent problem on top.
ax.invert_yaxis()

Which problems occur most often in the family history?

In [None]:
# Family-history problems: PROBLEM entities that are either flagged as
# referring to a family member or located in the family history section.
flagged_as_family = ents_df["is_family"].eq(True)
in_fh_section = ents_df["section_title"].eq("family_history")
is_problem = ents_df["label"].eq("PROBLEM")
fh = ents_df[(flagged_as_family | in_fh_section) & is_problem]

In [None]:
# Preview the first rows of the family-history problem table.
fh.head()

In [None]:
# Horizontal bar chart of the ten most frequent family-history problem strings.
ax = fh["text"].value_counts().iloc[:10].plot.barh()
ax.set(
    title="Most frequent problems in family history",
    xlabel="Number of mentions",
    ylabel="Problem (lowercased entity text)",
)
# value_counts() sorts descending and barh() draws the first row at the bottom,
# so flip the y-axis to put the most frequent problem on top.
ax.invert_yaxis()