# ICD10CM-RxNorm Entity Resolution - version 2.4.6


In [1]:
import sys, os, time, pandas as pd

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from sparknlp_jsl.annotator import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# Widen pandas' display limits so long text columns and large result tables are shown in full.
pd.options.display.max_colwidth = 250
pd.options.display.max_rows = 500

In [2]:
# Show the installed sparknlp_jsl version; the value is displayed as the cell output.
import sparknlp_jsl
sparknlp_jsl.version()

'2.4.6'

In [3]:
# Licence secret used to build the URL of the licensed jar. It is read from the
# JSL_SECRET environment variable so the real value does not have to be stored
# in the notebook; the "####" placeholder remains the fallback.
secret = os.environ.get("JSL_SECRET", "####")

# Local Spark session configured with the open-source Spark NLP package and
# the licensed spark-nlp-jsl jar.
spark = (
    SparkSession.builder
    .appName("Spark NLP Licensed")
    .master("local[*]")
    .config("spark.driver.memory", "48G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .config("spark.jars", "https://pypi.johnsnowlabs.com/" + secret + "/spark-nlp-jsl-2.4.6.jar")
    .getOrCreate()
)

## Let's create a dataset with some case studies

In [4]:
notes = [
'Pentamidine 300 mg IV q . 36 hours , Pentamidine nasal wash 60 mg per 6 ml of sterile water q.d . , voriconazole 200 mg p.o . b.i.d . , acyclovir 400 mg p.o . b.i.d . , cyclosporine 50 mg p.o . b.i.d . , prednisone 60 mg p.o . q.d . , GCSF 480 mcg IV q.d . , Epogen 40,000 units subcu q . week , Protonix 40 mg q.d . , Simethicone 80 mg p.o . q . 8 , nitroglycerin paste 1 " ; q . 4 h . p.r.n . , flunisolide nasal inhaler , 2 puffs q . 8 , OxyCodone 10-15 mg p.o . q . 6 p.r.n . , Sudafed 30 mg q . 6 p.o . p.r.n . , Fluconazole 2% cream b.i.d . to erythematous skin lesions , Ditropan 5 mg p.o . b.i.d . , Tylenol 650 mg p.o . q . 4 h . p.r.n . , Ambien 5-10 mg p.o . q . h.s . p.r.n . , Neurontin 100 mg q . a.m . , 200 mg q . p.m . , Aquaphor cream b.i.d . p.r.n . , Lotrimin 1% cream b.i.d . to feet , Dulcolax 5-10 mg p.o . q.d . p.r.n . , Phoslo 667 mg p.o . t.i.d . , Peridex 0.12% , 15 ml p.o . b.i.d . mouthwash , Benadryl 25-50 mg q . 4-6 h . p.r.n . pruritus , Sarna cream q.d . p.r.n . pruritus , Nystatin 5 ml p.o . q.i.d . swish and !',
'Albuterol nebulizers 2.5 mg q.4h . and Atrovent nebulizers 0.5 mg q.4h . , please alternate albuterol and Atrovent ; Rocaltrol 0.25 mcg per NG tube q.d .; calcium carbonate 1250 mg per NG tube q.i.d .; vitamin B12 1000 mcg IM q . month , next dose is due Nov 18 ; diltiazem 60 mg per NG tube t.i.d .; ferrous sulfate 300 mg per NG t.i.d .; Haldol 5 mg IV q.h.s .; hydralazine 10 mg IV q.6h . p.r.n . hypertension ; lisinopril 10 mg per NG tube q.d .; Ativan 1 mg per NG tube q.h.s .; Lopressor 25 mg per NG tube t.i.d .; Zantac 150 mg per NG tube b.i.d .; multivitamin 10 ml per NG tube q.d .; Macrodantin 100 mg per NG tube q.i.d . x 10 days beginning on 11/3/00 .',
'Tylenol 650 mg p.o . q . 4-6h p.r.n . headache or pain ; acyclovir 400 mg p.o . t.i.d .; acyclovir topical t.i.d . to be applied to lesion on corner of mouth ; Peridex 15 ml p.o . b.i.d .; Mycelex 1 troche p.o . t.i.d .; g-csf 404 mcg subcu q.d .; folic acid 1 mg p.o . q.d .; lorazepam 1-2 mg p.o . q . 4-6h p.r.n . nausea and vomiting ; Miracle Cream topical q.d . p.r.n . perianal irritation ; Eucerin Cream topical b.i.d .; Zantac 150 mg p.o . b.i.d .; Restoril 15-30 mg p.o . q . h.s . p.r.n . insomnia ; multivitamin 1 tablet p.o . q.d .; viscous lidocaine 15 ml p.o . q . 3h can be applied to corner of mouth or lips p.r.n . pain control .',
'The patient\'s incisions sternal and right leg were clean and healing well , normal sinus rhythm at 70-80 , with blood pressure 98-110/60 and patient was doing well , recovering , ambulating , tolerating regular diet and last hematocrit prior to discharge was 39% with a BUN and creatinine of 15 and 1.0 , prothrombin time level of 13.8 , chest X-ray prior to discharge showed small bilateral effusions with mild cardiomegaly and subsegmental atelectasis bibasilar and electrocardiogram showed normal sinus rhythm with left atrial enlargement and no acute ischemic changes on electrocardiogram .',
'This is an 82 - year-old male with a history of prior tobacco use , hypertension , chronic renal insufficiency , COPD , gastritis , and TIA who initially presented to Braintree with a non-ST elevation MI and Guaiac positive stools , transferred to St . Margaret\'s Center for Women & Infants for cardiac catheterization with PTCA to mid LAD lesion complicated by hypotension and bradycardia requiring Atropine , IV fluids and transient dopamine possibly secondary to vagal reaction , subsequently transferred to CCU for close monitoring , hemodynamically stable at the time of admission to the CCU .',
'O2 95% on 3L NC mixed Quinn 82% genrl : in nad , resting comfortably heent : perrla ( 4->3 mm ) bilaterally , blind in right visual field , eomi , dry mm , ? thrush neck : no bruits cv : rrr , no m/r/g , faint s1/s2 pulm : cta bilaterally abd : midline scar ( from urostomy ) , nabs , soft , appears distended but patient denies , ostomy RLQ c/d/i , NT to palpation back : right flank urostomy tube , c/d/i , nt to palpation extr : no Gardner neuro : a , ox3 , wiggles toes bilaterally , unable to lift LE , 06-12 grip bilaterally w/ UE , decrease sensation to soft touch in left',
'Is notable for an inferior myocardial infarction , restrictive and obstructive lung disease with an FEV1 of . 9 and FVC of 1.34 and a moderate at best response to bronchodilators , and a negative sestamibi scan in May , 1999 apart from a severe fixed inferolateral defect , systolic dysfunction with recent echocardiography revealing an LVID of 62 mm . and ejection fraction of 28 percent , moderate mitral regurgitation and mild-to-moderate aortic stenosis with a peak gradient of 33 and a mean gradient of 19 and a valve area of 1.4 cm . squared .',
'This is a 47 - year-old male with a past medical history of type 2 diabetes , high cholesterol , hypertension , and coronary artery disease , status post percutaneous transluminal coronary angioplasty times two , who presented with acute coronary syndrome refractory to medical treatment and TNK , now status post Angio-Jet percutaneous transluminal coronary angioplasty and stent of proximal left anterior descending artery and percutaneous transluminal coronary angioplasty of first diagonal with intra-aortic balloon pump placement .',
'Clinical progression of skin and sinus infection on maximal antimicrobial therapy continued , with emergence on November 20 of a new right-sided ptosis in association with a left homonymous hemianopsia , and fleeting confusion while febrile , prompting head MRI which revealed a large 5 x 2 x 4.3 cm region in the right occipital lobe of hemorrhage and edema , with dural and , likely , leptomeningeal enhancement in association with small foci in the right cerebellum and pons , concerning for early lesions of similar type .',
'The patient had an echocardiogram on day two of admission , which revealed a mildly dilated left atrium , mild symmetric LVH , normal LV cavity size , mild region LV systolic dysfunction , arresting regional wall motion abnormality including focal apical hypokinesis , a normal right ventricular chamber size and free wall motion , a moderately dilated aortic root , a mildly dilated ascending aorta , normal aortic valve leaflet , normal mitral valve leaflet and no pericardial effusions .',
'The patient is a 65-year-old man with refractory CLL , status post non-myeloblative stem cell transplant approximately nine months prior to admission , and status post prolonged recent Retelk County Medical Center stay for Acanthamoeba infection of skin and sinuses , complicated by ARS due to medication toxicity , as well as GVHD and recent CMV infection , readmitted for new fever , increasing creatinine , hepatomegaly and fluid surge spacing , in the setting of hyponatremia .',
'Tylenol 650 mg p.o . q.4h . p.r.n . , Benadryl 25 mg p.o . q.h.s . p.r.n . , Colace 100 mg p.o . q.i.d . , Nortriptyline 25 mg p.o . q.h.s . , Simvastatin 10 mg p.o . q.h.s . , Metamucil one packet p.o . b.i.d . p.r.n . , Neurontin 300 mg p.o . t.i.d . , Levsinex 0.375 mg p.o . q.12h . , Lisinopril / hydrochlorothiazide 20/25 mg p.o . q.d . , hydrocortisone topical ointment to affected areas , MS Contin 30 mg p.o . b.i.d . , MSIR 15 to 30 mg p.o . q.4h . p.r.n . pain .',
'Aspirin 325 q.d . ; albuterol nebs 2.5 mg q . 4h ; Colace 100 mg b.i.d . ; heparin 5,000 units subcu b.i.d . ; Synthroid 200 mcg q.d . ; Ocean Spray 2 sprays q . i.d . ; simvastatin 10 mg q . h.s . ; Flovent 220 mcg 2 puffs b.i.d . ; Zantac 150 b.i.d . ; nystatin ointment to the gluteal fold b.i.d . ; Lisinopril 20 mg q.d . ; Mestinon controlled release 180 q . h.s . ; Mestinon 30 mg q . 4h while awake ; prednisone 60 mg p.o . q . IM ; Atrovent nebs 0.5 mg q . i.d .',
'An echocardiogram was obtained on 4-26 which showed concentric left ventricular hypertrophy with normal _____ left ventricular function , severe right ventricular dilatation with septal hypokinesis and flattening with a question of right ventricular apical clot raised with mild aortic stenosis , severe tricuspid regurgitation and increased pulmonary artery pressure of approximately 70 millimeters , consistent with fairly severe pulmonary hypertension .',
'1 ) CV ( R ) finished amio IV load then started on po , agressive lytes ; although interrogation showed >100 episodes of VT ( as / x ) , pt prefers med therapy as opposed to ablation ( I ) enzymes mildly elevated but not actively ischemic ; lipids , ASA , statin , BB ; Adenosine thal 1/4 and echo 1/4 to look for signs of ischemia as active cause for VT ( P ) JVP at angle of jaw 1/4 -- > giving 20 Lasix ; dig level 1/4 1.3 -- > 1/2 dose as on Amio',
'sodium 141 , potassium 3.5 , chloride 107 , bicarbonates 23.8 , BUN 23 , creatinine 1.1 , glucose 165 , PO2 377 , PCO2 32 , PH 7.50 , asomus 298 , toxic screen negative , white blood cell count 11.1 , hematocrit 39.6 , platelet count 137 , prothrombin time 25.2 , INR 4.3 , partial thromboplastin time 34.7 , urinalysis 1+ albumin , 0-5 high link caths , cervical spine negative , pelvis negative , lumbar spine ; negative , thoracic spine negative .',
]

In [5]:
# Render every note in its own bordered box for quick visual inspection.
import html
from IPython.display import display, HTML

# The notes are plain text containing characters such as "&" and ">"; escape
# them so the browser shows the text verbatim instead of parsing it as markup.
note_box_style = "border:2px solid #747474; background-color: #e3e3e3; margin: 5px; padding: 10px"
html_output = "".join(
    f'Note {i}:<div style="{note_box_style}">{html.escape(d)}</div><br/>'
    for i, d in enumerate(notes)
)

display(HTML(html_output))

In [6]:
# Column names of the notes DataFrame: the document id and the raw note text.
docid_col, note_col = "doc_id", "text_feed"

In [7]:
# One row per note: (position of the note in `notes`, note text).
data = spark.createDataFrame(list(enumerate(notes)), schema=[docid_col, note_col])

## And let's build a SparkNLP pipeline with the following stages:
- DocumentAssembler: Entry annotator for our pipelines; it creates the data structure for the Annotation Framework
- SentenceDetector: Annotator to pragmatically separate complete sentences inside each document
- Tokenizer: Annotator to separate sentences in tokens (generally words)
- StopWordsCleaner: Annotator to remove words defined as StopWords in SparkML
- WordEmbeddings: Vectorization of word tokens, in this case using word embeddings trained from PubMed, ICD10 and other clinical resources.
- ChunkEmbeddings: Aggregates the WordEmbeddings for each NER Chunk
- NER (`ner_jsl`, `ner_drugs`) + NerConverter: These annotators return chunks for clinical entities (only `Diagnosis` chunks are kept from `ner_jsl`) and for drugs
- ChunkEntityResolver / EnsembleEntityResolver: Annotators that search the nearest neighbours (KNN) of each chunk, in this case to resolve ICD10CM and RxNorm codes.

In [8]:
# Language and repository every pretrained model in this notebook is loaded from.
models_language, models_repository = "en", "clinical/models"

In [9]:
# Name of the pretrained word-embeddings model
embeddings_name = "embeddings_clinical"

# Output column names of the preparation stages and of the embeddings
doc_col, sent_col = "document", "sentence"
token_col, embeddings_col = "token", "embeddings"

# Preparation annotators: raw note text -> document -> sentences
docAssembler = (
    DocumentAssembler()
    .setInputCol(note_col)
    .setOutputCol(doc_col)
)
sentenceDetector = (
    SentenceDetector()
    .setInputCols(doc_col)
    .setOutputCol(sent_col)
)
# Characters passed to setSplitChars of the tokenizers. Each entry is a single
# character, optionally preceded by a backslash. The slash entry is written
# "\\/" (the same two-character value as before) because "\/" is an invalid
# Python escape sequence and triggers a warning on recent interpreters.
tokenizer_chars = [",","\\/"," ",".","|","@","#","%","&","\\$","\\[","\\]","\\(","\\)","\\-",";"]
# Tokenizer that additionally splits tokens on the characters in `tokenizer_chars`
tokenizer = (
    Tokenizer()
    .setSplitChars(tokenizer_chars)
    .setInputCols(sent_col)
    .setOutputCol(token_col)
)

# Pretrained word embeddings computed for every token of every sentence
embeddings = (
    WordEmbeddingsModel.pretrained(embeddings_name, models_language, models_repository)
    .setInputCols(sent_col, token_col)
    .setOutputCol(embeddings_col)
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [10]:
# Pretrained NER model names
jsl_ner_name, drug_ner_name = "ner_jsl", "ner_drugs"

# Columns the NER tags are written to
ner_jsl_col, ner_drug_col = "ner_jsl", "ner_drug"

# Two NerDL taggers over the same sentences, tokens and embeddings:
# the `ner_jsl` model and the `ner_drugs` model.
jslNer = (
    NerDLModel.pretrained(jsl_ner_name, models_language, models_repository)
    .setInputCols(sent_col, token_col, embeddings_col)
    .setOutputCol(ner_jsl_col)
)

drugNer = (
    NerDLModel.pretrained(drug_ner_name, models_language, models_repository)
    .setInputCols(sent_col, token_col, embeddings_col)
    .setOutputCol(ner_drug_col)
)

ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_drugs download started this may take some time.
Approximate size to download 13.8 MB
[OK!]


In [11]:
# Columns the converted chunks are written to
chunk_jsl_col, chunk_drug_col = "chunk_jsl", "chunk_drug"

# NerConverter turns IOB tags into full chunks (token sequences) tagged with `entity` metadata.
# Only "Diagnosis" chunks are kept from the JSL tagger; the drug converter has no whitelist.
jslConverter = (
    NerConverter()
    .setInputCols(sent_col, token_col, ner_jsl_col)
    .setOutputCol(chunk_jsl_col)
    .setWhiteList(["Diagnosis"])
)
drugConverter = (
    NerConverter()
    .setInputCols(sent_col, token_col, ner_drug_col)
    .setOutputCol(chunk_drug_col)
)

In [12]:
# Columns the chunk-level embeddings are written to
chunk_embs_jsl_col, chunk_embs_drug_col = "chunk_embs_jsl", "chunk_embs_drug"

# ChunkEmbeddings aggregates the token embeddings of each chunk
jslChunkEmbeddings = (
    ChunkEmbeddings()
    .setInputCols(chunk_jsl_col, embeddings_col)
    .setOutputCol(chunk_embs_jsl_col)
)
drugChunkEmbeddings = (
    ChunkEmbeddings()
    .setInputCols(chunk_drug_col, embeddings_col)
    .setOutputCol(chunk_embs_drug_col)
)

In [13]:
# Columns the chunk tokens are written to
chunk_token_jsl_col, chunk_token_drug_col = "chunk_token_jsl", "chunk_token_drug"

# ChunkTokenizer re-tokenizes each chunk, using the same split characters as the main tokenizer
jslChunkTokenizer = (
    ChunkTokenizer()
    .setSplitChars(tokenizer_chars)
    .setInputCols(chunk_jsl_col)
    .setOutputCol(chunk_token_jsl_col)
)
drugChunkTokenizer = (
    ChunkTokenizer()
    .setSplitChars(tokenizer_chars)
    .setInputCols(chunk_drug_col)
    .setOutputCol(chunk_token_drug_col)
)

In [14]:
# Entity Resolution Pretrained Model Names

icd10cm_model_name = "chunkresolve_icd10cm_clinical"
rxnorm_model_name  = "ensembleresolve_rxnorm_clinical"

# Entity Resolution column Names

icd10cm_col = "icd10cm_resolution"
rxnorm_col  = "rxnorm_resolution"

# ICD10CM resolver: consumes the tokens and embeddings of the JSL chunks.
icd10cmResolver = (
    ChunkEntityResolverModel.pretrained(icd10cm_model_name, models_language, models_repository)
    .setEnableLevenshtein(True)
    .setNeighbours(200).setAlternatives(5).setDistanceWeights([1,3,2,0,0,1])
    .setInputCols(chunk_token_jsl_col, chunk_embs_jsl_col)
    .setOutputCol(icd10cm_col)
)

# RxNorm resolver: consumes the tokens and embeddings of the drug chunks.
# `pretrained` is called on the class, as for the ICD10CM resolver above, so no
# throw-away annotator instance is created first. The parenthesised chain also
# removes the dangling line-continuation backslash the statement used to end with.
rxnormResolver = (
    EnsembleEntityResolverModel.pretrained(rxnorm_model_name, models_language, models_repository)
    .setEnableLevenshtein(True)
    .setNeighbours(200).setAlternatives(5).setDistanceWeights([1,3,2,0,0,1])
    .setInputCols(chunk_token_drug_col, chunk_embs_drug_col)
    .setOutputCol(rxnorm_col)
)

chunkresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 166.3 MB
[OK!]
ensembleresolve_rxnorm_clinical download started this may take some time.
Approximate size to download 783.3 MB
[OK!]


In [15]:
# Full pipeline up to and including the RxNorm resolver.
# The ICD10CM resolver is not a stage here; it is applied in a later cell.
pipeline_stages = [
    docAssembler,
    sentenceDetector,
    tokenizer,
    embeddings,
    jslNer,
    drugNer,
    jslConverter,
    drugConverter,
    jslChunkEmbeddings,
    drugChunkEmbeddings,
    jslChunkTokenizer,
    drugChunkTokenizer,
    rxnormResolver,
]
pipelineFull = Pipeline(stages=pipeline_stages)

In [16]:
# Fit and run the pipeline, then round-trip the result through disk: persisting temporarily
# keeps DAG size and resource usage low (Ensemble Resolvers are resource intensive).
pipelineModelFull = pipelineFull.fit(data)
pipelineModelFull.transform(data).write.mode("overwrite").save("temp")
output = spark.read.load("temp")

In [17]:
# Resolve ICD10CM codes with a separate transform on the reloaded pipeline output, and cache the result.
# NOTE(review): `output` is reassigned from itself, so running this cell a second time feeds
# already-resolved data back into the resolver; re-run the reload cell first.
output = icd10cmResolver.transform(output).cache()

## The last stage of our pipeline: the **entity resolvers**

### EntityResolver:  
The pretrained resolvers used in this notebook (ICD10CM for diagnoses, RxNorm for drugs) provide code resolution for the matched expressions. Besides the code in the "result" field, they provide more metadata about the matching process:  

- all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
- all_k_resolutions -> Respective ResolverNormalized strings
- all_k_confidences -> Respective normalized probabilities based on inverse distance values
- all_k_distances -> Respective distance values after aggregation
- target_text -> The actual searched string
- resolved_text -> The top ResolverNormalized string
- confidence -> Top probability
- distance -> Top distance value
- sentence -> Sentence index
- chunk -> Chunk Index
- token -> Token index

In [18]:
def quick_metadata_analysis(df, doc_field, chunk_field, code_fields):
    """Flatten chunk annotations and their resolver metadata into one row per chunk.

    Parameters
    ----------
    df : DataFrame
        Must expose `selectExpr` and `orderBy`, and contain `doc_field`, the
        annotation column `chunk_field` and every column in `code_fields`.
    doc_field : str
        Name of the document-id column; also used as the first sort key.
    chunk_field : str
        Name of the chunk annotation column (begin/end/result/metadata are read).
    code_fields : list of str
        Names of resolver annotation columns (metadata is read). The text before
        the first "_" of each name prefixes its output columns. May be empty.

    Returns
    -------
    DataFrame
        Columns `coords` ("<doc>::<begin>::<end>"), `chunk`, `entity`, and for
        each resolver `<prefix>_conf` and `<prefix>_opts`, ordered by document,
        chunk begin and chunk end.
    """
    # The zipped struct is addressed by position below:
    # '0'=begin, '1'=end, '2'=result, '3'=chunk metadata, '4'+ = one metadata per resolver.
    zipped_cols = [f"{chunk_field}.{attr}" for attr in ("begin", "end", "result", "metadata")]
    zipped_cols.extend(f"{cf}.metadata" for cf in code_fields)
    # Joining the full list avoids a trailing comma when `code_fields` is empty.
    exploded = f"explode(arrays_zip({', '.join(zipped_cols)})) as a"

    resolver_exprs = []
    for pos, cf in enumerate(code_fields, start=4):
        prefix = cf.split('_')[0]
        resolver_exprs.append(f"float(a['{pos}'].confidence) as {prefix}_conf")
        resolver_exprs.append(
            f"arrays_zip(split(a['{pos}'].all_k_results,':::'),split(a['{pos}'].all_k_resolutions,':::')) as {prefix}_opts"
        )

    # Sort by the `doc_field` argument (not a notebook-level global) so the
    # function works for any document column name it is given.
    return (
        df.selectExpr(doc_field, exploded)
        .orderBy(doc_field, F.expr("a['0']"), F.expr("a['1']"))
        .selectExpr(
            f"concat_ws('::',{doc_field},a['0'],a['1']) as coords",
            "a['2'] as chunk",
            "a['3'].entity as entity",
            *resolver_exprs
        )
    )

In [19]:
# Per-chunk ICD10CM resolution summary, collected into pandas for display
icd10cm_analysis = quick_metadata_analysis(
    output, docid_col, chunk_jsl_col, [icd10cm_col]
).toPandas()

In [20]:
# Per-chunk RxNorm resolution summary, collected into pandas for display
rxnorm_analysis = quick_metadata_analysis(
    output, docid_col, chunk_drug_col, [rxnorm_col]
).toPandas()

In [21]:
# Keep only ICD10CM resolutions whose top confidence exceeds 0.25
icd10cm_analysis.loc[icd10cm_analysis["icd10cm_conf"] > 0.25]

Unnamed: 0,coords,chunk,entity,icd10cm_conf,icd10cm_opts
12,10::343::355,CMV infection,Diagnosis,0.3047,"[(B1081, Human herpesvirus 6 infection), (B72, Dracunculiasis), (A925, Zika virus disease), (B20, Human immunodeficiency virus [HIV] disease), (B59, Pneumocystosis)]"
13,10::467::478,hyponatremia,Diagnosis,0.489,"[(E871, Hypo-osmolality and hyponatremia), (E876, Hypokalemia), (E861, Hypovolemia), (E875, Hyperkalemia), (E8351, Hypocalcemia)]"


In [22]:
# Keep only RxNorm resolutions whose top confidence exceeds 0.5
rxnorm_analysis.loc[rxnorm_analysis["rxnorm_conf"] > 0.5]

Unnamed: 0,coords,chunk,entity,rxnorm_conf,rxnorm_opts
77,15::44::55,bicarbonates,DrugChem,1.0,"[(1527, BICARBONATES), (975127, bicarbonate), (975127, BICARBONATE), (8597, kalium jodatum), (8588, kalium)]"
