# ICD-O Entity Resolution - version 2.4.6


In [1]:
import sys, os, time, pandas as pd

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from sparknlp_jsl.annotator import *
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# Widen pandas' display limits so long text cells (up to 250 characters) and
# tables of up to 500 rows are rendered without being cut off.
pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_rows', 500)

In [2]:
import sparknlp_jsl
# Show the installed sparknlp_jsl version (the recorded run printed '2.4.7').
sparknlp_jsl.version()

'2.4.7'

In [3]:
# Start the Spark session through sparknlp_jsl.
# NOTE(review): "####" looks like a redacted placeholder argument; supply the
# real value before running and confirm what start() expects.
spark = sparknlp_jsl.start("####")

## Let's create a dataset with some case studies

In [4]:
# Load the CoNLL-formatted i2b2_med7 file through the Spark session.
# NOTE(review): the path is absolute and specific to one machine; point it to
# a local copy of the file before running.
data = CoNLL().readDataset(spark, "/home/fernandrez/JSL/repos/spark-nlp-training/src/main/resources/ner-posology-data/i2b2_med7.conll")

In [5]:
# Derive the helper columns and keep only the 20 longest notes:
#   text_feed - lowercased text with every digit surrounded by spaces
#   length    - length of the original text
#   doc_id    - generated identifier for each row
data = (
    data
    .withColumn("text_feed", F.expr("regexp_replace(lower(text),'([0-9])',' $1 ')"))
    .withColumn("length", F.expr("length(text)"))
    .withColumn("doc_id", F.monotonically_increasing_id())
    .orderBy("length", ascending=False)
    .limit(20)
)

In [6]:
# Render every selected note inside its own bordered box for visual inspection.
import html

from IPython.display import HTML, display

# The note text is escaped before being embedded: characters such as '<' or
# '&' in a note would otherwise be parsed as markup and corrupt the output.
html_output = "".join(
    f'Note {i}:'
    '<div style="border:2px solid #747474; background-color: #e3e3e3; margin: 5px; padding: 10px">'
    f'{html.escape(d[0])}'
    '</div><br/>'
    for i, d in enumerate(data.select("text").collect())
)

display(HTML(html_output))

In [7]:
# Notes column names

docid_col         = "doc_id"     # generated identifier column added to `data`
note_col          = "text_feed"  # normalized text column read by the DocumentAssembler

## And let's build a SparkNLP pipeline with the following stages:
- DocumentAssembler: Entry annotator for our pipelines; it creates the data structure for the Annotation Framework
- SentenceDetector: Annotator to pragmatically separate complete sentences inside each document
- Tokenizer: Annotator to separate sentences into tokens (generally words)
- StopWordsCleaner: Annotator to remove words defined as StopWords in SparkML
- WordEmbeddings: Vectorization of word tokens, in this case using word embeddings trained from PubMed, ICD10 and other clinical resources.
- ChunkEmbeddings: Aggregates the WordEmbeddings for each NER Chunk
- BioNLP NER + NerConverter: These annotators return chunks related to Cancer and Genetics diseases
- ChunkEntityResolver: Annotator that performs search for the KNNs, in this case trained from ICDO Histology Behavior.

In [8]:
#Language and Model Repository

models_language   = "en"               # language argument passed to every pretrained() call
models_repository = "clinical/models"  # repository argument passed to every pretrained() call

In [9]:
# Embeddings Pretrained Model Name

embeddings_name   = "embeddings_clinical"

# Preparation and embeddings column names

doc_col           = "document"
sent_col          = "sentence"
token_col         = "token"
embeddings_col    = "embeddings"

# Usual preparation Annotators and Embeddings

docAssembler = DocumentAssembler().setInputCol(note_col).setOutputCol(doc_col)
sentenceDetector = SentenceDetector().setInputCols(doc_col).setOutputCol(sent_col)

# Characters used to split tokens; the same list is reused by the chunk tokenizers.
# "\\/" is the same two characters (backslash + slash) the former "\/" produced,
# written without relying on an invalid escape sequence.
tokenizer_chars = [",","\\/"," ",".","|","@","#","%","&","\\$","\\[","\\]","\\(","\\)","\\-",";"]
tokenizer = (
    Tokenizer()
    .setSplitChars(tokenizer_chars)
    .setInputCols(sent_col)
    .setOutputCol(token_col)
)

# Pretrained word embeddings computed per sentence/token pair.
embeddings = (
    WordEmbeddingsModel.pretrained(embeddings_name, models_language, models_repository)
    .setInputCols(sent_col, token_col)
    .setOutputCol(embeddings_col)
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [10]:
# Names of the pretrained NER models
jsl_ner_name  = "ner_jsl"
drug_ner_name = "ner_drugs"

# Output columns for the NER tags
ner_jsl_col  = "ner_jsl"
ner_drug_col = "ner_drug"

# Both NER models read the same sentence, token and embeddings columns and
# write their tags to separate output columns.
jslNer = (
    NerDLModel.pretrained(jsl_ner_name, models_language, models_repository)
    .setInputCols(sent_col, token_col, embeddings_col)
    .setOutputCol(ner_jsl_col)
)

drugNer = (
    NerDLModel.pretrained(drug_ner_name, models_language, models_repository)
    .setInputCols(sent_col, token_col, embeddings_col)
    .setOutputCol(ner_drug_col)
)

ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_drugs download started this may take some time.
Approximate size to download 13.8 MB
[OK!]


In [11]:
# Output columns for the converted chunks
chunk_jsl_col  = "chunk_jsl"
chunk_drug_col = "chunk_drug"

# NerConverter turns IOB tags into full chunks (sequences of tokens) tagged
# with `entity` metadata. The JSL converter keeps only "Diagnosis" chunks.
jslConverter = (
    NerConverter()
    .setInputCols(sent_col, token_col, ner_jsl_col)
    .setOutputCol(chunk_jsl_col)
    .setWhiteList(["Diagnosis"])
)
drugConverter = (
    NerConverter()
    .setInputCols(sent_col, token_col, ner_drug_col)
    .setOutputCol(chunk_drug_col)
)

In [12]:
# Output columns for the chunk-level embeddings
chunk_embs_jsl_col  = "chunk_embs_jsl"
chunk_embs_drug_col = "chunk_embs_drug"

# ChunkEmbeddings aggregates the token embeddings of each chunk; one annotator
# per chunk column.
jslChunkEmbeddings = (
    ChunkEmbeddings()
    .setInputCols(chunk_jsl_col, embeddings_col)
    .setOutputCol(chunk_embs_jsl_col)
)
drugChunkEmbeddings = (
    ChunkEmbeddings()
    .setInputCols(chunk_drug_col, embeddings_col)
    .setOutputCol(chunk_embs_drug_col)
)

In [13]:
# Output columns for the tokenized chunks
chunk_token_jsl_col  = "chunk_token_jsl"
chunk_token_drug_col = "chunk_token_drug"

# ChunkTokenizer tokenizes each chunk on its own, reusing the split characters
# configured for the main tokenizer.
jslChunkTokenizer = (
    ChunkTokenizer()
    .setSplitChars(tokenizer_chars)
    .setInputCols(chunk_jsl_col)
    .setOutputCol(chunk_token_jsl_col)
)
drugChunkTokenizer = (
    ChunkTokenizer()
    .setSplitChars(tokenizer_chars)
    .setInputCols(chunk_drug_col)
    .setOutputCol(chunk_token_drug_col)
)

In [14]:
# Entity Resolution Pretrained Model Names
# NOTE: "chunkresolve_icd10cm_clinical" (loaded with ChunkEntityResolverModel)
# is the non-ensemble alternative for ICD10CM; the ensemble models are used here.

icd10cm_model_name = "ensembleresolve_icd10cm_clinical"
rxnorm_model_name  = "ensembleresolve_rxnorm_clinical"

# Entity Resolution column Names

icd10cm_col = "icd10cm_resolution"
rxnorm_col  = "rxnorm_resolution"

# Both resolvers share the same search configuration and differ only in the
# pretrained model, the input columns (tokenized chunks + chunk embeddings)
# and the output column. `pretrained` is called on the class in both cases so
# no throwaway annotator instance is created.
icd10cmResolver = (
    EnsembleEntityResolverModel.pretrained(icd10cm_model_name, models_language, models_repository)
    .setEnableLevenshtein(True)
    .setNeighbours(200).setAlternatives(5).setDistanceWeights([1,3,2,0,0,1])
    .setExtramassPenalty(3).setPoolingStrategy("MAX")
    .setInputCols(chunk_token_jsl_col, chunk_embs_jsl_col)
    .setOutputCol(icd10cm_col)
)

rxnormResolver = (
    EnsembleEntityResolverModel.pretrained(rxnorm_model_name, models_language, models_repository)
    .setEnableLevenshtein(True)
    .setNeighbours(200).setAlternatives(5).setDistanceWeights([1,3,2,0,0,1])
    .setExtramassPenalty(3).setPoolingStrategy("MAX")
    .setInputCols(chunk_token_drug_col, chunk_embs_drug_col)
    .setOutputCol(rxnorm_col)
)

ensembleresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 345 MB
[OK!]
ensembleresolve_rxnorm_clinical download started this may take some time.
Approximate size to download 783.3 MB
[OK!]


In [15]:
# Stages are listed in dependency order. icd10cmResolver is not part of this
# pipeline; only the RxNorm resolver runs inside it.
pipelineFull = Pipeline().setStages([
    # text preparation
    docAssembler, sentenceDetector, tokenizer, embeddings,
    # entity recognition
    jslNer, drugNer,
    # IOB tags -> chunks
    jslConverter, drugConverter,
    # chunk-level embeddings
    jslChunkEmbeddings, drugChunkEmbeddings,
    # chunk-level tokens
    jslChunkTokenizer, drugChunkTokenizer,
    # resolution
    rxnormResolver,
])

In [16]:
# Persisting temporarily to keep DAG size and resource usage low (Ensemble Resolvers are resource intensive)
pipelineModelFull = pipelineFull.fit(data)
output = pipelineModelFull.transform(data)
# Write the transformed data to the "temp" path (replacing any previous run)
# and read it back, so later steps start from the stored result.
output.write.mode("overwrite").save("temp")
output = spark.read.load("temp")

In [21]:
# Apply the ICD10CM resolver to the reloaded pipeline output and cache the
# result, since both analysis calls that follow read from `output`.
output = icd10cmResolver.transform(output).cache()

## The last part of our pipeline is the **ChunkEntityResolvers**: 

### EntityResolver:  
Trained on an augmented ICDO Dataset from JSL Data Market it provides histology codes resolution for the matched expressions. Other than providing the code in the "result" field it provides more metadata about the matching process:  

- all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
- all_k_resolutions -> Respective ResolverNormalized strings
- all_k_confidences -> Respective normalized probabilities based on inverse distance values
- all_k_distances -> Respective distance values after aggregation
- all_k_wmd_distances -> Respective WMD distance values (added if allDistancesMetadata==True)
- all_k_tfidf_distances -> Respective TFIDF Cosine distance values (added if allDistancesMetadata==True)
- all_k_jaccard_distances -> Respective Jaccard distance values (added if allDistancesMetadata==True)
- all_k_sorensen_distances -> Respective SorensenDice distance values (added if allDistancesMetadata==True)
- all_k_jaro_distances -> Respective JaroWinkler distance values (added if allDistancesMetadata==True)
- all_k_levenshtein_distances -> Respective Levenshtein distance values (added if allDistancesMetadata==True)
- target_text -> The actual searched string
- resolved_text -> The top ResolverNormalized string
- confidence -> Top probability
- distance -> Top distance value
- sentence -> Sentence index
- chunk -> Chunk Index
- token -> Token index

In [27]:
def quick_metadata_analysis(df, doc_field, chunk_field, code_fields):
    """Build a flat, per-chunk view of chunk annotations and resolver metadata.

    Every annotation in ``chunk_field`` is zipped with the metadata of each
    column in ``code_fields`` and exploded into its own row. Rows are ordered
    by document, chunk begin and chunk end.

    Args:
        df: DataFrame holding ``doc_field``, ``chunk_field`` and every column
            named in ``code_fields``.
        doc_field: Name of the document identifier column.
        chunk_field: Name of the chunk annotation column; its ``begin``,
            ``end``, ``result`` and ``metadata`` fields are read.
        code_fields: Names of the resolver output columns. The text before the
            first ``_`` of each name prefixes that resolver's output columns.

    Returns:
        DataFrame with ``coords`` (``doc::begin::end``), ``chunk``, ``entity``
        and, for each code field, ``<prefix>_conf`` and ``<prefix>_opts``.
    """
    # The zipped struct is addressed by position: '0'-'3' are the chunk's
    # begin, end, result and metadata; resolver metadata starts at '4'.
    first_code_idx = 4
    zipped_cols = [f"{chunk_field}.{part}" for part in ("begin", "end", "result", "metadata")]
    # Joining one list keeps the SQL valid even when code_fields is empty.
    zipped_cols += [f"{cf}.metadata" for cf in code_fields]
    expression = f"explode(arrays_zip({', '.join(zipped_cols)})) as a"

    top_n_rest_args = []
    for idx, cf in enumerate(code_fields, start=first_code_idx):
        prefix = cf.split('_')[0]
        top_n_rest_args.append(f"float(a['{idx}'].confidence) as {prefix}_conf")
        top_n_rest_args.append(
            f"arrays_zip(split(a['{idx}'].all_k_results,':::'),"
            f"split(a['{idx}'].all_k_resolutions,':::')) as {prefix}_opts"
        )

    # Order by the caller-supplied document column rather than a notebook
    # global, so the function works for any doc_field.
    return df.selectExpr(doc_field, expression) \
        .orderBy(doc_field, F.expr("a['0']"), F.expr("a['1']"))\
        .selectExpr(f"concat_ws('::',{doc_field},a['0'],a['1']) as coords", "a['2'] as chunk","a['3'].entity as entity", *top_n_rest_args)

In [29]:
# Collect the per-chunk ICD10CM resolution view into a pandas DataFrame.
icd10cm_analysis = quick_metadata_analysis(
    output, docid_col, chunk_jsl_col, [icd10cm_col]
).toPandas()

In [30]:
# Collect the per-chunk RxNorm resolution view into a pandas DataFrame.
rxnorm_analysis = quick_metadata_analysis(
    output, docid_col, chunk_drug_col, [rxnorm_col]
).toPandas()

In [34]:
# Keep only the ICD10CM resolutions whose top confidence exceeds 0.5.
icd10cm_analysis[icd10cm_analysis["icd10cm_conf"] > 0.5]

Unnamed: 0,coords,chunk,entity,icd10cm_conf,icd10cm_opts
0,1419::122::144,coronary artery disease,Diagnosis,0.9835,"[(I2510, Atherosclerotic heart disease of native coronary artery without angina pectoris), (I739, Peripheral vascular disease, unspecified), (I2542, Coronary artery dissection), (I2541, Coronary artery aneurysm), (I2582, Chronic total occlusion o..."
1,17179871651::72::85,"hypertension ,",Diagnosis,0.9663,"[(I701, Atherosclerosis of renal artery), (I270, Primary pulmonary hypertension), (I2720, Pulmonary hypertension, unspecified), (I158, Other secondary hypertension), (I2729, Other secondary pulmonary hypertension)]"
3,17179871651::124::132,gastritis,Diagnosis,0.9995,"[(K2970, Gastritis, unspecified, without bleeding), (K2900, Acute gastritis without bleeding), (K2920, Alcoholic gastritis without bleeding), (K2960, Other gastritis without bleeding), (K5281, Eosinophilic gastritis or gastroenteritis)]"
4,17179871866::84::102,dilated left atrium,Diagnosis,0.9961,"[(Q242, Cor triatriatum), (Q173, Other misshapen ear), (Q105, Congenital stenosis and stricture of lacrimal duct), (Q201, Double outlet right ventricle), (Q226, Hypoplastic right heart syndrome)]"
6,17179871866::467::487,pericardial effusions,Diagnosis,0.7889,"[(I313, Pericardial effusion (noninflammatory)), (I314, Cardiac tamponade), (I318, Other specified diseases of pericardium), (I898, Other specified noninfective disorders of lymphatic vessels and lymph nodes), (I301, Infective pericarditis)]"
7,25769803781::46::58,breast cancer,Diagnosis,0.9871,"[(C50919, Malignant neoplasm of unspecified site of unspecified female breast), (C61, Malignant neoplasm of prostate), (C23, Malignant neoplasm of gallbladder), (C569, Malignant neoplasm of unspecified ovary), (C541, Malignant neoplasm of endomet..."
9,25769803781::151::164,hyperlipidemia,Diagnosis,0.9998,"[(E785, Hyperlipidemia, unspecified), (E7849, Other hyperlipidemia), (E781, Pure hyperglyceridemia), (E7801, Familial hypercholesterolemia), (E782, Mixed hyperlipidemia)]"
10,25769803781::177::200,urinary tract infections,Diagnosis,0.5388,"[(N390, Urinary tract infection, site not specified), (N139, Obstructive and reflux uropathy, unspecified), (N320, Bladder-neck obstruction), (N209, Urinary calculus, unspecified), (N219, Calculus of lower urinary tract, unspecified)]"
12,25769804562::70::89,myelogenous leukemia,Diagnosis,0.5312,"[(C9500, Acute leukemia of unspecified cell type not having achieved remission), (C9510, Chronic leukemia of unspecified cell type not having achieved remission), (C92Z0, Other myeloid leukemia not having achieved remission), (C9210, Chronic myel..."
13,34359739832::551::558,insomnia,Diagnosis,0.9999,"[(G4700, Insomnia, unspecified), (G4709, Other insomnia), (G4719, Other hypersomnia), (G4710, Hypersomnia, unspecified), (G4453, Primary thunderclap headache)]"


In [33]:
# Keep only the RxNorm resolutions whose top confidence exceeds 0.5.
rxnorm_analysis[rxnorm_analysis["rxnorm_conf"] > 0.5]

Unnamed: 0,coords,chunk,entity,rxnorm_conf,rxnorm_opts
0,1419::89::99,cholesterol,DrugChem,0.9995,"[(2438, Cholesterol), (1366924, cholesterol sulfate), (10808, TRIGLYCERIDES), (6433, VLDL cholesterol), (1368200, cholesterol oleate)]"
1,17179871651::404::411,atropine,DrugChem,0.8091,"[(1223, Atropine), (92995, Isopto Atropine), (7971, pempidine), (153971, Atropine sulphate), (26839, Hexamethonium)]"
2,17179871651::439::446,dopamine,DrugChem,0.9996,"[(3628, Dopamine), (62342, Dopamine agonists), (82010, dopamine HCl), (7512, Norepinephrine), (2146, Catecholamine)]"
3,25769803781::107::115,tamoxifen,DrugChem,0.7936,"[(10324, Tamoxifen), (606421, GenRx Tamoxifen), (40137, Tamoxifen Citrate), (72143, Keoxifene), (72965, Letrozole)]"
4,25769803877::278::286,adenosine,DrugChem,0.9995,"[(296, Adenosine), (318, Adenosine triphosphate), (359140, Adenosine Powder), (655027, ADENOSINE MONOPHOSPHATE PWDR), (309, Adenosine Monophosphate)]"
7,34359739832::0::6,tylenol,DrugChem,0.5824,"[(202433, Tylenol), (1192485, Children's Tylenol), (353059, Infant's Tylenol), (692659, Tylenol GoTabs), (1192491, Children's Tylenol Meltaway)]"
8,34359739832::67::75,acyclovir,DrugChem,0.9995,"[(281, Acyclovir), (10803, Trifluridine), (73645, valaciclovir), (33562, Foscarnet), (83171, Cidofovir)]"
9,34359739832::105::113,acyclovir,DrugChem,0.9995,"[(281, Acyclovir), (10803, Trifluridine), (73645, valaciclovir), (33562, Foscarnet), (83171, Cidofovir)]"
10,34359739832::276::285,folic acid,DrugChem,0.7348,"[(4511, Folic acid), (225860, Cantassium folic acid), (237787, Folic Acid Powder), (1431568, Folic acid analogues), (1151, Ascorbic acid)]"
11,34359739832::307::315,lorazepam,DrugChem,0.7309,"[(6470, Lorazepam), (672652, LORazepam Novaplus), (74667, Zaleplon), (381029, Diazepam RecTube), (10767, Triazolam)]"
