# ICD-O Entity Resolution - version 2.4.6


In [1]:
import sys, os, time, pandas as pd

sys.path.append("/home/fernandrez/JSL/repos/spark-nlp/python")
sys.path.append("/home/fernandrez/JSL/repos/spark-nlp-internal/python")

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from sparknlp_jsl.annotator import *
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_rows', 500)

In [2]:
import sparknlp_jsl
sparknlp_jsl.version()

'2.4.7'

In [3]:
spark = sparknlp_jsl.start("####")

## Let's create a dataset with all some case studies

In [4]:
data = CoNLL().readDataset(spark, "/home/fernandrez/JSL/repos/spark-nlp-training/src/main/resources/ner-posology-data/i2b2_med7.conll")

In [23]:
data = data\
.withColumn("text_feed", F.expr("regexp_replace(lower(text),'([0-9])',' $1 ')"))\
.withColumn("length", F.expr("length(text)"))\
.withColumn("doc_id", F.monotonically_increasing_id())\
.orderBy("length", ascending=False).limit(20)

SyntaxError: unexpected character after line continuation character (<ipython-input-23-2fed4cd01ccb>, line 4)

In [6]:
from IPython.core.display import display, HTML

html_output=""
for i, d in enumerate(data.select("text").collect()):
    html_output += f'Note {i}:'
    html_output +='<div style="border:2px solid #747474; background-color: #e3e3e3; margin: 5px; padding: 10px">'
    html_output += d[0]
    html_output += '</div><br/>'

display(HTML(html_output))

In [7]:
# Notes column names

docid_col         = "doc_id"
note_col          = "text_feed"

## And let's build a SparkNLP pipeline with the following stages:
- DocumentAssembler: Entry annotator for our pipelines; it creates the data structure for the Annotation Framework
- SentenceDetector: Annotator to pragmatically separate complete sentences inside each document
- Tokenizer: Annotator to separate sentences in tokens (generally words)
- StopWordsCleaner: Annotator to remove words defined as StopWords in SparkML
- WordEmbeddings: Vectorization of word tokens, in this case using word embeddings trained from PubMed, ICD10 and other clinical resources.
- ChunkEmbeddings: Aggregates the WordEmbeddings for each NER Chunk
- BioNLP NER + NerConverter: This annotators return Chunks related to Cancer and Genetics diseases
- ChunkEntityResolver: Annotator that performs search for the KNNs, in this case trained from ICDO Histology Behavior.

In [8]:
#Language and Model Repository

models_language   = "en"
models_repository = "clinical/models"

In [9]:
# Embeddings Pretrained Model Name

embeddings_name   = "embeddings_clinical"

# Preparation and embeddings column names

doc_col           = "document"
sent_col          = "sentence"
token_col         = "token"
embeddings_col    = "embeddings"

# Usual preparation Annotators and Embeddings

docAssembler = DocumentAssembler().setInputCol(note_col).setOutputCol(doc_col)
sentenceDetector = SentenceDetector().setInputCols(doc_col).setOutputCol(sent_col)
tokenizer_chars = [",","\/"," ",".","|","@","#","%","&","\\$","\\[","\\]","\\(","\\)","\\-",";"]
tokenizer = Tokenizer().setSplitChars(tokenizer_chars)\
    .setInputCols(sent_col).setOutputCol(token_col)
embeddings = WordEmbeddingsModel.pretrained(embeddings_name, models_language, models_repository)\
    .setInputCols(sent_col, token_col)\
    .setOutputCol(embeddings_col)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [10]:
# NER Pretrained Model Names

jsl_ner_name  = "ner_jsl"
drug_ner_name = "ner_drugs"

# NER Column Names

ner_jsl_col  = "ner_jsl"
ner_drug_col = "ner_drug"

# Annotators responsible for the Cancer Genetics Entity Recognition task

jslNer = NerDLModel.pretrained(jsl_ner_name, models_language, models_repository)\
    .setInputCols(sent_col, token_col, embeddings_col)\
    .setOutputCol(ner_jsl_col)

drugNer = NerDLModel.pretrained(drug_ner_name, models_language, models_repository)\
    .setInputCols(sent_col, token_col, embeddings_col)\
    .setOutputCol(ner_drug_col)

ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_drugs download started this may take some time.
Approximate size to download 13.8 MB
[OK!]


In [11]:
# Chunk column names

chunk_jsl_col  = "chunk_jsl"
chunk_drug_col = "chunk_drug"

#Converter annotators transform IOB tags into full chunks (sequence set of tokens) tagged with `entity` metadata

jslConverter = NerConverter().setInputCols(sent_col, token_col, ner_jsl_col)\
    .setOutputCol(chunk_jsl_col).setWhiteList(["Diagnosis"])
drugConverter = NerConverter().setInputCols(sent_col, token_col, ner_drug_col)\
    .setOutputCol(chunk_drug_col)

In [12]:
# ChunkEmbeddings column names

chunk_embs_jsl_col  = "chunk_embs_jsl"
chunk_embs_drug_col = "chunk_embs_drug"

#ChunkEmbeddings annotators aggregate embeddings for each token in the chunk

jslChunkEmbeddings = ChunkEmbeddings()\
  .setInputCols(chunk_jsl_col, embeddings_col)\
  .setOutputCol(chunk_embs_jsl_col)
drugChunkEmbeddings = ChunkEmbeddings()\
  .setInputCols(chunk_drug_col, embeddings_col)\
  .setOutputCol(chunk_embs_drug_col)

In [13]:
# ChunkTokenizer column names

chunk_token_jsl_col  = "chunk_token_jsl"
chunk_token_drug_col = "chunk_token_drug"

# ChunkTokenizer provides extra flexibility at the time of tokenizing a given chunk

jslChunkTokenizer = ChunkTokenizer().setSplitChars(tokenizer_chars)\
    .setInputCols(chunk_jsl_col).setOutputCol(chunk_token_jsl_col)
drugChunkTokenizer = ChunkTokenizer().setSplitChars(tokenizer_chars)\
    .setInputCols(chunk_drug_col).setOutputCol(chunk_token_drug_col)

In [14]:
# Entity Resolution Pretrained Model Names

icd10cm_model_name = "chunkresolve_icd10cm_clinical"
rxnorm_model_name  = "ensembleresolve_rxnorm_clinical"

# Entity Resolution column Names

icd10cm_col = "icd10cm_resolution"
rxnorm_col  = "rxnorm_resolution"
    
icd10cmResolver = ChunkEntityResolverModel\
    .pretrained(icd10cm_model_name, models_language, models_repository)\
    .setEnableLevenshtein(True)\
    .setNeighbours(200).setAlternatives(5).setDistanceWeights([1,3,2,0,0,1])\
    .setExtramassPenalty(3).setPoolingStrategy("MAX")\
    .setInputCols(chunk_token_jsl_col, chunk_embs_jsl_col)\
    .setOutputCol(icd10cm_col)

rxnormResolver = EnsembleEntityResolverModel()\
    .pretrained(rxnorm_model_name, models_language, models_repository)\
    .setEnableLevenshtein(True)\
    .setNeighbours(200).setAlternatives(5).setDistanceWeights([1,3,2,0,0,1])\
    .setExtramassPenalty(3).setPoolingStrategy("MAX")\
    .setInputCols(chunk_token_drug_col, chunk_embs_drug_col)\
    .setOutputCol(rxnorm_col)\

chunkresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 166.3 MB
[OK!]
ensembleresolve_rxnorm_clinical download started this may take some time.
Approximate size to download 783.3 MB
[OK!]


In [15]:
pipelineFull = Pipeline().setStages([
    docAssembler, 
    sentenceDetector, 
    tokenizer, 
    embeddings, 
    jslNer,
    drugNer,
    jslConverter,
    drugConverter,
    jslChunkEmbeddings, 
    drugChunkEmbeddings,
    jslChunkTokenizer,
    drugChunkTokenizer,
    icd10cmResolver,
    rxnormResolver
])

In [16]:
pipelineModelFull = pipelineFull.fit(data)

In [17]:
output = pipelineModelFull.transform(data).cache()

## The key parts of our pipeline are the **WordEmbeddings, NerDLModel and ChunkEntityResolver**: 

### WordEmebeddings:   
Word2Vec model trained on semantically augmented datasets using information from curated Datasets in JSL Data Market.  

### EntityResolver:  
Trained on an augmented ICDO Dataset from JSL Data Market it provides histology codes resolution for the matched expressions. Other than providing the code in the "result" field it provides more metadata about the matching process:  

- all_k_results -> Sorted ResolverLabels in the top `alternatives` that match the distance `threshold`
- all_k_resolutions -> Respective ResolverNormalized strings
- all_k_distances -> Respective distance values after aggregation
- all_k_wmd_distances -> Respective WMD distance values (added if allDistancesMetadata==True)
- all_k_tfidf_distances -> Respective TFIDF Cosine distance values (added if allDistancesMetadata==True)
- all_k_jaccard_distances -> Respective Jaccard distance values (added if allDistancesMetadata==True)
- all_k_sorensen_distances -> Respective SorensenDice distance values (added if allDistancesMetadata==True)
- all_k_jaro_distances -> Respective JaroWinkler distance values (added if allDistancesMetadata==True)
- all_k_levenshtein_distances -> Respective Levenshtein distance values (added if allDistancesMetadata==True)
- all_k_confidences -> Respective normalized probabilities based in inverse distance valuesprobability (added if allDistancesMetadata==True)
- target_text -> The actual searched string
- resolved_text -> The top ResolverNormalized string
- confidence -> Top probability
- distance -> Top distance value
- sentence -> Sentence index
- chunk -> Chunk Index
- token -> Token index

In [18]:
def quick_metadata_analysis(df, doc_field, chunk_field, code_fields, dist_thres=1):
    code_res_meta = ", ".join([f"{cf}.result, {cf}.metadata" for cf in code_fields])
    expression = f"explode(arrays_zip({chunk_field}.begin, {chunk_field}.end, {chunk_field}.result, {chunk_field}.metadata, "+code_res_meta+")) as a"
    top_n_rest = [(f"a['{2*i+4}'] as {(cf.split('_')[0])}",
                   f"float(a['{2*i+5}'].distance) as {(cf.split('_')[0])}_dst",
                   f"a['{2*i+5}'].confidence as {(cf.split('_')[0])}_conf",
                    f"split(a['{2*i+5}'].all_k_resolutions,':::') as {cf.split('_')[0]+'_opts'}")
                    for i, cf in enumerate(code_fields)]
    top_n_rest_args = []
    for tr in top_n_rest:
        for t in tr:
            top_n_rest_args.append(t)
    return df.selectExpr(doc_field, expression) \
        .orderBy(docid_col, F.expr("a['0']"), F.expr("a['1']"))\
        .selectExpr(f"concat_ws('::',{doc_field},a['0'],a['1']) as coords", "a['2'] as chunk","a['3'].entity as entity", *top_n_rest_args)

In [19]:
icd10cm_analysis = \
quick_metadata_analysis(output, docid_col, chunk_jsl_col,[icd10cm_col], 1).toPandas()

In [20]:
rxnorm_analysis = \
quick_metadata_analysis(output, docid_col, chunk_drug_col,[rxnorm_col], 1).toPandas()

In [21]:
icd10cm_analysis[icd10cm_analysis.rxnorm_dst<0.75]

Unnamed: 0,coords,chunk,entity,icd10cm,icd10cm_dst,icd10cm_conf,icd10cm_opts
0,1::460::471,hypertension,Diagnosis,P292,0.7822,0.2415,"[Neonatal hypertension, Postprocedural hypertension, Portal hypertension, Renovascular hypertension, Secondary hypertension, unspecified]"
1,2::551::558,insomnia,Diagnosis,G4709,0.6428,0.2355,"[Other insomnia, Primary insomnia, Adjustment insomnia, Paradoxical insomnia, Psychophysiologic insomnia]"
2,4::72::85,"hypertension ,",Diagnosis,I973,0.697,0.2259,"[Postprocedural hypertension, Neonatal hypertension, Portal hypertension, Renovascular hypertension, Other secondary hypertension]"
3,4::117::120,copd,Diagnosis,R0902,1.3677,0.3386,"[Hypoxemia, Respiratory arrest, Chronic adenoiditis, Thromboangiitis obliterans [Buerger's disease], Postprocedural hypertension]"
4,4::124::132,gastritis,Diagnosis,K3183,1.375,0.2392,"[Achlorhydria, Celiac disease, Trichuriasis, Opisthorchiasis, Calicivirus enteritis]"
5,6::51::90,restrictive and obstructive lung disease,Diagnosis,J449,0.6463,0.3183,"[Chronic obstructive pulmonary disease, unspecified, Suberosis, Interstitial pulmonary disease, unspecified, Farmer's lung, Desquamative interstitial pneumonia]"
6,6::263::288,fixed inferolateral defect,Diagnosis,Q214,0.9813,0.2379,"[Aortopulmonary septal defect, Cardiac septal defect, acquired, Congenital diaphragmatic hernia, Atrioventricular septal defect, Ventricular septal defect]"
7,6::292::311,systolic dysfunction,Diagnosis,N522,0.8949,0.2883,"[Drug-induced erectile dysfunction, Male erectile disorder, Other testicular dysfunction, Corporo-venous occlusive erectile dysfunction, Outlet dysfunction constipation]"
8,6::468::482,aortic stenosis,Diagnosis,Q253,0.0,0.3179,"[Supravalvular aortic stenosis, Atherosclerosis of aorta, Congenital stenosis of aortic valve, Rheumatic aortic stenosis, Williams syndrome]"
9,7::122::144,coronary artery disease,Diagnosis,I2541,0.6244,0.3227,"[Coronary artery aneurysm, Chronic total occlusion of coronary artery, Coronary angioplasty status, Coronary artery dissection, Atherosclerosis of renal artery]"


In [22]:
rxnorm_analysis[rxnorm_analysis.rxnorm_dst<0.75]

Unnamed: 0,coords,chunk,entity,rxnorm,rxnorm_dst,rxnorm_conf,rxnorm_opts
0,0::0::10,pentamidine,DrugChem,7994,0.0,0.3457,"[Pentamidine, Pentamidine diisothionate, Pentamidine Isethionate, Pentamidine only product, Propamidine]"
1,0::47::57,pentamidine,DrugChem,7994,0.0,0.3457,"[Pentamidine, Pentamidine diisothionate, Pentamidine Isethionate, Pentamidine only product, Propamidine]"
2,0::116::127,voriconazole,DrugChem,121243,0.0,0.3846,"[Voriconazole, Voriconazole Powder, Posaconazole, Itraconazole, Fluconazole]"
3,0::158::166,acyclovir,DrugChem,281,0.0,0.3513,"[Acyclovir, Acyclovir Powder, Acyclovir Sodium, Ganciclovir, valaciclovir]"
4,0::197::208,cyclosporine,DrugChem,3008,0.0,0.3246,"[Cyclosporine, Cyclosporine Powder, cycloSPORINE ophthalmic, CycloSPORINE Modified, Tacrolimus]"
5,0::236::245,prednisone,DrugChem,8640,0.0,0.2489,"[Prednisone, PREDNISONE PWDR, Prednisone Intensol, Prednisone acetate, PredniSONE Anhydrous]"
6,0::375::385,simethicone,DrugChem,9796,0.0,0.3288,"[Simethicone, Simethicone Powder, Simethicone Solution, simethicone emulsion, Simethicone only product]"
7,0::413::425,nitroglycerin,DrugChem,4917,0.0,0.4393,"[Nitroglycerin, Nitroglycerin ER, Caffeine / Nitroglycerin, Isosorbide, Nicorandil]"
8,0::467::477,flunisolide,DrugChem,25120,0.0,0.2991,"[Flunisolide, Flunisolide Anhydrous, flunisolide nasal, Flunisolide Micronized, flunisolide hemihydrate]"
9,0::515::523,oxycodone,DrugChem,7804,0.0,0.3823,"[Oxycodone, oxycodone HCl, oxycodone pectinate, Oxycodone only product, Hydrocodone]"
