![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings_JSL/Healthcare/24.Improved_Entity_Resolvers_in_SparkNLP_with_sBert.ipynb)

# 24. Improved Entity Resolvers in Spark NLP with sBert

In [None]:
# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
! pip install -q johnsnowlabs 

In [None]:
# Colab-only helper module; this cell is meant to be run inside Google Colab.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
# The value returned by the upload call is kept in `license_keys`.
# NOTE(review): presumably the install/start cells below pick up the uploaded
# license file on their own — confirm against the johnsnowlabs docs.
license_keys = files.upload()

In [None]:
# The star import is what brings `nlp` (used below) into scope.
from johnsnowlabs import *

# After uploading your license, run this to install all licensed Python wheels
# and pre-download the jars for the Spark session JVM.
nlp.install()

In [None]:
# Brings `nlp` and `medical` (used by the pipeline cells below) into scope.
from johnsnowlabs import *

# Automatically load the license data and start a session with all jars the user has access to.
# `spark` is the session object returned by nlp.start().
spark = nlp.start()

<h1>!!! Warning !!!</h1>

**If you get an error related to "Java port not found 55", it is probably because Colab does not have enough memory for the model and the Spark session died. In that case, try a larger machine, or restart the kernel, rerun the notebook from the top, and then come back here.**

## Utility functions

In [None]:
import pandas as pd

pd.set_option('display.max_colwidth', 0)


def get_codes(lp, text, vocab='icd10cm_code', hcc=False, aux_label=False):
    """Run ``lp`` on ``text`` and tabulate the resolver candidates for each chunk.

    Parameters
    ----------
    lp : object exposing ``fullAnnotate(text)``
        Only the first element of the returned list is read. It must map
        ``'ner_chunk'`` and ``vocab`` to sequences of annotations that expose
        ``begin``, ``end``, ``result`` and ``metadata``.
    text : str
        Text handed to ``lp.fullAnnotate``.
    vocab : str
        Name of the resolver output column to read the codes from.
    hcc : bool
        When true, every auxiliary label is split on ``'||'`` into the
        ``billable``, ``hcc_status`` and ``hcc_score`` columns.
    aux_label : bool
        When true (and ``hcc`` is false), every auxiliary label is split on
        ``'|'`` into the ``gt``, ``concept`` and ``aux`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (chunk, code) pair with the columns ``chunks``, ``begin``,
        ``end``, ``code``, ``all_codes``, ``resolutions`` and
        ``all_distances``, followed by the auxiliary columns described above.
        ``all_distances`` holds the values of the ``all_k_cosine_distances``
        metadata entry.

    Raises
    ------
    KeyError
        If ``vocab`` is not an output column of ``lp`` or a required metadata
        entry (``all_k_results``, ``all_k_resolutions``,
        ``all_k_cosine_distances``) is missing.
    """
    annotations = lp.fullAnnotate(text)[0]
    wants_aux = hcc or aux_label

    def _aux_labels(metadata):
        # A missing key and an empty/None value both mean there is nothing to
        # split, so they yield an empty list instead of an error.
        if not wants_aux:
            return []
        try:
            raw = metadata['all_k_aux_labels']
        except KeyError:
            return []
        return raw.split(':::') if raw else []

    def _field(labels, separator, position):
        # A label with fewer fields than expected contributes '' rather than
        # raising IndexError and aborting the whole table.
        picked = []
        for label in labels:
            parts = label.split(separator)
            picked.append(parts[position] if position < len(parts) else '')
        return picked

    chunks, begin, end, codes = [], [], [], []
    all_codes, resolutions, all_cosines, aux_labels = [], [], [], []

    for chunk, code in zip(annotations['ner_chunk'], annotations[vocab]):
        metadata = code.metadata

        begin.append(chunk.begin)
        end.append(chunk.end)
        chunks.append(chunk.result)
        codes.append(code.result)
        all_codes.append(metadata['all_k_results'].split(':::'))
        resolutions.append(metadata['all_k_resolutions'].split(':::'))
        all_cosines.append(metadata['all_k_cosine_distances'].split(':::'))
        aux_labels.append(_aux_labels(metadata))

    # Dict insertion order fixes the column order of the resulting frame.
    columns = {
        'chunks': chunks,
        'begin': begin,
        'end': end,
        'code': codes,
        'all_codes': all_codes,
        'resolutions': resolutions,
        # Kept under its historical name; the values are the cosine distances.
        'all_distances': all_cosines,
    }

    if hcc:
        columns['billable'] = [_field(labels, '||', 0) for labels in aux_labels]
        columns['hcc_status'] = [_field(labels, '||', 1) for labels in aux_labels]
        columns['hcc_score'] = [_field(labels, '||', 2) for labels in aux_labels]
    elif aux_label:
        columns['gt'] = [_field(labels, '|', 0) for labels in aux_labels]
        columns['concept'] = [_field(labels, '|', 1) for labels in aux_labels]
        columns['aux'] = [_field(labels, '|', 2) for labels in aux_labels]

    return pd.DataFrame(columns)

## ICD10CM pipeline

In [None]:
# There is no NER stage: the assembler's output is named "ner_chunk" so the raw
# input text feeds the sentence embedder and the resolver directly.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# ICD-10-CM resolver; its codes land in the "icd10cm_code" column.
icd10_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_icd10cm_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("EUCLIDEAN")
)

icd_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, icd10_resolver]
)

# Light pipeline wrapper used by get_codes (it calls fullAnnotate on it).
icd_lp = nlp.LightPipeline(icd_pipelineModel)


sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icd10cm_augmented download started this may take some time.
[OK!]


In [None]:
text = 'bladder cancer'

%time get_codes (icd_lp, text, vocab='icd10cm_code')

CPU times: user 73.4 ms, sys: 8.51 ms, total: 81.9 ms
Wall time: 6.05 s


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,bladder cancer,0,13,C679,"[C679, Z126, D090, D494, C7911]","[bladder cancer, suspected bladder cancer, cancer in situ of urinary bladder, tumor of bladder neck, malignant tumour of bladder neck]","[0.0000, 0.0904, 0.0978, 0.1080, 0.1281]"


In [None]:
text = 'polydipsia'

%time get_codes (icd_lp, text, vocab='icd10cm_code')

CPU times: user 19.3 ms, sys: 3.26 ms, total: 22.6 ms
Wall time: 1.09 s


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,polydipsia,0,9,R631,"[R631, F6389, E232, F639, O40, G475, M7989, R632, R061, H538, I442, G479, G478, G471, Q308, I459, R002, R0681, G4754, R068, M353]","[polydipsia, psychogenic polydipsia, primary polydipsia, psychogenic polydipsia (disorder), polyhydramnios, parasomnia, polyalgia, polyphagia, biphasic stridor, oscillopsia, parasystole (disorder), dyssomnia, parasomnia (disorder), hypersomnia (disorder), polyrrhinia, palpitations - irregular, intermittent palpitations, vagal apnea, secondary parasomnia, vagal apnoea, polymyalgia]","[0.0000, 0.0455, 0.0635, 0.0786, 0.1474, 0.1520, 0.1553, 0.1573, 0.1641, 0.1675, 0.1648, 0.1600, 0.1662, 0.1667, 0.1727, 0.1702, 0.1687, 0.1672, 0.1730, 0.1700, 0.1729]"


## ICD10CM-HCC pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# ICD-10-CM resolver variant with billable/HCC information; output column "icd10cm_hcc_code".
hcc_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_icd10cm_augmented_billable_hcc", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("icd10cm_hcc_code")
    .setDistanceFunction("EUCLIDEAN")
)

hcc_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, hcc_resolver]
)

hcc_lp = nlp.LightPipeline(hcc_pipelineModel)

sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icd10cm_augmented_billable_hcc download started this may take some time.
[OK!]


In [None]:
text = 'bladder cancer'

%time get_codes (hcc_lp, text, vocab='icd10cm_hcc_code', hcc=True)

CPU times: user 25.8 ms, sys: 9.94 ms, total: 35.7 ms
Wall time: 1.03 s


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances,billable,hcc_status,hcc_score
0,bladder cancer,0,13,C679,"[C679, Z126, D090, D494, C7911]","[bladder cancer, suspected bladder cancer, cancer in situ of urinary bladder, tumor of bladder neck, malignant tumour of bladder neck]","[0.0000, 0.0904, 0.0978, 0.1080, 0.1281]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 1]","[11, 0, 0, 0, 8]"


In [None]:
text = 'polydipsia'

%time get_codes (hcc_lp, text, vocab='icd10cm_hcc_code', hcc=True)

CPU times: user 17.2 ms, sys: 2.98 ms, total: 20.2 ms
Wall time: 994 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances,billable,hcc_status,hcc_score
0,polydipsia,0,9,R631,"[R631, F6389, E232, F639, O40, G475, M7989, R632, R061, H538, I442, G479, G478, G471, Q308, I459, R002, R0681, G4754, R068, M353]","[polydipsia, psychogenic polydipsia, primary polydipsia, psychogenic polydipsia (disorder), polyhydramnios, parasomnia, polyalgia, polyphagia, biphasic stridor, oscillopsia, parasystole (disorder), dyssomnia, parasomnia (disorder), hypersomnia (disorder), polyrrhinia, palpitations - irregular, intermittent palpitations, vagal apnea, secondary parasomnia, vagal apnoea, polymyalgia]","[0.0000, 0.0455, 0.0635, 0.0786, 0.1474, 0.1520, 0.1553, 0.1573, 0.1641, 0.1675, 0.1648, 0.1600, 0.1662, 0.1667, 0.1727, 0.1702, 0.1687, 0.1672, 0.1730, 0.1700, 0.1729]","[1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1]","[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[0, nan, 23, nan, 0, 0, 0, 0, 0, 0, 96, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40]"


## RxNorm pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# RxNorm resolver; output column "rxnorm_code".
rxnorm_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_rxnorm_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("rxnorm_code")
    .setDistanceFunction("EUCLIDEAN")
)

rxnorm_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, rxnorm_resolver]
)

rxnorm_lp = nlp.LightPipeline(rxnorm_pipelineModel)


sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_rxnorm_augmented download started this may take some time.
[OK!]


In [None]:
text = 'metformin 100 mg'

%time get_codes (rxnorm_lp, text, vocab='rxnorm_code')

CPU times: user 17.6 ms, sys: 6.55 ms, total: 24.1 ms
Wall time: 1.09 s


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,metformin 100 mg,0,15,2200518,"[2200518, 861024, 141916, 334738, 332848, 104494, 199955, 333262, 439563, 429178, 103910, 429408, 450523, 1744000, 401938, 402346, 1726496, 485246, 198758, 316350, 213092]","[metformin hydrochloride 100 MG/ML Extended Release Suspension, metformin hydrochloride 100 MG/ML, fenofibrate 100 MG Oral Capsule, fenofibrate 100 MG, ciprofibrate 100 MG, ciprofibrate 100 MG Oral Tablet, rutin 100 MG Oral Tablet, rutin 100 MG, fendiline 100 MG, fendiline 100 MG Oral Tablet, carbamazepine 100 MG Oral Tablet [Epimaz], perazine 100 MG Oral Tablet, perazine 100 MG, emtricitabine 100 MG, miglustat 100 MG Oral Capsule, miglustat 100 MG, azacitidine 100 MG, azacitidine 100 MG Injection, niacin 100 MG Oral Capsule, niacin 100 MG, dolasetron 100 MG Oral Tablet [Anzemet]]","[0.0658, 0.0658, 0.0684, 0.0684, 0.0697, 0.0697, 0.0765, 0.0765, 0.0775, 0.0775, 0.0780, 0.0804, 0.0804, 0.0818, 0.0811, 0.0811, 0.0812, 0.0812, 0.0811, 0.0811, 0.0836]"


In [None]:
text = 'Coumadin 5 mg'

%time get_codes (rxnorm_lp, text, vocab='rxnorm_code')

CPU times: user 19.1 ms, sys: 7.57 ms, total: 26.7 ms
Wall time: 975 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,Coumadin 5 mg,0,12,855333,"[855333, 432467, 438740, 153692, 352120, 1036890, 104363, 201269, 351399, 668467, 668694, 200795, 153618, 108781, 2475858, 2475857, 905173, 429252, 450946, 1091499, 246155, 439105, 1191685, 206956, 207641]","[warfarin sodium 5 MG [Coumadin], coumarin 5 MG Oral Tablet, coumarin 5 MG, oxybutynin chloride 5 MG Oral Tablet [Urimin], nitisinone 5 MG Oral Capsule [Orfadin], protriptyline hydrochloride 5 MG [Concordin], terazosin 5 MG Oral Tablet [Hytrin], metolazone 5 MG Oral Tablet [Metenix], prednisone 5 MG Oral Tablet [Prednicot], loratadine 5 MG [Claritin], bisacodyl 5 MG [Veracolate], bendroflumethiazide 5 MG Oral Tablet [Urizid], oxybutynin chloride 5 MG Oral Tablet [Contimin], nifedipine 5 MG Oral Capsule [Calanif], vericiguat 5 MG Oral Tablet, vericiguat 5 MG, protriptyline hydrochloride 5 MG [Vivactil], pridinol 5 MG Oral Tablet, pridinol 5 MG, methylphenidate hydrochloride 5 MG [Ritalin], propinox 5 MG Oral Tablet, propinox 5 MG, prednisolone 5 MG [Prednistab], prednisone 5 MG Oral Tablet [Prednicen-M], prochlorperazine 5 MG Rectal Suppository [Compazine]]","[0.0000, 0.0287, 0.0287, 0.0479, 0.0518, 0.0525, 0.0578, 0.0585, 0.0622, 0.0643, 0.0636, 0.0634, 0.0647, 0.0630, 0.0647, 0.0647, 0.0662, 0.0666, 0.0666, 0.0670, 0.0684, 0.0684, 0.0692, 0.0703, 0.0698]"


## NDC pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# NDC resolver; output column "ndc_code".
ndc_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_ndc", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("ndc_code")
    .setDistanceFunction("EUCLIDEAN")
)

ndc_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, ndc_resolver]
)

ndc_lp = nlp.LightPipeline(ndc_pipelineModel)

sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_ndc download started this may take some time.
[OK!]


In [None]:
text = 'metformin 500 mg'

%time get_codes (ndc_lp, text, vocab='ndc_code')

CPU times: user 18 ms, sys: 4.57 ms, total: 22.5 ms
Wall time: 897 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,metformin 500 mg,0,15,62207-0491,"[62207-0491, 50742-0633, 49483-0622, 62207-0416, 67296-0712, 70010-0063, 75834-0500, 61919-0769, 71800-0008, 70010-0491, 79739-7076, 70882-0124, 42571-0360, 42806-0405, 0069-0136, 49483-0382, 67253-0660, 34645-2105, 59630-0574]","[metformin er 500 mg, metformin 500 mg/1, metformin hcl 500 mg/1, metformin hcl 500 mg/kg, metformin hydrochloride 500 mg/1, metformin hydrochloride 500 mg/500mg, metformin hydrochloride er 500 mg/1, metformin hydrochloride e/r 500 mg/1, metformin hydrochloride 500 mg/5ml, metformin 500 mg/1 tablet, nabumetone 500 mg, metformin hcl 500 mg/1 tablet, metformin hydrochloride oral solution 500 mg/5ml, metformin hydrochloride tablets 500 mg/1, bosulif 500 mg/1, aspirin 500 mg/5001, pyrazinamide 500 mg/1, glycerin 500 mg/ml, fortamet 500 mg/1]","[0.0183, 0.0191, 0.0355, 0.0440, 0.0667, 0.0677, 0.0716, 0.0766, 0.0795, 0.1000, 0.1024, 0.1023, 0.1046, 0.1083, 0.1117, 0.1116, 0.1133, 0.1145, 0.1162]"


In [None]:
text = 'aspirin 81 mg'

%time get_codes (ndc_lp, text, vocab='ndc_code')

CPU times: user 16.3 ms, sys: 6.4 ms, total: 22.7 ms
Wall time: 859 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,aspirin 81 mg,0,12,41250-0780,"[41250-0780, 72036-0080, 17714-0009, 49483-0349, 73089-0081, 46122-0598, 43063-0862, 69168-0318, 68210-0015, 49035-0467, 51824-0055, 68210-0150, 55910-0274, 67651-0361, 41520-0274, 36800-0467, 69168-0288, 52124-0012, 54257-0274, 70692-0126, 49638-0101, 55910-0467]","[aspirin 81 mg, aspirin 81mg, aspirin 81 mg/1, aspirin 81 mg/811, aspirin 81 mg/81mg, aspirin 81 81 mg/1, aspirin ec 81 mg/1, aspirin 81 mg tablet, assured aspirin 81 mg/1, equate aspirin 81 mg/1, careall aspirin 81 mg/1, valumeds aspirin 81 mg/1, rexall aspirin 81 mg/1, aspirin 81 mg/1 capsule, careone aspirin 81 mg/1, topcare aspirin 81 mg/1, aspirin chewable 81 mg/1, chewable aspirin 81 mg/1, enteric coated aspirin 81 mg/1, aspirin 81mg tablet, aspica (aspirin) 81 mg/1, dg health aspirin 81 mg/1]","[0.0000, 0.0152, 0.0181, 0.0253, 0.0317, 0.0323, 0.0338, 0.0353, 0.0389, 0.0388, 0.0411, 0.0416, 0.0422, 0.0503, 0.0546, 0.0547, 0.0574, 0.0631, 0.0644, 0.0643, 0.0672, 0.0677]"


## CPT pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# CPT (procedures and measurements) resolver; output column "cpt_code".
cpt_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_cpt_procedures_measurements_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("cpt_code")
    .setDistanceFunction("EUCLIDEAN")
)

cpt_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, cpt_resolver]
)

cpt_lp = nlp.LightPipeline(cpt_pipelineModel)


sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_cpt_procedures_measurements_augmented download started this may take some time.
[OK!]


In [None]:
text = 'heart surgery'

%time get_codes (cpt_lp, text, vocab='cpt_code')

CPU times: user 15.5 ms, sys: 3.43 ms, total: 19 ms
Wall time: 323 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,heart surgery,0,12,33999,"[33999, 33258, 1018145, 0304T, 0167T, 93583, 1006057, 33736, 1013106, 33944, 33310, 33548, 33945, 62319, 33430]","[heart surgery [Unlisted procedure, cardiac surgery], Cardiac surgery procedure [Operative tissue ablation and reconstruction of atria, performed at the time of other cardiac procedure(s), extensive (eg, maze procedure), without cardiopulmonary bypass (List separately in addition to code for primary procedure)], Other Cardiac Surgery Procedures [Other Cardiac Surgery Procedures], Heart procedure [Insertion or removal and replacement of intracardiac ischemia monitoring system including imaging supervision and interpretation when performed and intra-operative interrogation and programming when performed; device only], Heart implantation [Transmyocardial transcatheter closure of ventricular septal defect, with implant; with cardiopulmonary bypass], Operation on cardiac septum [Percutaneous transcatheter septal reduction therapy (eg, alcohol septal ablation) including temporary pacemaker insertion when performed], Surgical Procedures on the Heart and Pericardium [Surgical Procedures on the Heart and Pericardium], Open heart atrial septostomy with cardiopulmonary bypass [Atrial septectomy or septostomy; open heart with cardiopulmonary bypass], Repair Procedures of Structural Heart Defect [Repair Procedures of Structural Heart Defect], Preparation of donor heart for transplantation [Backbench standard preparation of cadaver donor heart allograft prior to transplantation, including dissection of allograft from surrounding soft tissues to prepare aorta, superior vena cava, inferior vena cava, pulmonary artery, and left atrium for im...], Exploratory cardiotomy [Cardiotomy, exploratory (includes removal of foreign body, atrial or ventricular thrombus); without bypass], Surgical ventricular restoration procedure [Surgical ventricular restoration procedure, includes prosthetic patch, when performed (eg, ventricular remodeling, SVR, 
SAVER, Dor procedures)], Heart transplant [Heart transplant, with or without recipient cardiectomy], Insertion of catheter into heart chamber [Injection(s), including indwelling catheter placement, continuous infusion or intermittent bolus, of diagnostic or therapeutic substance(s) (including anesthetic, antispasmodic, opioid, steroid, other solution), not including neurolytic substances, includ], Replacement of mitral valve with cardiopulmonary bypass [Replacement, mitral valve, with cardiopulmonary bypass]]","[0.0000, 0.1031, 0.1078, 0.1377, 0.1574, 0.1645, 0.1703, 0.1780, 0.1779, 0.1784, 0.1891, 0.1871, 0.1890, 0.1904, 0.1932]"


In [None]:
text = 'ct abdomen'

%time get_codes (cpt_lp, text, vocab='cpt_code')

CPU times: user 13.3 ms, sys: 0 ns, total: 13.3 ms
Wall time: 311 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,ct abdomen,0,9,74150,"[74150, 35182, 70481, 62284, 74183, 64760, 76497, 61596, 70486, 76700, 31587, 35092, 0157T, 74175, 70490, 74160, 1002900, 71250, 49436]","[CT scan abdomen [Computed tomography, abdomen; without contrast material], RPR CONGENITAL AV FISTULA THORAX & ABDOMEN [Repair, congenital arteriovenous fistula; thorax and abdomen], CT of orbits [Computed tomography, orbit, sella, or posterior fossa or outer, middle, or inner ear; with contrast material(s)], CT of spine [Injection procedure for myelography and/or computed tomography, lumbar], MRI of abdomen [Magnetic resonance (eg, proton) imaging, abdomen; without contrast material(s), followed by with contrast material(s) and further sequences], Incision or removal of nerve to abdomen [Transection or avulsion of; vagus nerve (vagotomy), abdominal], CT scan [Unlisted computed tomography procedure (eg, diagnostic, interventional)], TRANSCOCHLR POST CRNL FOSSA W/WO MOBIL NRV/ART [Transcochlear approach to posterior cranial fossa, jugular foramen or midline skull base, including labyrinthectomy, decompression, with or without mobilization of facial nerve and/or petrous carotid artery], CT scan of face [Computed tomography, maxillofacial area; without contrast material], US abdominal scan [Ultrasound, abdominal, real time with image documentation; complete], Cricoid split procedure [Laryngoplasty, cricoid split, without graft placement], Procedure on artery of abdomen (procedure) [Direct repair of aneurysm, pseudoaneurysm, or excision (partial or total) and graft insertion, with or without patch graft; for ruptured aneurysm, abdominal aorta involving visceral vessels (mesenteric, celiac, renal)], Opening of abdomen [Laparotomy, implantation or replacement of gastric stimulation electrodes, lesser curvature (ie, morbid obesity)], CT of abdominal vascular structures (procedure) [Computed tomographic angiography, abdomen, with contrast material(s), including noncontrast images, if performed, and image 
postprocessing], CT scan of neck [Computed tomography, soft tissue neck; without contrast material], CT of abdomen with contrast [Computed tomography, abdomen; with contrast material(s)], Anesthesia for Procedures on the Upper Abdomen [Anesthesia for Procedures on the Upper Abdomen], Diagnostic CT scan of chest [Computed tomography, thorax, diagnostic; without contrast material], Creation of exit site for catheter in abdominal cavity [Delayed creation of exit site from embedded subcutaneous segment of intraperitoneal cannula or catheter]]","[0.0756, 0.0985, 0.1099, 0.1268, 0.1351, 0.1422, 0.1526, 0.1571, 0.1570, 0.1534, 0.1619, 0.1549, 0.1559, 0.1614, 0.1638, 0.1645, 0.1591, 0.1694, 0.1672]"


In [None]:
text = 'Left heart cath'

%time get_codes (cpt_lp, text, vocab='cpt_code')

CPU times: user 12.1 ms, sys: 1.31 ms, total: 13.4 ms
Wall time: 307 ms


Unnamed: 0,chunks,begin,end,code,all_codes,resolutions,all_distances
0,Left heart cath,0,14,93462,"[93462, 33415, 93607, 33988, 33989, 93620, 93514, 36014, 93452, 0281T, 33975, 93531]","[Cardiac catheterization, left heart [Left heart catheterization by transseptal puncture through intact septum or by transapical puncture (List separately in addition to code for primary procedure)], Incision or removal of tissue below left lower heart valve [Resection or incision of subvalvular tissue for discrete subvalvular aortic stenosis], Left ventricular recording (Deprecated) [Left ventricular recording (Deprecated)], Insertion of left heart vent through chest for blood oxygenation rewarming and return [Insertion of left heart vent by thoracic incision (eg, sternotomy, thoracotomy) for ECMO/ECLS], RMVL LEFT HEART VENT BY THORACIC INCIS ECMO/ECLS [Removal of left heart vent by thoracic incision (eg, sternotomy, thoracotomy) for ECMO/ECLS], Left ventricular recording [Comprehensive electrophysiologic evaluation including insertion and repositioning of multiple electrode catheters with induction or attempted induction of arrhythmia; with right atrial pacing and recording, right ventricular pacing and recording, His bund], Left heart catheterization by left ventricular puncture (Deprecated) [Left heart catheterization by left ventricular puncture (Deprecated)], Insertion of catheter into left pulmonary artery [Selective catheter placement, left or right pulmonary artery], Insertion of catheter into left heart including imaging interpretation and supervision and injection [Left heart catheterization including intraprocedural injection(s) for left ventriculography, imaging supervision and interpretation, when performed], Plication of left atrial appendage [Percutaneous transcatheter closure of the left atrial appendage with implant, including fluoroscopy, transseptal puncture, catheter placement(s), left atrial angiography, left atrial appendage angiography, radiological supervision and interpretation], Insertion of external lower heart chamber 
blood flow assist device [Insertion of ventricular assist device; extracorporeal, single ventricle], Cardiac catheterization, combined right and left heart [Combined right heart catheterization and retrograde left heart catheterization, for congenital cardiac anomalies]]","[0.1060, 0.1104, 0.1272, 0.1367, 0.1424, 0.1433, 0.1611, 0.1639, 0.1683, 0.1678, 0.1731, 0.1687]"


## SNOMED pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# SNOMED findings resolver (aux-concepts model); output column "snomed_code".
# The cells that query this pipeline call get_codes with aux_label=True.
snomed_ct_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_snomed_findings_aux_concepts", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("snomed_code")
    .setDistanceFunction("EUCLIDEAN")
)

snomed_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, snomed_ct_resolver]
)

snomed_lp = nlp.LightPipeline(snomed_pipelineModel)


In [None]:
text = 'bladder cancer'

%time get_codes (snomed_lp, text, vocab='snomed_code', aux_label=True)

In [None]:
text = 'schizophrenia'

%time get_codes (snomed_lp, text, vocab='snomed_code', aux_label=True)

## LOINC Pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# LOINC resolver; output column "loinc_code".
loinc_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_loinc_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("loinc_code")
    .setDistanceFunction("EUCLIDEAN")
)

loinc_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, loinc_resolver]
)

loinc_lp = nlp.LightPipeline(loinc_pipelineModel)

In [None]:
text = 'FLT3 gene mutation analysis'

%time get_codes (loinc_lp, text, vocab='loinc_code')

In [None]:
text = 'Hematocrit'

%time get_codes (loinc_lp, text, vocab='loinc_code')

In [None]:
text = 'vomiting'

%time get_codes (loinc_lp, text, vocab='loinc_code')

## HCPCS Pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# HCPCS resolver; output column "hcpcs_code".
hcpcs_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_hcpcs", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("hcpcs_code")
    .setDistanceFunction("EUCLIDEAN")
)

hcpcs_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, hcpcs_resolver]
)

hcpcs_lp = nlp.LightPipeline(hcpcs_pipelineModel)

In [None]:
text = "Breast prosthesis, mastectomy bra, with integrated breast prosthesis form, unilateral, any size, any type"

%time get_codes (hcpcs_lp, text, vocab='hcpcs_code')

In [None]:
text= "Dietary consultation for carbohydrate counting for type I diabetes."
%time get_codes (hcpcs_lp, text, vocab='hcpcs_code')

In [None]:
text= "Psychiatric consultation for alcohol withdrawal and dependance."
%time get_codes (hcpcs_lp, text, vocab='hcpcs_code')

## UMLS Pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# UMLS (major concepts) resolver; output column "umls_code".
umls_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_umls_major_concepts", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("umls_code")
    .setDistanceFunction("EUCLIDEAN")
)

umls_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, umls_resolver]
)

umls_lp = nlp.LightPipeline(umls_pipelineModel)

In [None]:
# medical device
text = 'X-Ray'

%time get_codes (umls_lp, text, vocab='umls_code')

In [None]:
# Injuries & poisoning
text = 'out-of-date food poisoning'

%time get_codes (umls_lp, text, vocab='umls_code')

In [None]:
# clinical findings
text = 'type two diabetes mellitus'

%time get_codes (umls_lp, text, vocab='umls_code')

## MeSH pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# MeSH resolver; output column "mesh_code".
mesh_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_mesh", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("mesh_code")
    .setDistanceFunction("EUCLIDEAN")
)

mesh_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, mesh_resolver]
)

mesh_lp = nlp.LightPipeline(mesh_pipelineModel)


In [None]:
text = 'chest pain '

%time get_codes (mesh_lp, text, vocab='mesh_code')

In [None]:
text = 'pericardectomy'

%time get_codes (mesh_lp, text, vocab='mesh_code')

## HPO Pipeline

In [None]:
# The raw text is exposed as "ner_chunk" so it can be embedded and resolved as-is.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# HPO resolver.
# NOTE(review): the HPO codes are written to a column called "umls_code", and
# the get_codes calls for this pipeline pass vocab='umls_code'; if the column
# is ever renamed (e.g. to "hpo_code") both places must change together.
hpo_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_HPO", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("umls_code")
    .setDistanceFunction("EUCLIDEAN")
)

hpo_pipelineModel = nlp.PipelineModel(
    stages=[documentAssembler, sbert_embedder, hpo_resolver]
)

hpo_lp = nlp.LightPipeline(hpo_pipelineModel)

In [None]:
text = 'bladder cancer'

%time get_codes (hpo_lp, text, vocab='umls_code')

In [None]:
text = 'bipolar disorder'

%time get_codes (hpo_lp, text, vocab='umls_code')

In [None]:
text = 'schizophrenia '

%time get_codes (hpo_lp, text, vocab='umls_code')

## All the resolvers in the same pipeline 
### (just to show how it is done; this pipeline will not be used in this notebook)

In [None]:

def _pretrained_resolver(model_name, output_col):
    """Load a pretrained sentence entity resolver wired like every resolver in this cell.

    Args:
        model_name: name of the pretrained resolver model in "clinical/models".
        output_col: name of the column the resolver writes its codes to.

    Returns:
        A SentenceEntityResolverModel that reads "ner_chunk" and
        "sbert_embeddings" and uses the Euclidean distance function.
    """
    return (
        medical.SentenceEntityResolverModel
        .pretrained(model_name, "en", "clinical/models")
        .setInputCols(["ner_chunk", "sbert_embeddings"])
        .setOutputCol(output_col)
        .setDistanceFunction("EUCLIDEAN")
    )


# The assembler writes straight to "ner_chunk", so the embedder and all
# resolvers below operate on the input text as a whole.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("ner_chunk")
)

# One shared embedding stage feeds every resolver.
sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained('sbiobert_base_cased_mli', 'en', 'clinical/models')
    .setInputCols(["ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

snomed_ct_resolver = _pretrained_resolver("sbiobertresolve_snomed_findings", "snomed_code")
icd10_resolver = _pretrained_resolver("sbiobertresolve_icd10cm_augmented", "icd10cm_code")
rxnorm_resolver = _pretrained_resolver("sbiobertresolve_rxnorm_augmented", "rxnorm_code")
cpt_resolver = _pretrained_resolver("sbiobertresolve_cpt_procedures_augmented", "cpt_code")
hcc_resolver = _pretrained_resolver("sbiobertresolve_icd10cm_augmented_billable_hcc", "icd10cm_hcc_code")
loinc_resolver = _pretrained_resolver("sbiobertresolve_loinc_augmented", "loinc_code")
hcpcs_resolver = _pretrained_resolver("sbiobertresolve_hcpcs", "hcpcs_code")
umls_resolver = _pretrained_resolver("sbiobertresolve_umls_major_concepts", "umls_code")

# Every stage in one pipeline needs its own output column. The HPO resolver
# previously also wrote to "umls_code", sharing the column with umls_resolver,
# so the two results could not both be kept.
hpo_resolver = _pretrained_resolver("sbiobertresolve_HPO", "hpo_code")

mesh_resolver = _pretrained_resolver("sbiobertresolve_mesh", "mesh_code")
ndc_resolver = _pretrained_resolver("sbiobertresolve_ndc", "ndc_code")


# The stages are handed straight to PipelineModel; no fit call is made here.
resolver_pipelineModel = nlp.PipelineModel(
    stages = [
        documentAssembler,
        sbert_embedder,
        icd10_resolver,
        hcc_resolver,
        rxnorm_resolver,
        ndc_resolver,
        cpt_resolver,
        snomed_ct_resolver,
        loinc_resolver,
        hcpcs_resolver,
        umls_resolver,
        mesh_resolver,
        hpo_resolver])

resolver_lp = nlp.LightPipeline(resolver_pipelineModel)


## Getting some predictions from resolvers

In [None]:
# Example chunks passed to the ICD-10-CM light pipeline in the cells below.
icd_chunks = [
    'obesity',
    'a respiratory tract infection',
    'basal cell carcinoma of skin',
    'acute maxillary sinusitis',
    'chronic kidney disease stage',
    'diabetes mellitus type 2',
    'lymph nodes of multiple sites',
    'other chronic pain',
    'severe abdominal pain',
    'squamous cell carcinoma of skin',
    'type 2 diabetes mellitus',
]

# Example chunks passed to the SNOMED light pipeline in the cells below.
snomed_chunks = [
    'down syndrome',
    'adenocarcinoma',
    'aortic valve stenosis',
    'atherosclerosis',
    'atrial fibrillation',
    'hypertension',
    'lung cancer',
    'seizure',
    'squamous cell carcinoma',
    'stage IIIB',
    'mediastinal lymph nodes',
]

In [None]:
from IPython.display import display

# Print each ICD example chunk, then render the get_codes result for it.
# NOTE(review): icd_lp is not defined in this part of the notebook; it is expected to come from an earlier cell.
for chunk in icd_chunks:

    print ('>> ',chunk)
    display(get_codes (icd_lp, chunk, vocab='icd10cm_code'))

In [None]:
# Print each SNOMED example chunk, then render the get_codes result for it (with aux_label=True).
# NOTE(review): snomed_lp is not defined in this part of the notebook; it is expected to come from an earlier cell.
for chunk in snomed_chunks:

    print ('>> ',chunk)
    display(get_codes (snomed_lp, chunk, vocab='snomed_code', aux_label=True))

In [None]:
# Example chunks that are resolved with two different resolvers below.
clinical_chunks = ['bladder cancer',
 'anemia in chronic kidney disease',
 'castleman disease',
 'congestive heart failure',
 'diabetes mellitus type 2',
 'lymph nodes of multiple sites',
 'malignant melanoma of skin',
 'malignant neoplasm of lower lobe, bronchus',
 'metastatic lung cancer',
 'secondary malignant neoplasm of bone',
 'type 2 diabetes mellitus',
 'type 2 diabetes mellitus/insulin',
 'unsp malignant neoplasm of lymph node']


# For every chunk, show the two get_codes results one after the other.
# NOTE(review): hcc_lp and snomed_lp are not defined in this part of the notebook; they are expected to come from earlier cells.
for chunk in clinical_chunks:

    print ('>> ',chunk)
    
    # The printed label says 'icd10cm_code', but the codes are read from the
    # 'icd10cm_hcc_code' output of hcc_lp, with hcc=True.
    print ('icd10cm_code')
    display(get_codes (hcc_lp, chunk, vocab='icd10cm_hcc_code', hcc=True))
    
    print ('snomed_code')
    display(get_codes (snomed_lp, chunk, vocab='snomed_code', aux_label=True))

##  How to integrate resolvers with NER models in the same pipeline


In [None]:
# Raw text -> document -> sentences -> tokens.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    nlp.SentenceDetectorDLModel.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Word embeddings consumed by the NER model.
word_embeddings = (
    nlp.WordEmbeddingsModel
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

clinical_ner = (
    medical.NerModel
    .pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Only chunks labelled PROBLEM are kept and passed on to the resolver.
ner_converter = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['PROBLEM'])
)

# Chunk2Doc output ("ner_chunk_doc") is what the sentence-embedding stage reads.
c2doc = (
    nlp.Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

sbert_embedder = (
    nlp.BertSentenceEmbeddings
    .pretrained("sbiobert_base_cased_mli", 'en', 'clinical/models')
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
)

# The resolver reads the original chunks plus the embeddings computed from them.
icd10_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_icd10cm_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("EUCLIDEAN")
)

sbert_resolver_pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        c2doc,
        sbert_embedder,
        icd10_resolver,
    ]
)

# Fit on a one-row frame holding an empty string.
# NOTE(review): presumably this only turns the Pipeline into a PipelineModel, since no stage here is given training data - confirm.
data_ner = spark.createDataFrame([[""]]).toDF("text")

sbert_models = sbert_resolver_pipeline.fit(data_ner)

In [None]:
# Sample clinical note used by the cells below: two paragraphs joined by one
# newline. Adjacent string literals inside parentheses keep each paragraph on
# its own source line, because a single-quoted literal cannot contain a raw
# line break; the newline is written explicitly as "\n" instead.
clinical_note = (
    'A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, associated with an acute hepatitis, and obesity with a body mass index (BMI) of 33.5 kg/m2, presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. Two weeks prior to presentation, she was treated with a five-day course of amoxicillin for a respiratory tract infection. She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months at the time of presentation. Physical examination on presentation was significant for dry oral mucosa; significantly, her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent laboratory findings on admission were: serum glucose 111 mg/dl, bicarbonate 18 mmol/l, anion gap 20, creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, glycated hemoglobin (HbA1c) 10%, and venous pH 7.27. Serum lipase was normal at 43 U/L. Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia. The patient was initially admitted for starvation ketosis, as she reported poor oral intake for three days prior to admission. However, serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL, the anion gap was still elevated at 21, serum bicarbonate was 16 mmol/L, triglyceride level peaked at 2050 mg/dL, and lipase was 52 U/L. The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again. \n'
    'The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL, within 24 hours. Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use. The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely. She had close follow-up with endocrinology post discharge.'
)

print (clinical_note)

# One-row Spark DataFrame with the note in the "text" column, which is the
# column the document assembler reads.
clinical_note_df = spark.createDataFrame([[clinical_note]]).toDF("text")


In [None]:
# Run the fitted NER + ICD-10-CM resolver pipeline on the clinical note DataFrame.
icd10_result = sbert_models.transform(clinical_note_df)

In [None]:
import pyspark.sql.functions as F

pd.set_option('display.max_colwidth', 0)

def get_icd_codes(icd10_res):
    """Flatten NER chunks and their ICD-10-CM resolutions into a pandas DataFrame.

    Args:
        icd10_res: Spark DataFrame with ``ner_chunk`` and ``icd10cm_code``
            annotation columns (each exposing ``result`` and ``metadata``).

    Returns:
        pandas DataFrame with one row per zipped chunk/code pair and the
        columns ``sent_id``, ``ner_chunk``, ``entity``, ``icd10_code``,
        ``all_codes`` and ``resolutions``. The last two are lists obtained
        by splitting the resolver metadata strings on ``':::'``.
    """
    # Zip chunk and code annotations position-wise, then explode to one row per pair.
    zipped = F.arrays_zip(
        icd10_res.ner_chunk.result,
        icd10_res.ner_chunk.metadata,
        icd10_res.icd10cm_code.result,
        icd10_res.icd10cm_code.metadata,
    )
    exploded = icd10_res.select(F.explode(zipped).alias("cols"))

    # Fields of the zipped struct are addressed by position: '0'..'3'.
    icd10_df = exploded.select(
        F.expr("cols['1']['sentence']").alias("sent_id"),
        F.expr("cols['0']").alias("ner_chunk"),
        F.expr("cols['1']['entity']").alias("entity"),
        F.expr("cols['2']").alias("icd10_code"),
        F.expr("cols['3']['all_k_results']").alias("all_codes"),
        F.expr("cols['3']['all_k_resolutions']").alias("resolutions"),
    ).toPandas()

    # The metadata packs the candidates into one ':::'-separated string.
    split_codes = [joined.split(':::') for joined in icd10_df['all_codes']]
    split_resolutions = [joined.split(':::') for joined in icd10_df['resolutions']]

    icd10_df['all_codes'] = split_codes
    icd10_df['resolutions'] = split_resolutions

    return icd10_df

In [None]:
%%time

# Collect the BertSentenceEmbeddings-based results into pandas (timed by the cell magic above).
res_pd = get_icd_codes(icd10_result)

In [None]:
# Preview up to the first 15 resolved chunks.
res_pd.head(15)

Let's apply some HTML formatting using the `sparknlp_display` library to see the results of the pipeline in a nicer layout:

In [None]:
# Wrap the fitted pipeline in a LightPipeline so the note can be annotated as a plain string.
light_model = nlp.LightPipeline(sbert_models)

vis = viz.EntityResolverVisualizer()

# Change the display color of the PROBLEM entity label.
vis.set_label_colors({'PROBLEM':'#008080'})

light_data_icd = light_model.fullAnnotate(clinical_note)

# Render the first (only) result, pairing each 'ner_chunk' with its 'icd10cm_code'.
vis.display(light_data_icd[0], 'ner_chunk', 'icd10cm_code')


## BertSentenceChunkEmbeddings

This annotator lets users aggregate sentence embeddings and NER chunk embeddings to get more specific and accurate resolution codes. It works by averaging the context (sentence) and chunk embeddings, so that each chunk embedding carries contextual information. The input to this annotator is the context (sentence) and the NER chunks, while the output is an embedding for each chunk that can be fed to the resolver model. The `setChunkWeight` parameter can be used to control the influence of the surrounding context. 

For more information and examples of `BertSentenceChunkEmbeddings` annotator, you can check here: 
[24.1.Improved_Entity_Resolution_with_SentenceChunkEmbeddings.ipynb](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/24.1.Improved_Entity_Resolution_with_SentenceChunkEmbeddings.ipynb)

### ICD10CM with BertSentenceChunkEmbeddings

Let's do the same process using the `BertSentenceChunkEmbeddings` annotator and compare the results. We will create a new pipeline that uses this annotator together with `SentenceEntityResolverModel`.

In [None]:
# Sentence-chunk BERT embeddings: this stage reads both the sentence and the
# NER chunk and writes to the same "sbert_embeddings" column the resolver expects.
sentence_chunk_embeddings = (
    medical.BertSentenceChunkEmbeddings
    .pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["sentence", "ner_chunk"])
    .setOutputCol("sbert_embeddings")
    .setCaseSensitive(False)
    .setChunkWeight(0.5)  # default : 0.5
)

icd10_resolver = (
    medical.SentenceEntityResolverModel
    .pretrained("sbiobertresolve_icd10cm_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Same front end as the previous pipeline (assembler through NER converter);
# only the embedding stage differs, and no Chunk2Doc stage is included.
resolver_pipeline_SCE = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        sentence_chunk_embeddings,
        icd10_resolver,
    ]
)

# Fit on a one-row frame holding an empty string.
empty_data = spark.createDataFrame([['']]).toDF("text")
model_SCE = resolver_pipeline_SCE.fit(empty_data)

In [None]:
# Run the sentence-chunk-embedding pipeline on the same clinical note DataFrame.
icd10_result_SCE = model_SCE.transform(clinical_note_df)

In [None]:
%%time

# Collect the BertSentenceChunkEmbeddings-based results into pandas (timed by the cell magic above).
res_SCE_pd = get_icd_codes(icd10_result_SCE)

In [None]:
# Preview up to the first 15 resolved chunks from the sentence-chunk-embedding run.
res_SCE_pd.head(15)

In [None]:
# Wrap the sentence-chunk-embedding model in a LightPipeline so the note can be annotated as a plain string.
icd10_SCE_lp = nlp.LightPipeline(model_SCE)

light_result = icd10_SCE_lp.fullAnnotate(clinical_note)

visualiser = viz.EntityResolverVisualizer()

# Change the display color of the PROBLEM entity label.
visualiser.set_label_colors({'PROBLEM':'#008080'})

# Render the first (only) result, pairing each 'ner_chunk' with its 'icd10cm_code'.
visualiser.display(light_result[0], 'ner_chunk', 'icd10cm_code')

**Let's compare the results that we got from these two methods.**

In [None]:
# Sentence text keyed by sentence id, taken from the first
# (BertSentenceEmbeddings) run. Fields of the zipped struct are addressed by
# position: '0' is the sentence metadata, '1' the sentence text.
sentence_df = (
    icd10_result
    .select(F.explode(F.arrays_zip(icd10_result.sentence.metadata,
                                   icd10_result.sentence.result)).alias("cols"))
    .select(F.expr("cols['0']['sentence']").alias("sent_id"),
            F.expr("cols['1']").alias("sentence_all"))
    .toPandas()
)

# Pair the two runs chunk by chunk. Columns that exist in both frames keep
# their name for the first run and get an "_SCE" suffix for the
# sentence-chunk-embedding run; letting merge add the suffix avoids a
# positional rename that would mislabel columns if their order ever changed.
comparison_df = pd.merge(
    res_pd.loc[:, 'sent_id':'resolutions'],
    res_SCE_pd.loc[:, 'sent_id':'resolutions'],
    on=['sent_id', 'ner_chunk', 'entity'],
    how='inner',
    suffixes=('', '_SCE'),
)

# Attach the full sentence text, then drop the join key from the display.
comparison_df = pd.merge(sentence_df, comparison_df, on="sent_id").drop('sent_id', axis=1)
comparison_df.head(15)