<img src="https://nlp.johnsnowlabs.com/assets/images/logo.png" width="180" height="50" style="float: left;">

# COLAB ENVIRONMENT SETUP

In [1]:
# Make sure you have aws credentials, license and secret code for the desired version in your GDrive
# Specify the base path in drive in `colab_path`, their names, and the desired version
mount_path = '/content/gdrive'  # directory where Google Drive is mounted by drive.mount()
colab_path = 'My Drive/Colab Notebooks'  # folder (relative to mount_path) holding the three files below
aws_credentials_filename = 'credentials'  # copied to /root/.aws/credentials
license_filename = 'license'  # its first line is exported as JSL_NLP_LICENSE
secret_filename = 'secret246'  # its first line is the secret used in the private pip index and jar URLs
version = "2.4.6"  # spark-nlp-jsl release installed with pip

# Fundamental Import and installation of Java
import os, shutil
from google.colab import drive
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Licensed Environment Setup
def setup_license_from_gdrive(mount_path, colab_path, aws_credentials_filename, license_filename,
                              secret_filename=None):
    """Mount Google Drive and configure the licensed environment from files stored there.

    Steps performed:
      1. Mounts Google Drive at ``mount_path`` (forcing a remount).
      2. Copies the AWS credentials file to ``/root/.aws/credentials``.
      3. Exports the first line of the license file as ``JSL_NLP_LICENSE``.
      4. Reads the first line of the secret file and returns it.

    Args:
        mount_path: Directory where Google Drive is mounted.
        colab_path: Folder, relative to ``mount_path``, that holds the three files.
        aws_credentials_filename: Name of the AWS credentials file in that folder.
        license_filename: Name of the license file in that folder.
        secret_filename: Name of the secret file in that folder. When omitted, the
            notebook-level ``secret_filename`` variable is used, which is what this
            function previously read implicitly.

    Returns:
        The secret string (first line of the secret file, surrounding whitespace removed).

    Raises:
        KeyError: If ``secret_filename`` is omitted and no global of that name exists.
        OSError: If any of the files cannot be read or copied.
    """
    if secret_filename is None:
        # Backward compatibility: older call sites pass only four arguments and rely on
        # the notebook global. Resolve it up front so a missing name fails before mounting.
        secret_filename = globals()["secret_filename"]

    drive.mount(mount_path, force_remount=True)
    base_dir = os.path.join(mount_path, colab_path)

    aws_dir = "/root/.aws"
    os.makedirs(aws_dir, exist_ok=True)
    shutil.copyfile(os.path.join(base_dir, aws_credentials_filename),
                    os.path.join(aws_dir, "credentials"))

    # strip() also removes a trailing "\r" from files saved with Windows line endings,
    # which would otherwise end up inside the license key or the secret.
    with open(os.path.join(base_dir, license_filename), "r") as f:
        os.environ["JSL_NLP_LICENSE"] = f.readline().strip()
    with open(os.path.join(base_dir, secret_filename), "r") as f:
        return f.readline().strip()

secret = setup_license_from_gdrive(mount_path, colab_path, aws_credentials_filename, license_filename)

# Installation of Spark NLP Enterprise and its Python dependencies
# ($version and $secret are expanded by IPython from the Python variables of the same name)
# NOTE(review): pip prints the index URL, so the secret ends up in the saved cell output — clear it before sharing
! pip install spark-nlp-jsl==$version --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Spark NLP Imports
from pyspark.sql import SparkSession
import sparknlp, sparknlp_jsl
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

# Creation of suitable SparkSession with proper JARS
# The licensed jar is requested for the same `version` that pip installed, so the Python
# package and the JVM jar cannot drift apart when `version` is changed in the config section.
spark = (
    SparkSession.builder
    .appName("Spark NLP Licensed")
    .master("local[*]")
    # NOTE(review): 32G may exceed the memory of the Colab VM — confirm or lower
    .config("spark.driver.memory", "32G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1G")
    # Open-source Spark NLP artifact, pinned independently of the licensed release
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    # Licensed jar, downloaded from the private repository identified by `secret`
    .config("spark.jars", f"https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl-{version}.jar")
    .getOrCreate()
)

# Report the versions actually loaded so mismatches are visible immediately
print("spark version:", spark.version)
print("spark-nlp version:", sparknlp.version())
print("spark-nlp-jsl version:", sparknlp_jsl.version())

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/[REDACTED]
Collecting spark-nlp-jsl==2.4.6
  Downloading https://pypi.johnsnowlabs.com/[REDACTED]/spark-nlp-jsl/spark-nlp-jsl-2.4.6.tar.gz
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c1

### Step 1. Prepare the environment
#### The previous cell should have taken care of all the setup

# ICD10 Entity Resolution - version 2.4.6

## Example for Named Entity Recognition with Entity Resolution Pipeline
A common NLP problem in biomedical applications is to identify the presence of clinical entities in a given text. These clinical entities could be diseases, symptoms, drugs, results of clinical investigations or others.

In this example we will use Spark-NLP to identify all the entities present in a typical clinical note.

The clinical note (taken from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6553675/) is as follows:

<div style="border:2px solid #747474; background-color: #e3e3e3; margin: 5px; padding: 10px"> 
<p>A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior episode of HTG-induced pancreatitis three years prior to presentation, and obesity with a body mass index (BMI) of 33.5 kg/m2, presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. Two weeks prior to presentation, she was treated with a five-day course of amoxicillin for a respiratory tract infection. She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months at the time of presentation.</p>

<p>Physical examination on presentation was significant for dry oral mucosa; significantly, her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent laboratory findings on admission were: serum glucose 111 mg/dl, bicarbonate 18 mmol/l, anion gap 20, creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, glycated hemoglobin (HbA1c) 10%, and venous pH 7.27. Serum lipase was normal at 43 U/L. Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia.</p>

<p>The patient was initially admitted for starvation ketosis, as she reported poor oral intake for three days prior to admission. However, serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL, the anion gap was still elevated at 21, serum bicarbonate was 16 mmol/L, triglyceride level peaked at 2050 mg/dL, and lipase was 52 U/L. The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again. The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL, within 24 hours. Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use. The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely. She had close follow-up with endocrinology post discharge.</p>
</div>

We will use Spark-NLP capabilities to identify a list of medical problems, treatments and medical tests, and then try to assign an ICD-10 code to each element of this list.

In [0]:
import sys, os, time
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.util import *
from sparknlp_jsl.annotator import *

from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel

### Step 2. Clinical NER Pipeline creation

In Spark-NLP annotating NLP happens through pipelines. Pipelines are made out of various Annotator steps. In our case the architecture of the Clinical Named Entity Recognition pipeline will be:

* DocumentAssembler (text -> document)
* SentenceDetector (document -> sentence)
* Tokenizer (sentence -> token)
* WordEmbeddingsModel ([sentence, token] -> embeddings)
* NerDLModel ([sentence, token, embeddings] -> ner)

So from a text we end up with a list of Named Entities (Patient problems, Treatments and Tests).

#### Step 2.1 Initialize all the annotators required by the pipeline

The first 3 annotators of the pipeline are "DocumentAssembler", "SentenceDetector" and "Tokenizer":

In [0]:
# Entry point of the pipeline: wraps the raw "text" column of the DataFrame
# into a "document" annotation that the NLP annotators can consume
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Splits each document into its sentences ("document" -> "sentence")
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Splits each sentence into tokens ("sentence" -> "token")
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

The fourth annotator in the pipeline is "WordEmbeddingsModel". We will download a pretrained model available from "clinical/models" named "embeddings_clinical".

When running this cell you are advised to be patient.

The first time you call this pretrained model it needs to be downloaded to your local machine.

The call will download the embeddings_clinical model, which takes a while.

The size is about 1.7Gb and will be saved typically in your home folder as

    ~HOMEFOLDER/cached_models/embeddings_clinical_en_2.0.2_2.4_1558454742956.zip

The next time you call it, the model is loaded from your cached copy, but even then it needs to be indexed each time, so expect to wait up to 5 minutes (depending on your machine).

In [4]:
# Pretrained clinical word embeddings ("embeddings_clinical" from "clinical/models").
# The underlying model is about 1.7Gb and is downloaded on first use.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


The fifth and final annotator in our NER pipeline is the pretrained "ner_clinical" NerDLModel available from "clinical/models". It requires as input the "sentence", "token" and "embeddings" (clinical embeddings pretrained model) and will classify each token into four categories:
<ol>
    <li>PROBLEM: for patient problems</li>
    <li>TEST: for tests, labs, etc.</li>
    <li>TREATMENT: for treatments, medicines, etc.</li>
    <li>OTHER: for the rest of tokens.</li>
</ol>

In order to split those identified NER that are consecutive, the B prefix (as B-PROBLEM) will be used at the first token of each NER. The I prefix (as I-PROBLEM) will be used for the rest of tokens inside the NER.

In [5]:
# Pretrained clinical NER model: tags every token using the sentence,
# the token itself and its clinical embedding ("ner" output column)
clinical_ner = (
    NerDLModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_clinical download started this may take some time.
Approximate size to download 13.8 MB
[OK!]


#### Step 2.2 Define the NER pipeline

Now we will define the actual pipeline that puts together the annotators we have created.

In [0]:
# Assemble the NER pipeline; stages are listed in the order in which
# each one produces the columns the next one consumes
pipeline_ner = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
])

### Step 3 Create a SparkDataFrame with the content

Now we will create a sample Spark dataframe with our clinical note example.

In this example we are working with a single clinical note. In production environments a table with several of those clinical notes could be distributed in a cluster and be run in large scale systems.

In [0]:
# reference https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6553675/
# The adjacent string literals inside the parentheses are concatenated into one single-line string.
# NOTE: this text contains the clause "associated with an acute hepatitis", which does not appear
# in the version of the note shown in the introduction of this notebook.

clinical_note = (
    'A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years '
    'prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior '
    'episode of HTG-induced pancreatitis three years prior to presentation, associated '
    'with an acute hepatitis, and obesity with a body mass index (BMI) of 33.5 kg/m2, '
    'presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. '
    'Two weeks prior to presentation, she was treated with a five-day course of amoxicillin '
    'for a respiratory tract infection. She was on metformin, glipizide, and dapagliflozin '
    'for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months '
    'at the time of presentation. Physical examination on presentation was significant for dry oral mucosa; '
    'significantly, her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent '
    'laboratory findings on admission were: serum glucose 111 mg/dl, bicarbonate 18 mmol/l, anion gap 20, '
    'creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, glycated hemoglobin (HbA1c) '
    '10%, and venous pH 7.27. Serum lipase was normal at 43 U/L. Serum acetone levels could not be assessed '
    'as blood samples kept hemolyzing due to significant lipemia. The patient was initially admitted for '
    'starvation ketosis, as she reported poor oral intake for three days prior to admission. However, '
    'serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL, the anion gap '
    'was still elevated at 21, serum bicarbonate was 16 mmol/L, triglyceride level peaked at 2050 mg/dL, and '
    'lipase was 52 U/L. The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - '
    'the original sample was centrifuged and the chylomicron layer removed prior to analysis due to '
    'interference from turbidity caused by lipemia again. The patient was treated with an insulin drip '
    'for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL, within '
    '24 hours. Her euDKA was thought to be precipitated by her respiratory tract infection in the setting '
    'of SGLT2 inhibitor use. The patient was seen by the endocrinology service and she was discharged on '
    '40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg '
    'two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely. She '
    'had close follow-up with endocrinology post discharge.'
)

# One-row, one-column DataFrame; the column is named "text" because that is the
# input column configured on the DocumentAssembler
data_ner = spark.createDataFrame([[clinical_note]]).toDF("text")

In [8]:
# Print the DataFrame as a text table to confirm the note was loaded
data_ner.show()

+--------------------+
|                text|
+--------------------+
|A 28-year-old fem...|
+--------------------+



### Step 4 Create a model fitting the NER pipeline with the clinical note.

Now we can use the pipeline and the clinical note to generate a model.
Even though the WordEmbeddings and NER models are pretrained models we need to call the fit method given we want the previous "estimators" in the pipeline to return the respective "transformers" (as in Estimator / Transformer ML API)

In [0]:
# We convert the pipeline into a model fitting our clinical note (data).
# fit() returns the fitted pipeline model that is used below for annotation.
model_ner = pipeline_ner.fit(data_ner)

### Step 5 Transform / annotate the clinical note using the model.

In order to process the data with the new created model we have two options.

The first one would be to use the model to transform our clinical note by the command:

<code>output = model_ner.transform(data_ner)</code>

That would save in a Spark DataFrame (output) the results of running the model over the clinical note.

However, for small tests like this or for real-time requests, a LightPipeline is a simpler way of managing the data. It will return a dictionary (instead of a Spark DataFrame) with the results of the transformation.

We will create a light_pipeline using our model_ner and then annotate the clinical_note using this light_pipeline.

In [0]:
# Wrap the fitted NER model in a LightPipeline so a plain Python string can be annotated
light_pipeline = LightPipeline(model_ner)
# The result is indexed by output column name below ('document', 'sentence', 'token', 'ner')
light_data = light_pipeline.annotate(clinical_note)

Now we have a dictionary (light_data) that contains the results of running the NER pipeline over our clinical note.

It contains the original document:

In [11]:
# First 100 characters of the first (and here only) document annotation
light_data['document'][0][0:100]

'A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to '

The 17 sentences:

In [12]:
sentences = light_data['sentence']
print("Number of sentences: {}".format(len(sentences)))
print("")
# Preview at most the first 5 sentences, 80 characters each.
# Slicing (instead of range(5)) avoids an IndexError when a note has fewer than 5 sentences.
for i, sentence in enumerate(sentences[:5]):
    print("Sentence {}: {}".format(i, sentence[0:80]))

Number of sentences: 17

Sentence 0: A 28-year-old female with a history of gestational diabetes mellitus diagnosed e
Sentence 1: Two weeks prior to presentation, she was treated with a five-day course of amoxi
Sentence 2: She was on metformin, glipizide, and dapagliflozin for T2DM and atorvastatin and
Sentence 3: She had been on dapagliflozin for six months at the time of presentation.
Sentence 4: Physical examination on presentation was significant for dry oral mucosa;


And the 436 tokens with their assigned NER class.

In [13]:
tokens, ner_tags = light_data['token'], light_data['ner']
print("Number of tokens: {}".format(len(tokens)))
print("")
# Preview at most the first 25 tokens next to their NER tag.
# Slicing + zip (instead of range(25)) avoids an IndexError on texts shorter than 25 tokens.
for i, (token, tag) in enumerate(zip(tokens[:25], ner_tags)):
    print("Token {}: {} ({})".format(i, token[0:20], tag))
print("...")

Number of tokens: 436

Token 0: A (O)
Token 1: 28-year-old (O)
Token 2: female (O)
Token 3: with (O)
Token 4: a (O)
Token 5: history (O)
Token 6: of (O)
Token 7: gestational (B-PROBLEM)
Token 8: diabetes (I-PROBLEM)
Token 9: mellitus (I-PROBLEM)
Token 10: diagnosed (O)
Token 11: eight (O)
Token 12: years (O)
Token 13: prior (O)
Token 14: to (O)
Token 15: presentation (O)
Token 16: and (O)
Token 17: subsequent (O)
Token 18: type (B-PROBLEM)
Token 19: two (I-PROBLEM)
Token 20: diabetes (I-PROBLEM)
Token 21: mellitus (I-PROBLEM)
Token 22: ( (O)
Token 23: T2DM (B-PROBLEM)
Token 24: ), (I-PROBLEM)
...


Let's apply some HTML formatting to see the results of the pipeline in a nicer layout:

In [14]:
from html import escape
from IPython.display import display, HTML

# Background colour used to highlight each entity type. Tokens tagged "O"
# (or carrying an entity type not listed here) are rendered without highlighting.
NER_COLORS = {'PROBLEM': '#ffffcc', 'TEST': 'pink', 'TREATMENT': '#cce6ff'}

problem_list = []  # one list of tokens per PROBLEM entity, in document order
new_problem = []   # tokens of the PROBLEM entity currently being collected
html_tokens = []   # rendered HTML fragment for every token

for this_token, tag in zip(light_data['token'], light_data['ner']):
    # 'B-PROBLEM' -> ('B', 'PROBLEM'); 'O' -> ('O', '')
    prefix, _, entity = tag.partition('-')

    # --- Collect PROBLEM entities ---
    if entity == 'PROBLEM':
        if prefix == 'B' and new_problem:
            # A B- tag always opens a new entity, even when it directly follows
            # another PROBLEM token; close the previous entity first.
            problem_list.append(new_problem)
            new_problem = []
        new_problem.append(this_token)
    elif new_problem:
        # First non-PROBLEM token after an entity closes it
        problem_list.append(new_problem)
        new_problem = []

    # --- Render the token ---
    # Tokens come from free text, so escape them before embedding them in HTML
    safe_token = escape(this_token)
    color = NER_COLORS.get(entity)
    if color is None:
        html_tokens.append(safe_token + " ")
    else:
        html_tokens.append('<SPAN style="background-color: {}">{} </SPAN>'.format(color, safe_token))

# Flush an entity that runs up to the very last token of the text
if new_problem:
    problem_list.append(new_problem)
    new_problem = []

html_output = '<center><h2>Results of NER Annotation Pipeline</h2></center>'
html_output += '<div style="border:2px solid #747474; background-color: #e3e3e3; margin: 5px; padding: 10px">'
html_output += ''.join(html_tokens)
html_output += '</div>'

# Legend (the enclosing <div> is now closed as well)
html_output += '<div>Color codes: <SPAN style="background-color: #ffffcc">Patient problem</SPAN>, '
html_output += '<SPAN style="background-color: pink">Test</SPAN>, '
html_output += '<SPAN style="background-color: #cce6ff">Treatment</SPAN>'
html_output += '</div>'

display(HTML(html_output))

### Step 6 ICD10 coding Pipeline creation.

After running the NER Pipeline we have been able to extract a list of "Patient Problems" that is printed below:

In [15]:
# One extracted PROBLEM entity per line, its tokens separated by single spaces
for problem in problem_list:
    print(*problem)

gestational diabetes mellitus
type two diabetes mellitus
T2DM ),
HTG-induced pancreatitis
an acute hepatitis
obesity
a body mass index
BMI ) of 33.5 kg/m2
polyuria
polydipsia
poor appetite
vomiting
a respiratory tract infection
T2DM
HTG
tenderness
guarding
rigidity
significant lipemia
starvation ketosis
poor oral intake
still elevated
interference from turbidity
lipemia
euDKA
HTG
a reduction in the anion gap
Her euDKA
her respiratory tract infection


We will now create a new pipeline that, for each of these problems, will try to assign an ICD10 code based on the content, the word embeddings and some pretrained models for ICD10 annotation.

The architecture of this new pipeline will be as follows:
* DocumentAssembler (text -> document)
* SentenceDetector (document -> sentence)
* Tokenizer (sentence -> token)
* WordEmbeddingsModel ([sentence, token] -> embeddings)
* NerDLModel ([sentence, token, embeddings] -> ner)
* NerConverter ([sentence, token, ner] -> ner_chunk)
* ChunkTokenizer (ner_chunk -> ner_chunk_tokenized)
* ICD10CMEntityResolverModel ([ner_chunk_tokenized, embeddings] -> resolution)
* ICD10PCSEntityResolverModel ([ner_chunk_tokenized, embeddings] -> resolution)

So from a text we end up with a list of Named Entities (ner_chunk) and their ICD10 codes (resolution)

Most of the annotators in this pipeline have already been created for the previous pipeline, but we need to create four additional annotators: NerConverter, ChunkEmbeddings, EntityResolverModel for ICD10CM and EntityResolverModel for ICD10PCS.

Now we define the new pipeline

In [16]:
# Converts the token-level "ner" tags into entity chunks ("ner_chunk"),
# which is the unit the resolution steps below work on
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Chunk-level embeddings computed from the chunks and the token embeddings
chunk_embeddings = (
    ChunkEmbeddings()
    .setInputCols("ner_chunk", "embeddings")
    .setOutputCol("chunk_embeddings")
)

# Tokenizes every chunk on its own ("ner_chunk" -> "ner_token")
chunk_tokenizer = (
    ChunkTokenizer()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_token")
)

# Pretrained ICD10-CM resolver: uses cosine distance and 5 neighbours,
# writing its result to "icd10cm_code"
icd10cm_resolution = (
    ChunkEntityResolverModel.pretrained("chunkresolve_icd10cm_clinical", "en", "clinical/models")
    .setInputCols(["ner_token", "chunk_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("COSINE")
    .setNeighbours(5)
)

chunkresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 166.3 MB
[OK!]


In [17]:
# Pretrained RxNorm resolver; consumes the same chunk tokens and chunk embeddings
# as the ICD10-CM resolver and writes to "rxnorm_result"
rxnorm_resolution = (
    EnsembleEntityResolverModel.pretrained("ensembleresolve_rxnorm_clinical", "en", "clinical/models")
    .setInputCols("ner_token", "chunk_embeddings")
    .setOutputCol("rxnorm_result")
)

ensembleresolve_rxnorm_clinical download started this may take some time.
Approximate size to download 783.3 MB
[OK!]


In [0]:
# ICD10 pipeline: the five NER stages followed by chunking, chunk embeddings,
# chunk tokenization and the ICD10-CM resolver
pipeline_icd10 = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
    chunk_embeddings,
    chunk_tokenizer,
    icd10cm_resolution,
])

# Fit on the clinical note DataFrame to obtain the pipeline model
model_icd10 = pipeline_icd10.fit(data_ner)

For each of the patient problems identified in the clinical note, we can run the ICD10 resolution pipeline and in case we found a candidate ICD10 code we print the results.

In [0]:
# Wrap the fitted ICD10 model in a LightPipeline so plain strings can be annotated
light_pipeline_icd10 = LightPipeline(model_icd10)

In [20]:
# Rebuild the plain text of every PROBLEM entity, then resolve each one on its own
problem_list_str = list(map(" ".join, problem_list))

print("====================================================================================")
print("ICD10 codes identified for the list of patients problems found in the clinical note.")
print("====================================================================================")

for problem in problem_list_str:
    this_r = light_pipeline_icd10.annotate(problem)
    codes = this_r['icd10cm_code']
    if not codes:
        # No ICD10-CM candidate for this problem: nothing to report
        continue
    # Report only the first resolved code next to the first detected sentence
    print("{} >>> ICD10CM: {}".format(this_r['sentence'][0], codes[0]))

ICD10 codes identified for the list of patients problems found in the clinical note.
gestational diabetes mellitus >>> ICD10CM: P702
type two diabetes mellitus >>> ICD10CM: E1142
T2DM ), >>> ICD10CM: E1121
HTG-induced pancreatitis >>> ICD10CM: B252
an acute hepatitis >>> ICD10CM: B172
obesity >>> ICD10CM: E661
a body mass index >>> ICD10CM: Z681
BMI ) of 33.5 kg/m2 >>> ICD10CM: Z6828
polyuria >>> ICD10CM: R358
polydipsia >>> ICD10CM: R631
poor appetite >>> ICD10CM: R630
vomiting >>> ICD10CM: R1114
a respiratory tract infection >>> ICD10CM: J989
T2DM >>> ICD10CM: E1121
HTG >>> ICD10CM: E781
tenderness >>> ICD10CM: R10815
guarding >>> ICD10CM: R1083
rigidity >>> ICD10CM: R1935
significant lipemia >>> ICD10CM: E891
starvation ketosis >>> ICD10CM: E71121
poor oral intake >>> ICD10CM: F5082
still elevated >>> ICD10CM: E7841
interference from turbidity >>> ICD10CM: M2656
lipemia >>> ICD10CM: R81
euDKA >>> ICD10CM: R0602
HTG >>> ICD10CM: E781
a reduction in the anion gap >>> ICD10CM: M2653
He

If some of the results look imprecise because of the semantic / syntactic nature of the recognized chunks, you can always take a deeper look at all the alternatives.

One possible way, would be using the provided metadata like the following:

In [0]:
# Run the full ICD10 pipeline over the note DataFrame; cached because it is queried more than once below
output = model_icd10.transform(data_ner).cache()

In [24]:
# Zip chunk and ICD10-CM annotations position by position and explode to one row per chunk.
# In the zipped struct: '0' = chunk text, '1' = chunk metadata, '2' = code, '3' = code metadata.
# NOTE(review): confidence_ratio comes from annotation metadata; if it is stored as a string
# the ordering below is lexicographic rather than numeric — confirm.
(
    output
    .select(
        F.explode(
            F.arrays_zip("ner_chunk.result", "ner_chunk.metadata",
                         "icd10cm_code.result", "icd10cm_code.metadata")
        ).alias("icd10cm_result")
    )
    .select(
        F.expr("icd10cm_result['0']").alias("chunk"),
        F.expr("icd10cm_result['1'].entity").alias("entity"),
        F.expr("icd10cm_result['3'].resolved_text").alias("resolved_text"),
        F.expr("icd10cm_result['2']").alias("code"),
        F.expr("icd10cm_result['3'].all_k_resolutions").alias("cms"),
        F.expr("icd10cm_result['3'].confidence_ratio").alias("confidence_ratio"),
    )
    # Keep only PROBLEM chunks that actually received a code
    .filter(F.expr("entity='PROBLEM' and code is not null").alias("entity"))
    .distinct()
    .orderBy("confidence_ratio", ascending=False)
    .toPandas()
)

Unnamed: 0,chunk,entity,resolved_text,code,cms,confidence_ratio
0,polyuria,PROBLEM,Other polyuria,R358,Other polyuria:::Polydipsia:::Generalized edem...,3.4604
1,interference from turbidity,PROBLEM,Non-working side interference,M2656,Non-working side interference:::Hemoglobinuria...,1.7357
2,an acute hepatitis,PROBLEM,Acute hepatitis E,B172,Acute hepatitis E:::Autoimmune hepatitis:::Mum...,1.573
3,starvation ketosis,PROBLEM,Propionic acidemia,E71121,Propionic acidemia:::Bartter's syndrome:::Hypo...,1.4464
4,rigidity,PROBLEM,Periumbilic abdominal rigidity,R1935,Periumbilic abdominal rigidity:::Generalized a...,1.2509
5,HTG-induced pancreatitis,PROBLEM,Cytomegaloviral pancreatitis,B252,Cytomegaloviral pancreatitis:::Alcohol-induced...,1.1966
6,still elevated,PROBLEM,Elevated Lipoprotein(a),E7841,"Elevated Lipoprotein(a):::Fever, unspecified::...",1.172
7,poor appetite,PROBLEM,Anorexia,R630,"Anorexia:::Nutritional deficiency, unspecified...",1.1651
8,type two diabetes mellitus,PROBLEM,Type 2 diabetes mellitus with diabetic polyneu...,E1142,Type 2 diabetes mellitus with diabetic polyneu...,1.1382
9,gestational diabetes mellitus,PROBLEM,Neonatal diabetes mellitus,P702,Neonatal diabetes mellitus:::Neonatal diabetes...,1.1146


In [0]:
# Apply the pretrained RxNorm resolver on top of the ICD10 output; cached for the query below
output_rx = rxnorm_resolution.transform(output).cache()

In [27]:
# Same layout as the ICD10 table: zip chunk and RxNorm annotations, one row per chunk.
# In the zipped struct: '0' = chunk text, '1' = chunk metadata, '2' = code, '3' = code metadata.
(
    output_rx
    .select(
        F.explode(
            F.arrays_zip("ner_chunk.result", "ner_chunk.metadata",
                         "rxnorm_result.result", "rxnorm_result.metadata")
        ).alias("rxnorm_result")
    )
    .select(
        F.expr("rxnorm_result['0']").alias("chunk"),
        F.expr("rxnorm_result['1'].entity").alias("entity"),
        F.expr("rxnorm_result['3'].all_k_resolutions").alias("target_text"),
        F.expr("rxnorm_result['2']").alias("code"),
        F.expr("rxnorm_result['3'].alternative_codes").alias("alternative_codes"),
        F.expr("rxnorm_result['3'].confidence_ratio").alias("conf"),
    )
    # RxNorm codes are only meaningful for TREATMENT chunks
    .filter(F.expr("entity in ('TREATMENT')").alias("entity"))
    .distinct()
    .orderBy("conf", ascending=False)
    .toPandas()
)

Unnamed: 0,chunk,entity,target_text,code,alternative_codes,conf
0,all SGLT2 inhibitors,TREATMENT,MAO inhibitors:::Prolactine inhibitors:::Renin...,1431605,,1.0715
1,SGLT2 inhibitor,TREATMENT,Protease inhibitor:::Enzyme inhibitor:::Phosph...,386943,,1.0033
2,dapagliflozin,TREATMENT,DAPAGLIFLOZIN:::dapagliflozin:::Dapagliflozin:...,1488564,,1.0
3,an insulin drip,TREATMENT,NPH insulin:::NPH Insulin:::insulin argine:::I...,1605101,,1.0
4,gemfibrozil,TREATMENT,Gemfibrozil:::GEMFIBROZIL:::gemfibrozil:::Gemf...,4719,,1.0
5,glipizide,TREATMENT,Glipizide:::GLIPIZIDE:::glipizide:::Glipizide:...,4821,,1.0
6,amoxicillin,TREATMENT,Amoxicillin:::amoxicillin:::AMOXICILLIN:::Amox...,723,,1.0
7,insulin lispro,TREATMENT,Lispro insulin:::Insulin Lispro:::insulin lisp...,86009,,1.0
8,insulin glargine,TREATMENT,Insulin Glargine:::insulin glargine:::Insulin ...,274783,,1.0
9,metformin,TREATMENT,Metformin:::metformin:::METFORMIN:::Metformin:...,6809,,1.0
