![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Clinical Entity Resolvers

In [3]:
import os

import sparknlp

# Record the Spark NLP version this notebook was run with.
print(sparknlp.version())

import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

In [4]:
# Only needed if you want to load the licensed models from S3 with your license key:
# installs/upgrades the AWS CLI, which a later cell uses (`aws s3 cp`) to download a model archive.

! pip install awscli --upgrade

In [5]:
# Display the active Spark session. `spark` is not created anywhere in this notebook,
# so it is presumably provided by the runtime (e.g. Databricks) — TODO confirm.
spark

# Clinical Resolvers

## Entity Resolvers for ICD-10

In [8]:
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F
import string
import numpy as np
import sparknlp
from sparknlp.util import *
from sparknlp.pretrained import ResourceDownloader
from pyspark.sql import functions as F
from sparknlp_jsl.annotator import *

In [9]:
from sparknlp_jsl.annotator import *

# --- Base stages reused by the resolver pipelines defined further down ---

# Turn the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences (a single input row may hold several).
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Split each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained clinical word embeddings (trained on PubMed, per the original note).
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Pretrained clinical NER model (trained on i2b2, sampled from MIMIC, per the original note).
clinical_ner = (
    NerDLModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)


In [10]:
# Download the zipped ICD-10-CM chunk resolver model from S3, keeping the same file name locally.
!aws s3 cp s3://auxdata.johnsnowlabs.com/clinical/models/chunkresolve_icd10cm_clinical_en_2.4.2_2.4_1583085234727.zip chunkresolve_icd10cm_clinical_en_2.4.2_2.4_1583085234727.zip

In [11]:
!unzip chunkresolve_icd10cm_clinical_en_2.4.2_2.4_1583085234727.zip -d chunkresolve_icd10cm

In [12]:
# List the driver's local working directory to check that the archive was
# downloaded and extracted (`%fs ls` is shorthand for `dbutils.fs.ls`).
display(dbutils.fs.ls("file:/databricks/driver"))

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/ganglia/,ganglia/,4096
file:/databricks/driver/derby.log,derby.log,726
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/pubmed_sample_text_small.csv,pubmed_sample_text_small.csv,9363435
file:/databricks/driver/i2b2_assertion_sample.csv,i2b2_assertion_sample.csv,141585
file:/databricks/driver/chunkresolve_icd10cm_clinical_en_2.4.2_2.4_1583085234727.zip,chunkresolve_icd10cm_clinical_en_2.4.2_2.4_1583085234727.zip,105024355
file:/databricks/driver/chunkresolve_icd10cm/,chunkresolve_icd10cm/,4096
file:/databricks/driver/logs/,logs/,4096


In [13]:
# Copy the extracted model directory from the driver's local disk to DBFS,
# which is where the resolver is loaded from in a later cell.
dbutils.fs.cp(
    "file:/databricks/driver/chunkresolve_icd10cm",
    "dbfs:/chunkresolve_icd10cm",
    recurse=True,
)

In [14]:
# List the DBFS root to confirm the model directory was copied
# (`%fs ls` is shorthand for `dbutils.fs.ls`).
display(dbutils.fs.ls("dbfs:/"))

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/GMB_NER.conll,GMB_NER.conll,1663772
dbfs:/andres/,andres/,0
dbfs:/chunkresolve_icd10cm/,chunkresolve_icd10cm/,0
dbfs:/cluster-logs/,cluster-logs/,0
dbfs:/context-spell-checker/,context-spell-checker/,0
dbfs:/databricks/,databricks/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/datasets/,datasets/,0


In [15]:
# Group NER tags into chunks, keeping only chunks labelled PROBLEM.
icd_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['PROBLEM'])
)

# Embeddings per chunk, built from the chunk and the word embeddings columns.
chunk_embeddings = (
    ChunkEmbeddings()
    .setInputCols("ner_chunk", "embeddings")
    .setOutputCol("chunk_embeddings")
)

# Tokenize inside each chunk; the resolvers consume "ner_token".
chunk_tokenizer = (
    ChunkTokenizer()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_token")
)

# ICD-10-CM resolver loaded from the DBFS copy made earlier in the notebook.
entity_resolver_icd10 = (
    ChunkEntityResolverModel.load("dbfs:/chunkresolve_icd10cm")
    .setInputCols(["ner_token", "chunk_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("COSINE")
)

pipeline_icd10 = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    icd_ner_converter,
    chunk_embeddings,
    chunk_tokenizer,
    entity_resolver_icd10,
])

# Single-row frame with an empty "text" value, reused to fit every pipeline in this notebook.
# NOTE(review): presumably enough because the model stages are pretrained — confirm.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model_icd10 = pipeline_icd10.fit(empty_data)


In [16]:
%sh
# Fetch the small PubMed sample CSV that the next cell reads.
wget 	https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed_sample_text_small.csv


In [17]:
import pyspark.sql.functions as F

# Load the PubMed sample (first CSV row is the header) and spread it over 4 partitions.
# NOTE(review): the path is relative, so where it resolves depends on Spark's default
# filesystem — confirm the downloaded file is visible there.
pubMedDF = (
    spark.read
    .option("header", "true")
    .csv("pubmed_sample_text_small.csv")
    .repartition(4)
)

# Preview the data, truncating each cell to 50 characters.
pubMedDF.show(truncate=50)

In [18]:
# Run the ICD-10 pipeline on at most 100 rows to keep the demo cheap.
pubmed_sample = pubMedDF.limit(100)
result = model_icd10.transform(pubmed_sample)

In [19]:
# Preview the annotated frame (same as the bare `show()` defaults, spelled out).
result.show(n=20, truncate=True)

In [20]:
def get_icd10_codes(model, text):
    """Run ``model`` on a single text and return its ICD-10 resolutions as a pandas DataFrame.

    Args:
        model: fitted pipeline whose output has ``ner_chunk`` and ``icd10cm_code`` columns.
        text: the string to annotate; it becomes the only row of the input frame.

    Returns:
        pandas.DataFrame with the distinct rows of the columns ``ner``, ``entity``,
        ``begin``, ``end``, ``code``, ``resolved_text``, ``alternative_codes`` and
        ``confidence_ratio`` (rounded to 2 decimals).
    """
    single_row = spark.createDataFrame([[text]]).toDF("text")
    annotated = model.transform(single_row)

    # Zip chunk fields and resolver fields position by position.
    # NOTE(review): assumes the resolver emits its annotations in the same order
    # as the chunks — confirm, otherwise codes pair with the wrong chunk.
    zipped = F.arrays_zip(
        "ner_chunk.begin",
        "ner_chunk.end",
        "ner_chunk.result",
        "ner_chunk.metadata",
        "icd10cm_code.result",
        "icd10cm_code.metadata",
    )
    exploded = annotated.select(F.explode(zipped).alias("icd10cm_result"))

    # Struct fields are addressed by the position they had in the arrays_zip call.
    projected = exploded.select(
        F.expr("icd10cm_result['2']").alias("ner"),
        F.expr("icd10cm_result['3'].entity").alias("entity"),
        F.expr("icd10cm_result['0']").alias("begin"),
        F.expr("icd10cm_result['1']").alias("end"),
        F.expr("icd10cm_result['4']").alias("code"),
        F.expr("icd10cm_result['5'].resolved_text").alias("resolved_text"),
        F.expr("icd10cm_result['5'].alternative_codes").alias("alternative_codes"),
        F.expr("round(icd10cm_result['5'].confidence_ratio,2)").alias("confidence_ratio"),
    )

    return projected.distinct().toPandas()

In [21]:
# Short example sentence for the ICD-10 resolver.
text = "He has a brain damage and lung cancer"

df = get_icd10_codes(model=model_icd10, text=text)

# Last expression of the cell: renders the resulting pandas DataFrame.
df

Unnamed: 0,ner,entity,begin,end,code,resolved_text,alternative_codes,confidence_ratio
0,lung cancer,PROBLEM,26,36,C220,Liver cell carcinoma,C61:|:C61:|:J675:|:J674:|:J670:|:J672:|:J670:|...,1.38
1,a brain damage,PROBLEM,7,20,Z87820,Personal history of traumatic brain injury,P112:|:Z13850:|:P112:|:G931:|:P111:|:S062X7A:|...,1.02


## RxNorm Resolver

In [23]:
# Level 1: pretrained classifier that reads "ner_token" and writes a "partition" column.
rxnorm_resolution_l1 = (
    DocumentLogRegClassifierModel.pretrained("resolve_rxnorm_clinical_l1", "en", "clinical/models")
    .setInputCols("ner_token")
    .setOutputCol("partition")
)

# Level 2: pretrained pipeline downloaded as a whole.
rxnorm_resolution_l2 = ResourceDownloader.downloadPipeline("resolve_rxnorm_clinical_l2", "en", "clinical/models")

# Chain both levels into one stage that can be appended to a pipeline.
rxnorm_resolution = PipelineModel([
    rxnorm_resolution_l1,
    RecursivePipelineModel(rxnorm_resolution_l2),
])

# Posology NER; this variable is also reused by the SNOMED pipeline below.
posology_ner_model_large = (
    NerDLModel.pretrained('ner_posology', "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Keep only the chunks labelled DRUG.
drug_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['DRUG'])
)

pipeline_rx = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    posology_ner_model_large,
    drug_converter,
    chunk_embeddings,
    chunk_tokenizer,
    rxnorm_resolution,
])

model_rxnorm = pipeline_rx.fit(empty_data)


In [24]:

def get_rxnorm_codes(text, model=None):
    """Run an RxNorm pipeline on a single text and return its resolutions as a pandas DataFrame.

    Args:
        text: the string to annotate; it becomes the only row of the input frame.
        model: optional fitted pipeline whose output has ``ner_chunk`` and
            ``rxnorm_code`` columns. Defaults to the notebook-level ``model_rxnorm``,
            so existing ``get_rxnorm_codes(text)`` calls behave as before.

    Returns:
        pandas.DataFrame with the distinct rows of the columns ``ner``, ``entity``,
        ``begin``, ``end``, ``code``, ``resolved_text``, ``alternative_codes`` and
        ``confidence_ratio`` (rounded to 2 decimals).
    """
    if model is None:
        # Looked up at call time rather than bound as a default argument, so the
        # function always uses the current notebook-level model.
        model = model_rxnorm

    data = spark.createDataFrame([[text]]).toDF("text")

    output = model.transform(data)

    # Zip chunk fields and resolver fields position by position.
    # NOTE(review): assumes the resolver emits its annotations in the same order
    # as the chunks — confirm, otherwise codes pair with the wrong chunk.
    zipped = F.arrays_zip(
        "ner_chunk.begin",
        "ner_chunk.end",
        "ner_chunk.result",
        "ner_chunk.metadata",
        "rxnorm_code.result",
        "rxnorm_code.metadata",
    )

    # Struct fields are addressed by the position they had in the arrays_zip call.
    result = (
        output.select(F.explode(zipped).alias("rxnorm_result"))
        .select(
            F.expr("rxnorm_result['2']").alias("ner"),
            F.expr("rxnorm_result['3'].entity").alias("entity"),
            F.expr("rxnorm_result['0']").alias("begin"),
            F.expr("rxnorm_result['1']").alias("end"),
            F.expr("rxnorm_result['4']").alias("code"),
            F.expr("rxnorm_result['5'].resolved_text").alias("resolved_text"),
            F.expr("rxnorm_result['5'].alternative_codes").alias("alternative_codes"),
            F.expr("round(rxnorm_result['5'].confidence_ratio,2)").alias("confidence_ratio"),
        )
        .distinct()
        .toPandas()
    )

    return result



In [25]:

# Example sentence that mentions two drugs.
text = "He has a brain damage and needs to take an Advil and Aspirin"

# Last expression of the cell: renders the resulting pandas DataFrame.
get_rxnorm_codes(text=text)

Unnamed: 0,ner,entity,begin,end,code,resolved_text,alternative_codes,confidence_ratio
0,Advil,DRUG,43,47,352893,phoslo gelcap,1941952:|:1318187:|:827207:|:19711,1.01
1,Aspirin,DRUG,53,59,238143,aspirin pwdr,238143:|:687078:|:1986390:|:226716,1.0


## SNOMED Resolver

In [27]:
 #SNOMED Resolution
# Level 1: pretrained classifier that reads "ner_token" and writes a "partition" column.
ner_snomed_resolver_l1 = (
    DocumentLogRegClassifierModel.pretrained("resolve_snomed_clinical_l1", "en", "clinical/models")
    .setInputCols("ner_token")
    .setOutputCol("partition")
)

# Level 2: pretrained pipeline downloaded as a whole.
ner_snomed_resolver_l2 = ResourceDownloader.downloadPipeline("resolve_snomed_clinical_l2", "en", "clinical/models")

# Point the last stage of the downloaded pipeline at the columns produced in this notebook.
# Kept as the final expression so the cell still displays the returned stage.
ner_snomed_resolver_l2.stages[-1].setInputCols("partition", "ner_token", "chunk_embeddings")


In [28]:
from pyspark.ml import PipelineModel

# Chain the level-1 classifier and the level-2 pipeline into a single stage.
snomed_resolution = PipelineModel([
    ner_snomed_resolver_l1,
    RecursivePipelineModel(ner_snomed_resolver_l2),
])


In [29]:
# Group NER tags into chunks; no whitelist is set here, unlike the ICD-10 and RxNorm converters.
clinical_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)


def get_snomed_model():
    """Assemble the SNOMED pipeline from the notebook-level stages and fit it on ``empty_data``.

    Returns:
        The fitted pipeline model.
    """
    # NOTE(review): the NER stage is the posology model although the converter is
    # named "clinical" — confirm this pairing is intended.
    snomed_stages = [
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        posology_ner_model_large,
        clinical_converter,
        chunk_embeddings,
        chunk_tokenizer,
        snomed_resolution,
    ]

    return Pipeline(stages=snomed_stages).fit(empty_data)


model_snomed = get_snomed_model()

In [30]:
def get_snomed_codes(text, model=None):
    """Run a SNOMED pipeline on a single text and return its resolutions as a pandas DataFrame.

    Args:
        text: the string to annotate; it becomes the only row of the input frame.
        model: optional fitted pipeline whose output has ``ner_chunk`` and
            ``snomed_code`` columns. Defaults to the notebook-level ``model_snomed``,
            so existing ``get_snomed_codes(text)`` calls behave as before.

    Returns:
        pandas.DataFrame with the distinct rows of the columns ``ner``, ``entity``,
        ``begin``, ``end``, ``code``, ``resolved_text``, ``alternative_codes`` and
        ``confidence_ratio`` (rounded to 2 decimals).
    """
    if model is None:
        # Looked up at call time rather than bound as a default argument, so the
        # function always uses the current notebook-level model.
        model = model_snomed

    data = spark.createDataFrame([[text]]).toDF("text")

    output = model.transform(data)

    # Zip chunk fields and resolver fields position by position.
    # NOTE(review): assumes the resolver emits its annotations in the same order
    # as the chunks — confirm, otherwise codes pair with the wrong chunk.
    zipped = F.arrays_zip(
        "ner_chunk.begin",
        "ner_chunk.end",
        "ner_chunk.result",
        "ner_chunk.metadata",
        "snomed_code.result",
        "snomed_code.metadata",
    )

    # Struct fields are addressed by the position they had in the arrays_zip call.
    result = (
        output.select(F.explode(zipped).alias("snomed_result"))
        .select(
            F.expr("snomed_result['2']").alias("ner"),
            F.expr("snomed_result['3'].entity").alias("entity"),
            F.expr("snomed_result['0']").alias("begin"),
            F.expr("snomed_result['1']").alias("end"),
            F.expr("snomed_result['4']").alias("code"),
            F.expr("snomed_result['5'].resolved_text").alias("resolved_text"),
            F.expr("snomed_result['5'].alternative_codes").alias("alternative_codes"),
            F.expr("round(snomed_result['5'].confidence_ratio,2)").alias("confidence_ratio"),
        )
        .distinct()
        .toPandas()
    )

    return result


In [31]:
# Example sentence for the SNOMED resolver.
text = "He has a brain damage and needs to take Parol or Aspirin"

df = get_snomed_codes(text=text)

# Last expression of the cell: renders the resulting pandas DataFrame.
df

Unnamed: 0,ner,entity,begin,end,code,resolved_text,alternative_codes,confidence_ratio
0,Aspirin,DRUG,49,55,312452009.0,aspirin prophylaxis,717854002:|:413081008:|:243195000:|:243195000,1.05
1,Parol,DRUG,40,44,,,,
