![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/ICD10CM_MS_DRG_MAPPER.ipynb)

# Colab Setup

In [1]:
import json, os
from google.colab import files

# Ask for the license file only when it is not already present in the
# working directory; whatever file gets uploaded is renamed to the
# expected 'spark_jsl.json'.
if 'spark_jsl.json' not in os.listdir():
    license_keys = files.upload()
    os.rename(list(license_keys)[0], 'spark_jsl.json')

# Parse the license file; from here on `license_keys` is the parsed JSON.
with open('spark_jsl.json') as license_file:
    license_keys = json.load(license_file)

# Publish every license entry twice: as a variable in this cell's namespace
# and as a process environment variable.
locals().update(license_keys)
os.environ.update(license_keys)

In [2]:
# Installing pyspark (pinned to 3.1.2) and the spark-nlp release named by $PUBLIC_VERSION.
# NOTE(review): $PUBLIC_VERSION, $JSL_VERSION and $SECRET are presumably entries of the
# license JSON that the setup cell exports as variables / environment variables —
# confirm the license file provides all three.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare (spark-nlp-jsl) at $JSL_VERSION; the extra
# package index URL embeds $SECRET.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization (no version pin on this one)
! pip install -q spark-nlp-display

In [3]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql.types import StringType, IntegerType

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Spark settings handed to the session builder below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the session through the Healthcare library, authenticating with the
# SECRET entry of the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Report the library versions that were actually loaded.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: display the session object.
spark

Spark NLP Version : 5.1.0
Spark NLP_JSL Version : 5.1.0


In [4]:
# Turn the raw "text" column into a "document" annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained mapper applied directly to the document; only the "ms-drg"
# relation is requested.
chunkMapper = (
    DocMapperModel.pretrained("icd10cm_ms_drg_mapper", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("mappings")
    .setRels(["ms-drg"])
)

pipeline2 = Pipeline(stages=[document_assembler, chunkMapper])

# Fit on a single empty row to obtain a PipelineModel.
model2 = pipeline2.fit(spark.createDataFrame([['']]).toDF('text'))

# LightPipeline wrapper used by the following cells to annotate plain strings.
lp2 = LightPipeline(model2)

icd10cm_ms_drg_mapper download started this may take some time.
[OK!]


# ICD10CM Code with mapper

In [5]:
# Sample ICD-10-CM codes (written without the dot) to send through the mapper.
example_codes = [
    "A0102", "A157", "B0872", "B160", "C004", "D0005",
    "E002", "E15", "F0631", "F28", "G003", "G10",
    "H00031", "H10421", "I00", "I052", "L00", "L104",
    "M0020", "M2654", "N005", "N10", "O00202", "O10311",
    "P0511", "P103", "Q001", "Q104", "U071", "U099",
    "T513X1A", "K005", "R253", "S1084XA",
]


In [6]:
# Annotate all sample codes with the mapper light pipeline in a single call.
final_res = lp2.fullAnnotate(example_codes)

In [7]:
# Build exactly one entry per input code so `res_list` always lines up with
# `example_codes` when both are placed side by side in a DataFrame.
# Flattening every annotation into a single list breaks that alignment as
# soon as a code yields zero or several mapping annotations; here multiple
# results are joined with ", " and a code without any annotation gives "".
res_list = [
    ", ".join(annotation.result for annotation in code['mappings'])
    for code in final_res
]

In [8]:
# Pair each input code with its mapper output, one row per code.
res_df = pd.DataFrame(
    {
        'icd10cm_code': example_codes,
        'mappings': res_list,
    }
)

In [9]:
# Show up to the first 50 rows of the code -> mapping table.
res_df.head(n=50)

Unnamed: 0,icd10cm_code,mappings
0,A0102,Typhoid fever with heart involvement
1,A157,Primary respiratory tuberculosis
2,B0872,Yaba pox virus disease
3,B160,Acute hepatitis B with delta-agent with hepatic coma
4,C004,"Malignant neoplasm of lower lip, inner aspect"
5,D0005,Carcinoma in situ of hard palate
6,E002,"Congenital iodine-deficiency syndrome, mixed type"
7,E15,Nondiabetic hypoglycemic coma
8,F0631,Mood disorder due to known physiological condition with depressive features
9,F28,Other psychotic disorder not due to a substance or known physiological condition


# ICD10CM Code with resolver and mapper

## sbiobertresolve_icd10cm_augmented_billable_hcc

In [10]:
# Name of the pretrained ICD-10-CM sentence entity resolver used in this section.
resolver_model_name = "sbiobertresolve_icd10cm_augmented_billable_hcc"

In [11]:
# Resolver + mapper pipeline: clinical text -> PROBLEM chunks -> ICD-10-CM
# codes -> "ms-drg" mappings.
#
# Changes relative to the previous version of this cell:
#   * the resolver is loaded through `resolver_model_name` (defined in the
#     preceding cell) instead of repeating the model name as a literal, so
#     the two cannot drift apart;
#   * every builder chain is wrapped in parentheses, which removes the
#     dangling line continuation that used to follow
#     `.setOutputCol("resolver2chunk")` and only parsed because a blank line
#     happened to come after it.

# Raw "text" column -> "document" annotation.
document_assembler2 = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Document -> sentences.
sentenceDetectorDL = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", 'clinical/models')
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Sentences -> tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level embeddings consumed by the NER model.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("word_embeddings")
)

clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "word_embeddings"])
    .setOutputCol("ner")
)

# Keep only PROBLEM entities as chunks.
ner_converter_icd = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['PROBLEM'])
    .setPreservePosition(False)
)

# Each NER chunk becomes its own document so it can be embedded on its own.
c2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("doc_ner_chunk")
)

sbert_embedder = (
    BertSentenceEmbeddings.pretrained('sbiobert_base_cased_mli', 'en', 'clinical/models')
    .setInputCols(["doc_ner_chunk"])
    .setOutputCol("sentence_embeddings")
    .setCaseSensitive(False)
)

# Resolve every chunk embedding to an ICD-10-CM code.
icd_resolver = (
    SentenceEntityResolverModel.pretrained(resolver_model_name, "en", "clinical/models")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Resolved code -> chunk -> document, which is the input the mapper consumes.
resolver2chunk = (
    Resolution2Chunk()
    .setInputCols(["icd10cm_code"])
    .setOutputCol("resolver2chunk")
)

chunkToDoc = (
    Chunk2Doc()
    .setInputCols("resolver2chunk")
    .setOutputCol("document2")
)

# Same pretrained mapper as in the mapper-only section, now fed with the
# resolved codes ("document2"). This rebinds the name `chunkMapper`.
chunkMapper = (
    DocMapperModel.pretrained("icd10cm_ms_drg_mapper", "en", "clinical/models")
    .setInputCols(["document2"])
    .setOutputCol("mappings")
    .setRels(["ms-drg"])
)

resolver_pipeline = Pipeline(
    stages=[
        document_assembler2,
        sentenceDetectorDL,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter_icd,
        c2doc,
        sbert_embedder,
        icd_resolver,
        resolver2chunk,
        chunkToDoc,
        chunkMapper,
    ]
)

# Fit on a single empty row to obtain a PipelineModel, then wrap it in a
# LightPipeline for annotating plain strings.
empty_data = spark.createDataFrame([['']]).toDF("text")
model = resolver_pipeline.fit(empty_data)
light_model = LightPipeline(model)



sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icd10cm_augmented_billable_hcc download started this may take some time.
[OK!]
icd10cm_ms_drg_mapper download started this may take some time.
[OK!]


In [12]:
# Sample clinical notes. Each note is wrapped in its own one-element list so
# the outer list can be used directly as rows of a single-column DataFrame.
text_list = [
    [
        """Complaints of unspecified upper abdominal pain and swelling in a 32-year-old woman led to the evaluation of possible disease of intestine. She had a history of mixed irritable bowel syndrome and congenital lactase deficiency. Several diagnostic tests were performed. Blood tests showed abnormal results of blood chemistry and colonoscopy was performed."""
    ],
    [
        """This is a 77-year-old male, who presents with gross hematuria that started this morning. The patient is a difficult historian, does have a unspecified speech disturbances, slow and poor responsiveness to questions, but daughter was able to answer a lot of questions too. He is complaining of no other pain. He denies abdominal pain. Denies hemorrhage. He had a history with similar to this a year ago where it began the same with gross hematuria. He was discharged after a workup in the hospital, in the emergency room, with Levaquin."""
    ],
    [
        """An 86-year-old female with persistent abdominal pain, nausea and projectile vomiting, during evaluation in the emergency room, was found to have a high amylase, as well as lipase count and she is being admitted for management of unspecified gastrointestinal hemorrhage."""
    ],
]

### Display ICD10CM codes

In [13]:
from sparknlp_display import EntityResolverVisualizer

visualiser = EntityResolverVisualizer()

# Render one visualisation per sample note: the 'ner_chunk' annotations
# together with their 'icd10cm_code' resolutions.
# Each entry of `text_list` is a one-element list; only the first item of
# what fullAnnotate returns for it is displayed.
for note_row in text_list:
    light_result = light_model.fullAnnotate(note_row)
    visualiser.display(light_result[0], 'ner_chunk', 'icd10cm_code')
    print("\n\n")

















### Get `icd10cm_ms_drg_mapper` results

In [14]:
# Run the fitted resolver pipeline over the sample notes as a Spark DataFrame.
data_ner = spark.createDataFrame(text_list).toDF("text")
icd10_result = model.transform(data_ner)

# Zip the per-chunk arrays position by position. After the explode, every row
# holds one struct ("cols") whose fields are addressed by position below.
# NOTE(review): this assumes 'mappings.result' carries one entry per NER
# chunk, in the same order — confirm against the mapper output.
zipped_chunks = F.arrays_zip(
    'ner_chunk.result',        # cols['0']
    'ner_chunk.metadata',      # cols['1']
    'icd10cm_code.result',     # cols['2']
    'icd10cm_code.metadata',   # cols['3']
    'ner_chunk.begin',         # cols['4']
    'ner_chunk.end',           # cols['5']
    'mappings.result',         # cols['6']
)

# (SQL expression over the exploded struct, output column name), in output order.
output_columns = [
    ("cols['1']['sentence']", "sent_id"),
    ("cols['4']", "begin"),
    ("cols['5']", "end"),
    ("cols['0']", "ner_chunk"),
    ("cols['1']['entity']", "entity"),
    ("cols['2']", "icd10cm_code"),
    ("cols['3']['resolved_text']", "icd10cm_code_description"),
    ("cols['6']", "mappings"),
]

result_df = (
    icd10_result
    .select(F.explode(zipped_chunks).alias("cols"))
    .select(*[F.expr(expression).alias(name) for expression, name in output_columns])
    .toPandas()
)


In [15]:
# Show up to the first 50 rows of the chunk / code / mapping table.
result_df.head(n=50)

Unnamed: 0,sent_id,begin,end,ner_chunk,entity,icd10cm_code,icd10cm_code_description,mappings
0,0,14,45,unspecified upper abdominal pain,PROBLEM,R10.9,unspecified abdominal pain [unspecified abdominal pain],Unspecified abdominal pain
1,0,51,58,swelling,PROBLEM,R60.9,"swelling [edema, unspecified]","Edema, unspecified"
2,0,117,136,disease of intestine,PROBLEM,K63.9,"disease of intestine [disease of intestine, unspecified]","Disease of intestine, unspecified"
3,1,160,189,mixed irritable bowel syndrome,PROBLEM,K58.2,mixed irritable bowel syndrome [mixed irritable bowel syndrome],Mixed irritable bowel syndrome
4,1,195,223,congenital lactase deficiency,PROBLEM,E73.0,congenital lactase deficiency [congenital lactase deficiency],Congenital lactase deficiency
5,3,286,320,abnormal results of blood chemistry,PROBLEM,R79.9,"blood chemistry abnormal [abnormal finding of blood chemistry, unspecified]","Abnormal finding of blood chemistry, unspecified"
6,0,46,60,gross hematuria,PROBLEM,R31.0,gross hematuria [gross hematuria],Gross hematuria
7,1,137,169,a unspecified speech disturbances,PROBLEM,R47.9,unspecified speech disturbances [unspecified speech disturbances],Unspecified speech disturbances
8,1,172,199,slow and poor responsiveness,PROBLEM,R46.4,slowness and poor responsiveness [slowness and poor responsiveness],Slowness and poor responsiveness
9,2,295,304,other pain,PROBLEM,G89.29,other chronic pain [other chronic pain],Other chronic pain
