![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Training Entity Coding Models (SNOMED example)

In [0]:
import os
import json
import string
import numpy as np
import pandas as pd


import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.types import StructType, StructField, StringType

# Set pandas' display limits: column width and column count to 100, and
# turn off wrapping of wide frames across several printed blocks.
for _option_name, _option_value in (
    ('max_colwidth', 100),
    ('display.max_columns', 100),
    ('display.expand_frame_repr', False),
):
    pd.set_option(_option_name, _option_value)


# Report which sparknlp_jsl version this notebook is running against.
jsl_version = sparknlp_jsl.version()
print('sparknlp_jsl.version : ', jsl_version)

# Bare expression so the notebook renders the active Spark session.
spark

## Load datasets

In [0]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/AskAPatient.fold-0.test.txt
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/AskAPatient.fold-0.train.txt
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/AskAPatient.fold-0.validation.txt
  
dbutils.fs.cp("file:/databricks/driver/AskAPatient.fold-0.test.txt", "dbfs:/")
dbutils.fs.cp("file:/databricks/driver/AskAPatient.fold-0.train.txt", "dbfs:/")
dbutils.fs.cp("file:/databricks/driver/AskAPatient.fold-0.validation.txt", "dbfs:/")

In [0]:
import pandas as pd

# Column order of the tab-separated AskAPatient files (they have no header row).
cols = ["conceptId","_term","term"]


def _read_askapatient_split(path):
    """Load one AskAPatient split into a pandas DataFrame.

    Every split is decoded as ISO-8859-1 so that train, test and validation
    text is read the same way; previously only the train split set an
    encoding and the other two fell back to the pandas default.

    Parameters
    ----------
    path : str
        Location of the tab-separated, header-less split file.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns listed in ``cols``; ``conceptId`` is converted
        to ``str`` so the codes are handled as labels rather than numbers.
    """
    frame = pd.read_csv(path, sep="\t", encoding="ISO-8859-1", header=None)
    frame.columns = cols
    frame["conceptId"] = frame.conceptId.apply(str)
    return frame


aap_tr = _read_askapatient_split("file:/databricks/driver/AskAPatient.fold-0.train.txt")
aap_ts = _read_askapatient_split("file:/databricks/driver/AskAPatient.fold-0.test.txt")
aap_vl = _read_askapatient_split("file:/databricks/driver/AskAPatient.fold-0.validation.txt")

In [0]:
# Preview the first rows of the validation split.
aap_vl.head()

Unnamed: 0,conceptId,_term,term
0,267032009,Tired all the time,persisten feeling of tiredness
1,22298006,Myocardial infarction,HEART ATTACK
2,3877011000036101,Lipitor,LIPITOR
3,415690000,Sweating,sweated
4,248491001,Swollen knee,swelling at knee


In [0]:
# Convert the pandas splits (train, test, validation - in that order) into
# Spark DataFrames.
aap_train_sdf, aap_test_sdf, aap_val_sdf = [
    spark.createDataFrame(pandas_split)
    for pandas_split in (aap_tr, aap_ts, aap_vl)
]

# Chunk Entity Resolver (Glove Embeddings)

## Create Training Pipeline

In [0]:
# Turn the raw "term" column into a document annotation.
document = (
    DocumentAssembler()
    .setInputCol("term")
    .setOutputCol("document")
)

# Expose the document as a chunk annotation.
chunk = (
    Doc2Chunk()
    .setInputCols("document")
    .setOutputCol("chunk")
)

# Tokenize the document.
token = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Pretrained token-level embeddings.
embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Chunk-level embeddings built from the chunk and token embedding columns.
chunk_emb = (
    ChunkEmbeddings()
    .setInputCols("chunk", "embeddings")
    .setOutputCol("chunk_embeddings")
)

training_stages = [document, chunk, token, embeddings, chunk_emb]
snomed_training_pipeline = Pipeline(stages=training_stages)

# Fit on the training split, then cache the transformed training data that
# the resolver is fitted on in the following cells.
snomed_training_model = snomed_training_pipeline.fit(aap_train_sdf)
snomed_data = snomed_training_model.transform(aap_train_sdf)
snomed_data = snomed_data.cache()


In [0]:
# Trainable chunk resolver: "_term" is the normalized text column and
# "conceptId" the label column. Six distance measures are switched on and
# setDistanceWeights receives six weights.
# NOTE(review): the mapping of each weight to a distance measure is assumed to
# follow the library's documented order - confirm against the API docs.
snomed_extractor = (
    ChunkEntityResolverApproach()
    .setInputCols("token", "chunk_embeddings")
    .setOutputCol("recognized")
    .setNeighbours(1000)
    .setAlternatives(25)
    .setNormalizedCol("_term")
    .setLabelCol("conceptId")
    .setEnableWmd(True)
    .setEnableTfidf(True)
    .setEnableJaccard(True)
    .setEnableSorensenDice(True)
    .setEnableJaroWinkler(True)
    .setEnableLevenshtein(True)
    .setDistanceWeights([1, 2, 2, 1, 1, 1])
    .setAllDistancesMetadata(True)
    .setPoolingStrategy("MAX")
    .setThreshold(1e32)
)

In [0]:
#%time 
# Fit the chunk resolver on the cached, transformed training data.
model = snomed_extractor.fit(snomed_data)

## Prediction Pipeline

In [0]:
# Chain the fitted feature pipeline with the fitted resolver so raw terms can
# be resolved in a single transform call.
prediction_Model = PipelineModel(stages=[snomed_training_model, model])

In [0]:
# Run the prediction pipeline over each split (train, test, validation) and
# cache the results for the inspection cells below.
aap_train_pred, aap_test_pred, aap_val_pred = [
    prediction_Model.transform(split_sdf).cache()
    for split_sdf in (aap_train_sdf, aap_test_sdf, aap_val_sdf)
]

In [0]:
# Test split: gold code and texts next to the first resolution, its resolved
# text and the list of alternative resolutions.
test_preview_exprs = [
    "conceptId",
    "term",
    "_term",
    "recognized[0].result",
    "recognized[0].metadata.resolved_text",
    "recognized[0].metadata.all_k_resolutions",
]
aap_test_pred.selectExpr(*test_preview_exprs).show(truncate=50)

In [0]:
# Validation split: same preview as for the test split. This cell used to
# repeat the test-split query verbatim, so the validation predictions were
# computed but never inspected.
aap_val_pred.selectExpr("conceptId","term","_term","recognized[0].result","recognized[0].metadata.resolved_text","recognized[0].metadata.all_k_resolutions").show(truncate=50)

## Train using the entire dataset

In [0]:
all_data = aap_train_sdf.union(aap_test_sdf).union(aap_val_sdf)

snomed_training_model = snomed_training_pipeline.fit(all_data)

snomed_data = snomed_training_model.transform(all_data).cache()

%time model = snomed_extractor.fit(snomed_data)

In [0]:
! cd /databricks/driver/ && mkdir models

In [0]:
# Persist the fitted chunk resolver to DBFS, replacing any previous copy.
chunk_resolver_writer = model.write().overwrite()
chunk_resolver_writer.save("dbfs:/databricks/driver/models/chunkresolve_snomed_askapatient_hc_100d")

In [0]:
%sh cd /databricks/driver/models  && ls -la


## Prediction on random texts

In [0]:
# Wrap the raw "term" text as a document annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("term")
    .setOutputCol("document")
)

# Sentence Detector annotator, processes various sentences per line;
# a comma is registered as an additional custom bound.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
    .setCustomBounds([","])
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("raw_token")
)

# Stop words are removed from the raw tokens; the cleaned output is the
# "token" column consumed by the downstream stages.
stopwords = (
    StopWordsCleaner()
    .setInputCols(["raw_token"])
    .setOutputCol("token")
)

# Same pretrained embeddings that were used when training the resolver.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_healthcare_100d", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Pretrained NER model that tags the tokens.
clinical_ner = (
    MedicalNerModel.pretrained("ner_healthcare", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Merge NER tags into chunks, keeping only PROBLEM and TEST entities.
snomed_ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("greedy_chunk")
    .setWhiteList(['PROBLEM','TEST'])
)

# Chunk-level embeddings for the extracted entities.
chunk_embeddings = (
    ChunkEmbeddings()
    .setInputCols('greedy_chunk', 'embeddings')
    .setOutputCol('chunk_embeddings')
)

# Load the resolver saved earlier in this notebook and wire it to this
# pipeline's columns.
snomed_resolver = (
    ChunkEntityResolverModel.load("dbfs:/databricks/driver/models/chunkresolve_snomed_askapatient_hc_100d")
    .setInputCols("token","chunk_embeddings")
    .setOutputCol("snomed_resolution")
)

inference_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    stopwords,
    word_embeddings,
    clinical_ner,
    snomed_ner_converter,
    chunk_embeddings,
    snomed_resolver,
]
pipeline_snomed = Pipeline(stages=inference_stages)

# Every stage here is pretrained or needs no training data, so the pipeline
# is fitted on a one-row frame holding an empty string.
empty_data = spark.createDataFrame([['']]).toDF("term")

model_snomed = pipeline_snomed.fit(empty_data)


In [0]:
# Wrap the fitted pipeline in a LightPipeline so plain Python strings can be
# annotated directly (see the annotate() calls below).
model_snomed_lp = LightPipeline(model_snomed)


In [0]:
# Resolve a free-text sentence and pair each detected chunk with its code.
sample_text = 'I have a biceps muscle pain and extreme muscle pain in shoulders'
result = model_snomed_lp.annotate(sample_text)

detected_chunks = result['greedy_chunk']
resolved_codes = result['snomed_resolution']
list(zip(detected_chunks, resolved_codes))

In [0]:
# Second example sentence: pair each detected chunk with its resolved code.
sample_text = 'I have a flu and a headache'
result = model_snomed_lp.annotate(sample_text)

detected_chunks = result['greedy_chunk']
resolved_codes = result['snomed_resolution']
list(zip(detected_chunks, resolved_codes))

In [0]:
from pyspark.sql import functions as F

# Run the full (non-light) pipeline on a one-row DataFrame.
sample_df = spark.createDataFrame(
    [['I have a biceps muscle pain and extreme muscle pain in shoulders']]
).toDF("term")
snomed_output = model_snomed.transform(sample_df)

# Zip chunk and resolution arrays element-wise, then emit one row per chunk.
zipped_annotations = F.arrays_zip(
    "greedy_chunk.result",
    "greedy_chunk.metadata",
    "snomed_resolution.result",
    "snomed_resolution.metadata",
)
per_chunk = snomed_output.select(F.explode(zipped_annotations).alias("snomed_result"))

# Positional fields: '0' chunk text, '1' chunk metadata, '2' resolved code,
# '3' resolution metadata.
per_chunk.select(
    F.expr("snomed_result['0']").alias("chunk"),
    F.expr("snomed_result['1'].entity").alias("entity"),
    F.expr("snomed_result['3'].all_k_resolutions").alias("target_text"),
    F.expr("snomed_result['2']").alias("code"),
    F.expr("snomed_result['3'].confidence").alias("confidence"),
).show(truncate = 100)

# Sentence Entity Resolver (BioBert sentence embeddings) (after v2.7)

In [0]:
# Print the first rows of the Spark training split.
aap_train_sdf.show()

In [0]:
# Print the column names and types of the Spark training split.
aap_train_sdf.printSchema()

In [0]:
# For the sentence resolver the normalized "_term" text is embedded directly,
# so it is assembled straight into the "sentence" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("_term")
    .setOutputCol("sentence")
)

# Pretrained sentence-level BERT embeddings.
bert_embeddings = (
    BertSentenceEmbeddings.pretrained("sent_biobert_pubmed_base_cased")
    .setInputCols(["sentence"])
    .setOutputCol("bert_embeddings")
)

bert_stages = [documentAssembler, bert_embeddings]
snomed_training_pipeline = Pipeline(stages=bert_stages)

# Fit on the training split and compute the features the resolver trains on.
snomed_training_model = snomed_training_pipeline.fit(aap_train_sdf)
snomed_data = snomed_training_model.transform(aap_train_sdf)


In [0]:
bertExtractor = SentenceEntityResolverApproach()\
    .setNeighbours(25)\
    .setThreshold(1000)\
    .setInputCols("bert_embeddings")\
    .setNormalizedCol("_term")\
    .setLabelCol("conceptId")\
    .setOutputCol('snomed_code')\
    .setDistanceFunction("EUCLIDIAN")\
    .setCaseSensitive(False)

%time snomed_model = bertExtractor.fit(snomed_data)

In [0]:
# save if you will need that later
# Persist the sentence resolver fitted in the previous cell (`snomed_model`).
# `model` still refers to the earlier chunk resolver, so saving it here would
# store the wrong model under the BioBERT resolver path.
snomed_model.write().overwrite().save("dbfs:/databricks/driver/models/biobertresolve_snomed_askapatient")

In [0]:
# Chain the fitted BERT feature pipeline with the fitted sentence resolver.
sentence_resolver_stages = [snomed_training_model, snomed_model]
prediction_Model = PipelineModel(stages=sentence_resolver_stages)

# Predict on each split (train, test, validation) and cache the results.
aap_train_pred, aap_test_pred, aap_val_pred = [
    prediction_Model.transform(split_sdf).cache()
    for split_sdf in (aap_train_sdf, aap_test_sdf, aap_val_sdf)
]

In [0]:
# Test split: gold code and texts next to the first resolved code, its
# resolved text and the list of alternative resolutions.
bert_test_exprs = [
    "conceptId",
    "term",
    "_term",
    "snomed_code[0].result",
    "snomed_code[0].metadata.resolved_text",
    "snomed_code[0].metadata.all_k_resolutions",
]
aap_test_pred.selectExpr(*bert_test_exprs).show(truncate=50)

In [0]:
# Validation split: gold code and texts next to the first resolved code, its
# resolved text and the list of alternative resolutions.
bert_val_exprs = [
    "conceptId",
    "term",
    "_term",
    "snomed_code[0].result",
    "snomed_code[0].metadata.resolved_text",
    "snomed_code[0].metadata.all_k_resolutions",
]
aap_val_pred.selectExpr(*bert_val_exprs).show(truncate=50)

End of Notebook # 13