In [None]:
#Please make sure you have SparkNLP 2.4.1 and SparkNLP Enterprise 2.4.1

In [3]:
import os
import sys, time

# Make local source checkouts importable before the sparknlp imports below.
# The two locations can be overridden with the SPARK_NLP_PYTHON_PATH and
# SPARK_NLP_JSL_PYTHON_PATH environment variables; when they are not set the
# original developer-machine paths are used, so existing setups keep working.
# NOTE(review): presumably these directories provide `sparknlp` and
# `sparknlp_jsl` respectively - confirm against the repositories.
sys.path.append(os.environ.get("SPARK_NLP_PYTHON_PATH", "/home/fernandrez/JSL/repos/spark-nlp/python"))
sys.path.append(os.environ.get("SPARK_NLP_JSL_PYTHON_PATH", "/home/fernandrez/JSL/repos/spark-nlp-internal/python"))

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader
import pyspark.sql.functions as F
#from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import StringIndexerModel
from pyspark.ml.classification import OneVsRestModel

In [5]:
# Load the SNOMED sample (CSV with a header row) and lower-case the `term`
# column, which is the text fed to the DocumentAssembler further down.
# NOTE(review): `spark` is not created in any visible cell (execution count 4
# is missing) - a Spark session must already exist before this cell runs.
concepts = (
    spark.read.format("csv")
    .option("header", "true")
    .load("../../../../data/resolution/snomed_sample.csv")
    .withColumn("term", F.expr("lower(term)"))
)

In [6]:
# Characters handed to Tokenizer.setSplitChars in the next cell.
tokenizer_chars = [
    "'", ",", "/", " ", ".", "|",
    "@", "#", "%", "&", "$",
    "[", "]", "(", ")",
    "-", ";", "=",
]

In [7]:
# Wrap the lower-cased `term` column as a `document` annotation.
docAssembler = (
    DocumentAssembler()
    .setInputCol("term")
    .setOutputCol("document")
)

# Tokenize each document, additionally splitting on `tokenizer_chars`.
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
    .setSplitChars(tokenizer_chars)
)

# Fit the two-stage document -> token pipeline on the concepts table.
pipelineModel = Pipeline().setStages([docAssembler, tokenizer]).fit(concepts)

In [8]:
# Cumulative n-grams built from `token`, with the words joined by "_".
ngrammer = (
    NGramGenerator()
    .setInputCols(["token"])
    .setOutputCol("ngram")
    .setEnableCumulative(True)
    .setDelimiter("_")
)

# Re-expose the n-gram output under `ngram_token`, the column the first
# resolver layer reads as its input.
ngramToken = Chunk2Token().setInputCols("ngram").setOutputCol("ngram_token")

# Neither stage needs fitting, so they are wrapped directly in a PipelineModel.
pipelineNgrams = PipelineModel([ngrammer, ngramToken])

In [9]:
# Pretrained clinical word embeddings; the download reported in the cell
# output below is roughly 10.9 GB.
embeddingsModel = (
    WordEmbeddingsModel.pretrained("embeddings_icdoem_2ng", "en", "clinical/models")
    .setInputCols("document", "token")
    .setOutputCol("embeddings")
)

embeddings_icdoem_2ng download started this may take some time.
Approximate size to download 10.9 GB
[OK!]


In [10]:
# Turn the `document` annotation into a `chunk` annotation.
doc2Chunk = (
    Doc2Chunk()
    .setInputCols("document")
    .setOutputCol("chunk")
)

# Combine `chunk` and the word `embeddings` into `chunk_embeddings`.
chunkEmbeddings = (
    ChunkEmbeddings()
    .setInputCols("chunk", "embeddings")
    .setOutputCol("chunk_embeddings")
)

pipelineChunkEmbeddings = PipelineModel(stages=[doc2Chunk, chunkEmbeddings])

In [11]:
# Chain tokenization, n-grams, word embeddings and chunk embeddings into a
# single model and apply it to the concepts table.
embedding_stages = [pipelineModel, pipelineNgrams, embeddingsModel, pipelineChunkEmbeddings]
concepts_embedded = PipelineModel(embedding_stages).transform(concepts)

In [12]:
# Persist the embedded concepts, replacing any previous copy at this path.
(
    concepts_embedded.write
    .mode("overwrite")
    .save("data/concepts_embedded")
)

In [13]:
# From here on, work with the copy that the previous cell saved to disk.
concepts_embedded = (
    spark.read
    .load("data/concepts_embedded")
)

In [14]:
# Embeddings coverage: flag every token embedding whose `isOOV` metadata is
# 'false' as covered, average the flag per concept, then average across concepts.
token_coverage = (
    concepts_embedded
    .selectExpr("conceptId", "explode(embeddings) as embs")
    .selectExpr("conceptId", "case when embs.metadata.isOOV=='false' then 1 else 0 end as coverage")
)
concept_coverage = (
    token_coverage
    .groupby("conceptId")
    .agg(F.expr("avg(coverage) as cov"))
    .orderBy("cov")
)
concept_coverage.toPandas()["cov"].mean()

0.8656463292885626

In [15]:
# Count how often each token string occurs across all concept terms.
word_distribution = (
    concepts_embedded
    .selectExpr("explode(token.result) as word")
    .groupby("word")
    .count()
)

# Show the 100 most frequent words. The sort must be descending: an ascending
# sort would list the rarest words first, which contradicts the output saved
# with this cell (counts starting at 1680 and decreasing).
word_distribution.orderBy("count", ascending=False).show(100, False)

+-----------+-----+
|word       |count|
+-----------+-----+
|(          |1680 |
|of         |1518 |
|disorder   |409  |
|procedure  |337  |
|structure  |325  |
|finding    |222  |
|body       |209  |
|product    |202  |
|to         |181  |
|entire     |154  |
|organism   |146  |
|with       |141  |
|in         |141  |
|substance  |138  |
|and        |130  |
|-          |120  |
|by         |112  |
|1          |102  |
|containing |100  |
|form       |98   |
|left       |81   |
|2          |76   |
|only       |73   |
|right      |71   |
|dose       |68   |
|neoplasm   |67   |
|for        |67   |
|on         |67   |
|measurement|64   |
|:          |63   |
|artery     |60   |
|joint      |59   |
|value      |56   |
|qualifier  |56   |
|or         |53   |
|0          |52   |
|5          |52   |
|due        |50   |
|medicinal  |49   |
|skin       |47   |
|blood      |47   |
|antibody   |46   |
|virus      |46   |
|oral       |44   |
|3          |44   |
|tissue     |43   |
|poisoning  |43   |


In [16]:
# One row per distinct token string, so this is the vocabulary size.
distinct_word_count = word_distribution.count()
distinct_word_count

5316

In [17]:
#Currently working on making the first layer available using the Pretrained framework -> 2.4.2
#model_idx = ResourceDownloader.downloadModel("StringIndexerModel", "resolve_snomed_l1_idx_icdoem_2ng", "en", "clinical/models")
#model_tfidf = ResourceDownloader.downloadPipeline("resolve_snomed_l1_tfidf_icdoem_2ng", "en", "clinical/models")
#model_ovrlrc = ResourceDownloader.downloadModel("OneVsRestModel", "resolve_snomed_l1_ovrlrc_icdoem_2ng", "en", "clinical/models")

In [19]:
# Locally stored label indexer; its `labels` are passed to the first layer.
sidx = StringIndexerModel.load(
    "_models/snomed_indexer"
)

In [20]:
# First layer: classifier over `ngram_token` that writes its result to
# `partition`. The vectorization and classification models are read from
# local paths and the label set comes from the string indexer.
# setMergeChunks(False) is applied to the object returned by fit().
layer1 = (
    DocumentLogRegClassifierApproach()
    .setInputCols("ngram_token")
    .setOutputCol("partition")
    .setLabels(sidx.labels)
    .setVectorizationModelPath("_models/snomed_tfidfer")
    .setClassificationModelPath("_models/snomed_ovrlrc")
    .fit(concepts_embedded)
    .setMergeChunks(False)
)

In [21]:
# The second layer is published through the Pretrained framework: download
# the pipeline first, then wrap it in a RecursivePipelineModel.
layer_2_pretrained = ResourceDownloader.downloadPipeline("resolve_snomed_l2_icdoem_2ng", "en", "clinical/models")
layer_2 = RecursivePipelineModel(layer_2_pretrained)

resolve_snomed_l2_icdoem_2ng download started this may take some time.
Approx size to download 1.1 GB
[OK!]


In [22]:
# Stack both resolver layers and fit them as a single pipeline.
fullPipeline = Pipeline(stages=[layer1, layer_2]).fit(concepts_embedded)

In [23]:
# Wall-clock reference point; the final cell subtracts it from time.time()
# to report the elapsed minutes.
start = time.time()
# Apply both resolver layers to the embedded concepts.
# NOTE(review): Spark transformations are normally lazy, so most of the work
# is probably triggered by the actions in later cells - confirm what the
# timer is meant to cover.
transformed_full = fullPipeline.transform(concepts_embedded)

In [24]:
# Keep the first layer-1 result as the predicted top-level term; cached
# because it is reused several times below.
predicted = transformed_full.withColumn("prediction", F.expr("partition.result[0]")).cache()

# 1 when the predicted top-level term equals the true one, else 0.
hit_flag = "case when prediction==topTerm then 1 else 0 end"

# Recall and support, grouped by the true label.
recall_by_term = (
    predicted.withColumn("ok", F.expr(hit_flag))
    .groupby("topTerm")
    .agg(F.expr("avg(ok) as recall"), F.expr("count(ok) as tr_cnt"))
)

# Precision, grouped by the predicted label.
precision_by_prediction = (
    predicted.withColumn("ok", F.expr(hit_flag))
    .groupby("prediction")
    .agg(F.expr("avg(ok) as precision"))
)

# Default (inner) join: labels that are never predicted do not appear.
metrics = (
    recall_by_term
    .join(precision_by_prediction, F.col("topTerm") == F.col("prediction"))
    .withColumn("f1", F.expr("2*precision*recall/(precision+recall)"))
    .orderBy("f1")
    .selectExpr(
        "topTerm",
        "tr_cnt",
        "round(precision,3) as train_precision",
        "round(recall,3) as train_recall",
        "round(f1, 3) as train_f1",
    )
)

In [25]:
# Up to 100 rows, without truncating long top-level term names.
metrics.show(n=100, truncate=False)

+-------------------------------------------------------------+------+---------------+------------+--------+
|topTerm                                                      |tr_cnt|train_precision|train_recall|train_f1|
+-------------------------------------------------------------+------+---------------+------------+--------+
|General clinical state finding (finding)                     |5     |0.8            |0.8         |0.8     |
|Wound finding (finding)                                      |9     |0.8            |0.889       |0.842   |
|Disease (disorder)                                           |325   |0.843          |0.895       |0.869   |
|Finding by method (finding)                                  |20    |0.857          |0.9         |0.878   |
|SNOMED CT Model Component (metadata)                         |14    |0.867          |0.929       |0.897   |
|Finding by site (finding)                                    |722   |0.884          |0.911       |0.898   |
|Clinical history a

In [26]:
# Strip the first and last character of the `all_k_results` metadata string
# and split what is left into an array stored as `resolution`.
# NOTE(review): presumably this yields the ranked candidate codes - confirm
# against the actual `all_k_results` format.
resolution_col = F.expr("split(substring(snomed_code.metadata[0]['all_k_results'],2,length(snomed_code.metadata[0]['all_k_results'])-2),'\\\\],\\\\[')")
with_alternatives = predicted.withColumn("resolution", resolution_col)

In [27]:
# `good`: the first resolved code equals the true conceptId.
evaled = with_alternatives.withColumn(
    "good", F.expr("case when conceptId=snomed_code.result[0] then 1 else 0 end")
)

# `hatK`: the true conceptId appears among the first K entries of `resolution`.
for top_k in (5, 10, 20, 30, 500):
    evaled = evaled.withColumn(
        "hat{}".format(top_k),
        F.expr("case when array_contains(slice(resolution, 1, {}), conceptId) then 1 else 0 end".format(top_k)),
    )

In [28]:
# Per top-level term: mean of each hit flag plus the number of concepts,
# largest groups first, with the rates rounded to two decimals.
hit_columns = ["good", "hat5", "hat10", "hat20", "hat30", "hat500"]

per_term_aggs = [F.mean(name) for name in hit_columns] + [F.count("good")]
per_term = (
    evaled
    .groupby("topTerm")
    .agg(*per_term_aggs)
    .orderBy("count(good)", ascending=False)
)

report_exprs = (
    ["topTerm"]
    + ["round(`avg({0})`, 2) as {0}".format(name) for name in hit_columns]
    + ["`count(good)` as total"]
)
per_term.selectExpr(*report_exprs).show(100, False)

+-------------------------------------------------------------+----+----+-----+-----+-----+------+-----+
|topTerm                                                      |good|hat5|hat10|hat20|hat30|hat500|total|
+-------------------------------------------------------------+----+----+-----+-----+-----+------+-----+
|Procedure (procedure)                                        |0.94|0.95|0.95 |0.96 |0.96 |0.96  |738  |
|Finding by site (finding)                                    |0.88|0.9 |0.9  |0.9  |0.9  |0.9   |722  |
|Body structure (body structure)                              |0.91|0.93|0.93 |0.93 |0.93 |0.93  |489  |
|Organism (organism)                                          |0.5 |0.57|0.61 |0.66 |0.67 |0.67  |345  |
|Disease (disorder)                                           |0.85|0.87|0.87 |0.88 |0.88 |0.88  |325  |
|Substance (substance)                                        |0.68|0.78|0.79 |0.79 |0.8  |0.8   |307  |
|Clinical history and observation findings (finding)   

In [29]:
# Minutes elapsed since `start` was recorded before the full transform.
elapsed_minutes = (time.time() - start) / 60
print(round(elapsed_minutes, 2), "minutes")

7.69 minutes
