In [1]:
import json
import os

from google.colab import files

if 'spark_jsl.json' not in os.listdir():
  license_keys = files.upload()
  os.rename(list(license_keys.keys())[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)
os.environ.update(license_keys)

In [2]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.5.1  spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.9/709.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m550.2/550.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# if you want to start the session with custom params as in start function above
from pyspark.sql import SparkSession

def start(SECRET):
    builder = SparkSession.builder \
        .appName("Spark NLP Licensed") \
        .master("local[*]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:"+PUBLIC_VERSION) \
        .config("spark.jars", "https://pypi.johnsnowlabs.com/"+SECRET+"/spark-nlp-jsl-"+JSL_VERSION+".jar")

    return builder.getOrCreate()

#spark = start(SECRET)

In [4]:
import json
import os

from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

import sparknlp_jsl
import sparknlp

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *


import warnings
warnings.filterwarnings('ignore')

params = {"spark.driver.memory":"16G", # Amount of memory to use for the driver process, i.e. where SparkContext is initialized
          "spark.kryoserializer.buffer.max":"2000M", # Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified.
          "spark.driver.maxResultSize":"2000M"} # Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes.
                                                # Should be at least 1M, or 0 for unlimited.

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark

Spark NLP Version : 6.0.2
Spark NLP_JSL Version : 6.0.2


In [5]:
clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [6]:
from sparknlp.training import CoNLL

training_data = CoNLL().readDataset(spark, '/content/conll2003_text_file.conll')

training_data.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sample Type / Med...|[{document, 0, 26...|[{document, 0, 26...|[{token, 0, 5, Sa...|[{pos, 0, 5, NN, ...|[{named_entity, 0...|
|( Medical Transcr...|[{document, 0, 76...|[{document, 0, 76...|[{token, 0, 0, (,...|[{pos, 0, 0, NN, ...|[{named_entity, 0...|
|SECONDARY DIAGNOS...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 8, SE...|[{pos, 0, 8, NN, ...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [7]:
# NerConverter ile named_entity'yi chunk'a çevir
ner_converter = NerConverterInternal() \
    .setInputCols(["sentence", "token", "label"]) \
    .setOutputCol("ner_chunk")\
    .setBlackList(["DURATION","ROUTE","FREQUENCY","STRENGTH","PROFESSION","LOCATION","AGE"])

pipeline = Pipeline(stages=[ner_converter])

result = pipeline.fit(training_data).transform(training_data)


In [8]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|           ner_chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sample Type / Med...|[{document, 0, 26...|[{document, 0, 26...|[{token, 0, 5, Sa...|[{pos, 0, 5, NN, ...|[{named_entity, 0...|[{chunk, 121, 131...|
|( Medical Transcr...|[{document, 0, 76...|[{document, 0, 76...|[{token, 0, 0, (,...|[{pos, 0, 0, NN, ...|[{named_entity, 0...|[{chunk, 62, 72, ...|
|SECONDARY DIAGNOS...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 8, SE...|[{pos, 0, 8, NN, ...|[{named_entity, 0...|[{chunk, 22, 36, ...|
|          PROCEDURES|[{document, 0, 9,...|[{document, 0, 9,...|[{token, 0, 9, PR...|[{pos, 0, 9, NN, ...|

In [9]:
iobTagger = IOBTagger()\
  .setInputCols(["token", "ner_chunk"])\
  .setOutputCol("ner_label")

pipeline = Pipeline(stages=[iobTagger])


result_iob_tagger = pipeline.fit(result).transform(result)

In [10]:
result_iob_tagger.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|           ner_chunk|           ner_label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sample Type / Med...|[{document, 0, 26...|[{document, 0, 26...|[{token, 0, 5, Sa...|[{pos, 0, 5, NN, ...|[{named_entity, 0...|[{chunk, 121, 131...|[{named_entity, 0...|
|( Medical Transcr...|[{document, 0, 76...|[{document, 0, 76...|[{token, 0, 0, (,...|[{pos, 0, 0, NN, ...|[{named_entity, 0...|[{chunk, 62, 72, ...|[{named_entity, 0...|
|SECONDARY DIAGNOS...|[{document, 0, 14...|[{document, 0, 14...|[{token, 0, 8, SE...|[{pos, 0, 8, NN, ...|[{named_entity, 0...|[{chunk, 22, 36, ...|[{

In [11]:
result_iob_tagger.selectExpr("explode(ner_chunk) as a") \
  .selectExpr("a.begin",
              "a.end",
              "a.result as ner_chunk",
              "a.metadata.entity as ner_label").show(50, False)

+-----+---+----------------------------------+---------+
|begin|end|ner_chunk                         |ner_label|
+-----+---+----------------------------------+---------+
|121  |131|Mesotheliom                       |PROBLEM  |
|137  |151|pleural effusio                   |PROBLEM  |
|157  |174|atrial fibrillatio                |PROBLEM  |
|180  |184|anemi                             |PROBLEM  |
|190  |195|ascite                            |PROBLEM  |
|201  |216|esophageal reflu                  |PROBLEM  |
|237  |257|deep venous thrombosi             |PROBLEM  |
|62   |72 |Mesotheliom                       |PROBLEM  |
|22   |36 |Pleural effusio                   |PROBLEM  |
|42   |59 |atrial fibrillatio                |PROBLEM  |
|65   |69 |anemi                             |PROBLEM  |
|75   |80 |ascite                            |PROBLEM  |
|86   |101|esophageal reflu                  |PROBLEM  |
|122  |142|deep venous thrombosi             |PROBLEM  |
|7    |21 |August 24 , 200     

In [12]:
row_count = result_iob_tagger.count()

In [13]:
result_iob_tagger.selectExpr("explode(ner_label) as a") \
  .selectExpr("a.begin",
              "a.end",
              "a.metadata.word as word",
              "a.result as chunk").show(row_count, truncate=False)

+-----+---+--------------+-----------+
|begin|end|word          |chunk      |
+-----+---+--------------+-----------+
|0    |5  |Sample        |O          |
|7    |10 |Type          |O          |
|12   |12 |/             |O          |
|14   |20 |Medical       |O          |
|22   |30 |Specialty     |O          |
|32   |32 |:             |O          |
|34   |43 |Hematology    |O          |
|45   |45 |-             |O          |
|47   |54 |Oncology      |O          |
|56   |61 |Sample        |O          |
|63   |66 |Name          |O          |
|68   |68 |:             |O          |
|70   |78 |Discharge     |O          |
|80   |86 |Summary       |O          |
|88   |88 |-             |O          |
|90   |101|Mesothelioma  |O          |
|103  |103|-             |O          |
|105  |105|1             |O          |
|107  |117|Description   |O          |
|119  |119|:             |O          |
|121  |131|Mesotheliom   |B-PROBLEM  |
|133  |133|a             |O          |
|135  |135|,             

In [14]:
(training_data, test_data) = result_iob_tagger.randomSplit([0.8, 0.2], seed = 100)

# save the test data as parquet for easy testing
clinical_embeddings.transform(training_data).write.parquet('training_data.parquet')

clinical_embeddings.transform(test_data).write.parquet('test_data.parquet')

In [15]:
!pip install numpy==1.23.5
!pip install tensorflow==2.12.0
!pip install tensorflow-addons==0.22.0

Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.12.0)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12.0)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.12.0)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.

Collecting tensorflow-addons==0.22.0
  Downloading tensorflow_addons-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons==0.22.0)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading tensorflow_addons-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.3/612.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.4.4
    Uninstalling typeguard-4.4.4:
      Successfully uninstalled typeguard-4.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
inflect 7.5.0 requires typ

In [16]:
from sparknlp_jsl.annotator import TFGraphBuilder

In [17]:
graph_folder_path = "medical_ner_graphs"

ner_graph_builder = TFGraphBuilder()\
    .setModelName("ner_dl")\
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label")\
    .setGraphFolder(graph_folder_path)\
    .setGraphFile("auto")\
    .setHiddenUnitsNumber(24)\
    .setIsLicensed(True) # False -> if you want to use TFGraphBuilder with NerDLApproach


In [18]:

custom_ner_model = MedicalNerApproach()\
      .setInputCols(["sentence", "token", "embeddings"])\
      .setLabelColumn("ner_label")\
      .setOutputCol("ner")\
      .setMaxEpochs(20)\
      .setLr(0.003)\
      .setBatchSize(10)\
      .setRandomSeed(0)\
      .setVerbose(1)\
      .setEvaluationLogExtended(True) \
      .setEnableOutputLogs(True)\
      .setIncludeConfidence(True)\
      .setTestDataset('/content/test_data.parquet')\
      .setGraphFolder(graph_folder_path)\
      .setOutputLogsPath('./ner_logs')\




ner_pipeline = Pipeline(stages=[
      clinical_embeddings,
      ner_graph_builder,
      custom_ner_model
 ])

In [19]:
%%time
ner_model_custom = ner_pipeline.fit(training_data)

TF Graph Builder configuration:
Model name: ner_dl
Graph folder: medical_ner_graphs
Graph file name: auto
Build params: {'ntags': 31, 'embeddings_dim': 200, 'nchars': 78, 'is_medical': True, 'lstm_size': 24}


Instructions for updating:
non-resource variables are not supported in the long term
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


ner_dl graph exported to medical_ner_graphs/blstm_31_200_24_78.pb
CPU times: user 20 s, sys: 1.64 s, total: 21.6 s
Wall time: 8min 29s


In [20]:
from sparknlp_jsl.eval import NerDLMetrics
import pyspark.sql.functions as F

pred_df = ner_model_custom.stages[2].transform(clinical_embeddings.transform(training_data))

evaler = NerDLMetrics(mode="full_chunk")

eval_result = evaler.computeMetricsFromDF(
    pred_df.select("label", "ner"),
    prediction_col="ner",
    label_col="label",
    drop_o=True,
    case_sensitive=True
).cache()

# Sıfır f1 skoru olan entity'leri filtrele
filtered_eval_result = eval_result.filter(eval_result.f1 > 0.00001)

filtered_eval_result = filtered_eval_result.withColumn("precision", F.round(filtered_eval_result["precision"], 4)) \
                                           .withColumn("recall", F.round(filtered_eval_result["recall"], 4)) \
                                           .withColumn("f1", F.round(filtered_eval_result["f1"], 4))

filtered_eval_result.show(100)

print(filtered_eval_result.selectExpr("avg(f1) as macro").show())
print(filtered_eval_result.selectExpr("sum(f1*total) as sumprod", "sum(total) as sumtotal").selectExpr("sumprod/sumtotal as micro").show())

+---------+-----+----+----+-----+---------+------+------+
|   entity|   tp|  fp|  fn|total|precision|recall|    f1|
+---------+-----+----+----+-----+---------+------+------+
|     NAME| 61.0| 0.0| 5.0| 66.0|      1.0|0.9242|0.9606|
|  PROBLEM|599.0|34.0|10.0|609.0|   0.9463|0.9836|0.9646|
|     DATE|454.0| 2.0| 1.0|455.0|   0.9956|0.9978|0.9967|
|     DRUG|191.0| 2.0|27.0|218.0|   0.9896|0.8761|0.9294|
|TREATMENT|357.0|58.0|12.0|369.0|   0.8602|0.9675|0.9107|
|     TEST|163.0| 5.0|31.0|194.0|   0.9702|0.8402|0.9006|
+---------+-----+----+----+-----+---------+------+------+

+------------------+
|             macro|
+------------------+
|0.9437666666666668|
+------------------+

None
+------------------+
|             micro|
+------------------+
|0.9511844060701203|
+------------------+

None
