In [1]:
import json
import os

from google.colab import files

if 'spark_jsl.json' not in os.listdir():
  license_keys = files.upload()
  os.rename(list(license_keys.keys())[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)
os.environ.update(license_keys)

In [2]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.5.1  spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [3]:
# if you want to start the session with custom params as in start function above
from pyspark.sql import SparkSession

def start(SECRET):
    builder = SparkSession.builder \
        .appName("Spark NLP Licensed") \
        .master("local[*]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:"+PUBLIC_VERSION) \
        .config("spark.jars", "https://pypi.johnsnowlabs.com/"+SECRET+"/spark-nlp-jsl-"+JSL_VERSION+".jar")

    return builder.getOrCreate()

#spark = start(SECRET)

In [4]:
import json
import os

from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

import sparknlp_jsl
import sparknlp

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *


import warnings
warnings.filterwarnings('ignore')

params = {"spark.driver.memory":"16G", # Amount of memory to use for the driver process, i.e. where SparkContext is initialized
          "spark.kryoserializer.buffer.max":"2000M", # Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified.
          "spark.driver.maxResultSize":"2000M"} # Limit of total size of serialized results of all partitions for each Spark action (e.g. collect) in bytes.
                                                # Should be at least 1M, or 0 for unlimited.

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark

Spark NLP Version : 6.0.2
Spark NLP_JSL Version : 6.0.2


In [5]:
clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [6]:
def replace_entities_with_O(input_path, output_path, blacklist):
    """
    CONLL dosyasındaki blacklist'teki entity'leri 'O' ile değiştirir.
    input_path: Orijinal conll dosya yolu
    output_path: Sonuç dosyasının yolu
    blacklist: ['LOCATION', 'ROUTE', ...] gibi entity isimleri listesi
    """
    with open(input_path, "r", encoding="utf-8") as infile, \
         open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            # Boş satır veya başlık satırı ise aynen yaz
            if line.strip() == "" or line.startswith("-DOCSTART-"):
                outfile.write(line)
                continue

            parts = line.strip().split()
            if not parts:
                outfile.write(line)
                continue

            # Son sütun etikettir (B-LOCATION, I-ROUTE, O, vs.)
            label = parts[-1]
            # Sadece entity kısmını al (B-LOCATION -> LOCATION, I-ROUTE -> ROUTE)
            entity = label.split("-")[-1] if "-" in label else label

            if entity in blacklist and label != "O":
                parts[-1] = "O"
            outfile.write(" ".join(parts) + "\n")

In [7]:
replace_entities_with_O(r"/content/conll2003_text_file.conll",r"/content/filter_conll_file.rtf",["LOCATION","DURATION","ID", "FORM","DOSAGE","ROUTE", "FREQUENCY", "STRENGTH", "PROFESSION", "AGE"])

In [8]:
from pyspark.sql import functions as F
from sparknlp.training import CoNLL

data = CoNLL().readDataset(spark, '/content/filter_conll_file.rtf')

(data_train, data_test) = data.randomSplit([0.8, 0.2], seed = 100)
data_test = data_test.coalesce(1).withColumn("idx", F.monotonically_increasing_id())
data_train = data_train.coalesce(1).withColumn("idx", F.monotonically_increasing_id())

In [9]:
clinical_embeddings.transform(data_test).write.parquet('data_test.parquet')

clinical_embeddings.transform(data_train).write.parquet('data_train.parquet')

In [10]:
!pip install numpy==1.23.5
!pip install tensorflow==2.12.0
!pip install tensorflow-addons==0.22.0

Collecting tensorflow==2.12.0
  Using cached tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Using cached gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Using cached keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.12.0)
  Using cached protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12.0)
  Using cached tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Using cached tensorflow_estimator-2.12.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.12.0)
  Using cached wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_

Collecting tensorflow-addons==0.22.0
  Downloading tensorflow_addons-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons==0.22.0)
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading tensorflow_addons-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.3/612.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
  Attempting uninstall: typeguard
    Found existing installation: typeguard 4.4.4
    Uninstalling typeguard-4.4.4:
      Successfully uninstalled typeguard-4.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
inflect 7.5.0 requires ty

In [11]:
from sparknlp_jsl.annotator import TFGraphBuilder

In [12]:
graph_folder_path = "medical_ner_graphs"

ner_graph_builder = TFGraphBuilder()\
    .setModelName("ner_dl")\
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label")\
    .setGraphFolder(graph_folder_path)\
    .setGraphFile("auto")\
    .setHiddenUnitsNumber(24)\
    .setIsLicensed(True) # False -> if you want to use TFGraphBuilder with NerDLApproach


In [13]:

custom_ner_model = MedicalNerApproach()\
      .setInputCols(["sentence", "token", "embeddings"])\
      .setLabelColumn("label")\
      .setOutputCol("ner")\
      .setMaxEpochs(20)\
      .setLr(0.003)\
      .setBatchSize(10)\
      .setRandomSeed(0)\
      .setVerbose(1)\
      .setEvaluationLogExtended(True) \
      .setEnableOutputLogs(True)\
      .setIncludeConfidence(True)\
      .setTestDataset('/content/data_train.parquet')\
      .setGraphFolder(graph_folder_path)\
      .setOutputLogsPath('./ner_logs')\




ner_pipeline = Pipeline(stages=[
      clinical_embeddings,
      ner_graph_builder,
      custom_ner_model
 ])

In [14]:
%%time
ner_model_custom = ner_pipeline.fit(data_train)

TF Graph Builder configuration:
Model name: ner_dl
Graph folder: medical_ner_graphs
Graph file name: auto
Build params: {'ntags': 13, 'embeddings_dim': 200, 'nchars': 78, 'is_medical': True, 'lstm_size': 24}


Instructions for updating:
non-resource variables are not supported in the long term
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


ner_dl graph exported to medical_ner_graphs/blstm_13_200_24_78.pb
CPU times: user 21.6 s, sys: 1.75 s, total: 23.3 s
Wall time: 14min 11s


In [15]:
from sparknlp_jsl.eval import NerDLMetrics
import pyspark.sql.functions as F

pred_df = ner_model_custom.stages[2].transform(clinical_embeddings.transform(data_train))

evaler = NerDLMetrics(mode="full_chunk")

eval_result = evaler.computeMetricsFromDF(pred_df.select("label","ner"), prediction_col="ner", label_col="label", drop_o = True, case_sensitive = True).cache()

eval_result.withColumn("precision", F.round(eval_result["precision"],4))\
           .withColumn("recall", F.round(eval_result["recall"],4))\
           .withColumn("f1", F.round(eval_result["f1"],4)).show(100)

print(eval_result.selectExpr("avg(f1) as macro").show())
print (eval_result.selectExpr("sum(f1*total) as sumprod","sum(total) as sumtotal").selectExpr("sumprod/sumtotal as micro").show())

+---------+-----+----+----+-----+---------+------+------+
|   entity|   tp|  fp|  fn|total|precision|recall|    f1|
+---------+-----+----+----+-----+---------+------+------+
|  PROBLEM|598.0|36.0|11.0|609.0|   0.9432|0.9819|0.9622|
|TREATMENT|348.0|58.0|21.0|369.0|   0.8571|0.9431|0.8981|
|     TEST|166.0| 5.0|28.0|194.0|   0.9708|0.8557|0.9096|
|     DATE|449.0| 0.0| 6.0|455.0|      1.0|0.9868|0.9934|
|     DRUG|187.0| 4.0|31.0|218.0|   0.9791|0.8578|0.9144|
|     NAME| 61.0| 0.0| 5.0| 66.0|      1.0|0.9242|0.9606|
+---------+-----+----+----+-----+---------+------+------+

+------------------+
|             macro|
+------------------+
|0.9397099987399476|
+------------------+

None
+------------------+
|             micro|
+------------------+
|0.9463867785564776|
+------------------+

None
