![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

In [1]:
import json
import os
# Path of the JSON file that holds the Spark NLP for Healthcare license entries.
license_key = "/home/ubuntu/damla/keys/4.1.0.spark_nlp_for_healthcare.json"

# Parse the license file into a plain dict of key/value pairs.
with open(license_key) as key_file:
    license_keys = json.load(key_file)

# Publish every license entry as a variable in the current namespace ...
locals().update(license_keys)

# ... and as a process environment variable.
os.environ.update(license_keys)

In [2]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

from sparknlp_display import NerVisualizer

import warnings
warnings.filterwarnings('ignore')

# Spark configuration overrides handed to the session start call below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the installed library versions before the session is started.
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Start the session using the SECRET entry read from the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Last expression of the cell: show the session object.
spark

Spark NLP Version : 4.1.0
Spark NLP_JSL Version : 4.1.0


In [35]:
# Sample texts (clinical-trial abstract excerpts). Later cells turn this list
# into the Spark DataFrame fed to the NER pipeline and also write each entry
# to an ExampleN.txt input file.
text_list = [
"""Efficacy and safety of vildagliptin in patients with type 2 diabetes mellitus inadequately controlled with dual combination of metformin and sulphonylurea . This study assessed the efficacy and safety of vildagliptin as add - on therapy to metformin plus glimepiride combination in patients with type 2 diabetes mellitus ( T2DM ) who had inadequate glycaemic control . A multicentre , double - blind , placebo - controlled study randomized patients to receive treatment with vildagliptin 50   mg bid or placebo for 24   weeks .""",
"""Randomised, multicentre trial comparing insulin glargine with NPH insulin in combination with oral agents in patients with type 2 diabetes. In a multicentre, open, randomised study, 570 patients with Type 2 diabetes, aged 34 - 80 years, were treated for 52 weeks with insulin glargine or NPH insulin given once daily at bedtime.""",
"""Latanoprost administered once daily caused a maintained reduction of intraocular pressure in glaucoma patients treated concomitantly with timolol . A total of 50 patients , 17 with primary open angle glaucoma and 33 with capsular glaucoma. Patients were randomised to two treatment groups . No clinically significant side effects were observed during treatment .Latanoprost causes a marked and sustained IOP reduction in eyes which are also being treated with timolol .""",
"""A randomised crossover study comparing bimatoprost and latanoprost in subjects with primary angle closure glaucoma . This was an observer - masked randomised crossover study of 60 PACG subjects who received either latanoprost or bimatoprost for 6 weeks , after which they were crossed over to the other medication for another 6 weeks . CONCLUSIONS : Bimatoprost once daily was similarly effective in reducing IOP compared with latanoprost once daily in subjects with chronic PACG . Both drugs were well tolerated with mild ocular adverse events .""",
"""Intraocular pressure - lowering efficacy of brinzolamide when added to travoprost / timolol fixed combination as adjunctive therapy .PURPOSE : To compare the efficacy of brinzolamide versus placebo when added to travoprost / timolol fixed combination ( TTFC ) in uncontrolled patients . This was a prospective , double - masked , randomized , placebo - controlled , parallel comparison of ocular hypertensive or primary open - angle glaucoma patients . Patients treated with a adjunctive therapy were changed to TTFC qam ( every day dosing ) for 4 weeks .Patients were then randomized to either placebo or brinzolamide given twice daily in addition to TTFC . At week 12 , patients had their IOP measurements repeated .This study suggests that brinzolamide may be safely added to TTFC therapy to provide further significant reduction in IOP patients with ocular hypertensive or primary open - angle glaucoma ."""
]

In [36]:
# Name of the pretrained NER model loaded by the pipeline cell; the same value
# is reused to name the ../inputs/ and ../outputs/ folders further down.
model_name = "ner_clinical_trials_abstracts"

In [37]:
# Stage 1: turn the raw 'text' column into 'document' annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into 'sentence' annotations.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: split each sentence into 'token' annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained 'embeddings_clinical' word embeddings over the tokens.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: pretrained NER model selected by `model_name`.
clinical_ner = (
    MedicalNerModel.pretrained(model_name, "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 6: convert the per-token 'ner' output into 'ner_chunk' annotations.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in the order their input columns become available.
nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
    ]
)

# Fit on a one-row DataFrame holding an empty string to obtain the PipelineModel.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipeline_model = nlp_pipeline.fit(empty_df)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical_trials_abstracts download started this may take some time.
[OK!]


In [38]:
import pandas as pd

# One row per sample text, in the same order as `text_list`.
df = spark.createDataFrame(pd.DataFrame({'text': text_list}))

# Run the already fitted pipeline. The previous code referenced `nlpPipeline`,
# a name this notebook never defines (the pipeline cell creates `nlp_pipeline`
# and `pipeline_model`), so the cell raised NameError on a fresh kernel.
# Reusing `pipeline_model` also avoids fitting the pipeline a second time.
result = pipeline_model.transform(df)

# Flatten the chunk annotations: pair each chunk's text with its metadata,
# emit one row per chunk, and show the chunk text next to its entity label.
result.select(F.explode(F.arrays_zip("ner_chunk.result", "ner_chunk.metadata")).alias("cols"))\
      .select(F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+------------------------+------------------+
|chunk                   |ner_label         |
+------------------------+------------------+
|vildagliptin            |Drug              |
|type 2 diabetes mellitus|DisorderOrSyndrome|
|metformin               |Drug              |
|sulphonylurea           |Drug              |
|vildagliptin            |Drug              |
|metformin               |Drug              |
|glimepiride             |Drug              |
|type 2 diabetes mellitus|DisorderOrSyndrome|
|T2DM                    |DisorderOrSyndrome|
|multicentre             |CTDesign          |
|double - blind          |CTDesign          |
|placebo                 |Drug              |
|randomized              |CTDesign          |
|vildagliptin            |Drug              |
|50                      |DoseValue         |
|mg                      |BioAndMedicalUnit |
|bid                     |DrugTime          |
|placebo                 |Drug              |
|24   weeks              |Duration

In [39]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

# Collect the annotated rows to the driver once. Calling result.collect()
# inside the loop re-ran the whole Spark job for every document.
collected_rows = result.collect()

# Render the highlighted entities of each sample text, separated by blank lines.
for i in range(len(text_list)):
    visualiser.display(result=collected_rows[i], label_col='ner_chunk', document_col='document')
    print("\n\n")


























In [40]:
# Creating inputs folders
import shutil

INPUT_FILE_PATH = f"../inputs/{model_name}/"

# Start from an empty folder. Pure-Python calls replace `!rm -r` / `!mkdir -p`:
# they do not print an error when the folder does not exist yet and they do
# not depend on a POSIX shell.
shutil.rmtree(INPUT_FILE_PATH, ignore_errors=True)
os.makedirs(INPUT_FILE_PATH, exist_ok=True)

# Each example file holds a title line followed by the full text.
for i, v in enumerate(text_list):
    # Title = text up to the first newline, or the whole text when there is
    # none. The previous `v[:v.find("\n")]` became `v[:-1]` for single-line
    # texts (find() returns -1) and silently dropped the last character.
    title = v.partition("\n")[0]
    # `with` guarantees the file is flushed and closed before the next cell
    # lists the folder.
    with open(os.path.join(INPUT_FILE_PATH, f"Example{i+1}.txt"), "w", encoding="utf8") as example_file:
        example_file.write(title + "\n" + v)

In [41]:
# Creating output folders
import shutil

OUTPUT_FILE_PATH = f"../outputs/{model_name}"

# Start from an empty folder. Pure-Python calls replace `!rm -r` / `!mkdir -p`:
# they do not print an error when the folder does not exist yet and they do
# not depend on a POSIX shell.
shutil.rmtree(OUTPUT_FILE_PATH, ignore_errors=True)
os.makedirs(OUTPUT_FILE_PATH, exist_ok=True)

In [42]:
import re

# Input file names in numeric order, so position i lines up with the i-th text.
# A plain lexicographic sort would put "Example10.txt" before "Example2.txt"
# once there are ten or more examples. re.split with a capturing group
# alternates non-digit / digit parts, so odd positions are always digit runs.
file_list = sorted(
    os.listdir(INPUT_FILE_PATH),
    key=lambda file_name: [
        int(part) if position % 2 else part
        for position, part in enumerate(re.split(r"(\d+)", file_name))
    ],
)

In [43]:
# Bring the annotated rows into pandas (rebinds `result`).
result = result.toPandas()

# Only the 'ner_chunk' column is exported; select it once, outside the loop.
chunk_frame = result[['ner_chunk']]

# Write one JSON file per row, named after the input file at the same position.
for row_pos in result.index:
    example_stem = file_list[row_pos].split('.')[0]
    json_path = os.path.join(OUTPUT_FILE_PATH, example_stem + ".json")
    chunk_frame.iloc[row_pos].to_json(json_path)