![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/16.Adverse_Drug_Event_ADE_NER_and_Classifier.ipynb)

# Adverse Drug Event (ADE) Pretrained NER and Classifier Models

In [None]:
import json, os

# Ask for the license file only when it is not already in the working directory.
if 'spark_jsl.json' not in os.listdir():
  # Imported here rather than at the top: `files` is needed only for the upload
  # prompt, and the module-level import was commented out, which left
  # `files.upload()` failing with a NameError on this path.
  from google.colab import files

  license_keys = files.upload()
  # Normalize whatever name the uploaded file had to the one read below.
  os.rename(list(license_keys.keys())[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)
# ...and as environment variables for the current process.
os.environ.update(license_keys)

Saving jsl_keys - Copy.json to jsl_keys - Copy.json


In [None]:
# Installing pyspark and spark-nlp
# NOTE(review): $PUBLIC_VERSION is presumably one of the license key entries exported above — confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# NOTE(review): $JSL_VERSION and $SECRET are presumably license key entries as well; $SECRET forms part of the extra index URL.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# (no version is pinned for this package)
! pip install -q spark-nlp-display

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.4/212.4 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m473.2/473.2 KB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 KB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.8/427.8 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import json

import pandas as pd
from sklearn.metrics import classification_report

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline,PipelineModel

# Spark settings handed to the session: driver memory, Kryo buffer cap and
# the maximum size of results collected to the driver.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the licensed session with the SECRET entry of the license keys.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Report the library versions in use.
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Last expression: display the session object.
spark

Spark NLP Version : 4.3.2
Spark NLP_JSL Version : 4.3.2


In [None]:
# Read the sample file, keep the two columns used below, and expose the
# "description" column under the name "text".
ade_df = (
    pd.read_csv("/content/ad_df_200rows.csv")
    .loc[:, ["description", "category"]]
    .rename(columns={"description": "text"})
)
ade_df.head()

Unnamed: 0,text,category
0,No objective data have been presented examini...,NEGATIVE
1,Both patients subsequently developed markedly...,NEGATIVE
2,"In turn, it may cause inflammation in such ra...",NEGATIVE
3,"She was stabilized with prednisone, azathiopr...",NEGATIVE
4,MATERIALS AND METHODS: Between March 1997 and...,NEGATIVE


In [None]:
# Number of rows per label.
ade_df["category"].value_counts()

NEGATIVE    100
POSITIVE    100
Name: category, dtype: int64

In [None]:
# Recode the dataset labels to the "True"/"False" strings that the metric
# cells look up in the classification report.
label_map = {"NEGATIVE": "False", "POSITIVE": "True"}
ade_df["category"] = ade_df["category"].replace(label_map)
ade_df.head()

Unnamed: 0,text,category
0,No objective data have been presented examini...,False
1,Both patients subsequently developed markedly...,False
2,"In turn, it may cause inflammation in such ra...",False
3,"She was stabilized with prednisone, azathiopr...",False
4,MATERIALS AND METHODS: Between March 1997 and...,False


In [None]:
# Turn the pandas frame into a Spark DataFrame; the pipelines below transform this frame.
testData = spark.createDataFrame(ade_df)

## ADE Classifier 

The ADE classifier predicts whether a sentence is ADE-related (`True`) or not (`False`).

`True` : The sentence is talking about a possible ADE

`False` : The sentence doesn't contain any information about an ADE.


|index |model |Predicted Entities|
|-----:|:-----|:----------------:|
| 1| [classifierdl_ade_biobert](https://nlp.johnsnowlabs.com/2021/01/21/classifierdl_ade_biobert_en.html)   |True, False|
| 2| [classifierdl_ade_clinicalbert](https://nlp.johnsnowlabs.com/2021/01/21/classifierdl_ade_clinicalbert_en.html)  |True, False|
| 3| [classifierdl_ade_conversational_biobert](https://nlp.johnsnowlabs.com/2021/01/21/classifierdl_ade_conversational_biobert_en.html)  |True, False|
| 4| [bert_sequence_classifier_ade](https://nlp.johnsnowlabs.com/2022/02/08/bert_sequence_classifier_ade_en.html)  |True, False|
| 5| [bert_sequence_classifier_ade_augmented](https://nlp.johnsnowlabs.com/2022/07/27/bert_sequence_classifier_ade_augmented_en_3_0.html)   |ADE, noADE|
| 6| [distilbert_sequence_classifier_ade](https://nlp.johnsnowlabs.com/2022/02/08/distilbert_sequence_classifier_ade_en.html)  |True, False|



### ADE Classifier with ClinicalBert

In [None]:
# Score table under construction: later cells add one entry per model.
result = dict()

In [None]:
# Wrap the raw "text" column into the annotation column the next stages read
# (the column is named "sentence" in this pipeline).
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("sentence")
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level embeddings from the pretrained 'biobert_clinical_base_cased' model.
embeddings = (
    BertEmbeddings.pretrained('biobert_clinical_base_cased')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("word_embeddings")
)

# Pool the token embeddings with the AVERAGE strategy.
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "word_embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Pretrained ADE classifier reading the pooled embeddings.
classifier = (
    ClassifierDLModel.pretrained('classifierdl_ade_clinicalbert', 'en', 'clinical/models')
    .setInputCols(['sentence_embeddings'])
    .setOutputCol('class')
)

ade_clf_pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        embeddings,
        sentence_embeddings,
        classifier,
    ]
)

# Fit on a one-row frame holding an empty string to obtain the pipeline model.
empty_data = spark.createDataFrame([[""]]).toDF("text")
ade_clf_model = ade_clf_pipeline.fit(empty_data)

biobert_clinical_base_cased download started this may take some time.
Approximate size to download 386.6 MB
[OK!]
classifierdl_ade_clinicalbert download started this may take some time.
Approximate size to download 21.8 MB
[OK!]


In [None]:
# Score the test set with the pipeline model that was already fitted on
# `empty_data`. Refitting the whole pipeline on `testData` repeated that work;
# the later sections also transform with their pre-fitted model.
preds = ade_clf_model.transform(testData)
preds_df = preds.select("category","text","class.result").toPandas()
# Each `result` value is a sequence of labels; keep its first element.
preds_df["result"] = preds_df["result"].apply(lambda x : x[0])

In [None]:
# Peek at the first five rows of gold labels and predictions.
preds_df.head(n=5)

Unnamed: 0,category,text,result
0,False,No objective data have been presented examini...,False
1,False,Both patients subsequently developed markedly...,False
2,False,"In turn, it may cause inflammation in such ra...",False
3,False,"She was stabilized with prednisone, azathiopr...",False
4,False,MATERIALS AND METHODS: Between March 1997 and...,False


In [None]:
# Keep the positive-class ("True") scores plus overall accuracy, and store them
# under the classifier's name like every other entry in `result`. The entry used
# to be keyed by the embeddings model name, which mislabeled this column in the
# comparison table.
clf_res = classification_report(preds_df["category"], preds_df["result"],output_dict=True)
clf_res['True'].pop("support")  # drop the row count so only scores remain
result["classifierdl_ade_clinicalbert"] = clf_res['True']
result["classifierdl_ade_clinicalbert"]["accuracy-general"] = clf_res["accuracy"]
pd.DataFrame(result)

Unnamed: 0,biobert_clinical_base_cased
accuracy-general,0.805
f1-score,0.769231
precision,0.942029
recall,0.65


In [None]:
# Plain-text report of precision / recall / f1-score per label.
report_text = classification_report(preds_df["category"], preds_df["result"])
print(report_text)

              precision    recall  f1-score   support

       False       0.73      0.96      0.83       100
        True       0.94      0.65      0.77       100

    accuracy                           0.81       200
   macro avg       0.84      0.80      0.80       200
weighted avg       0.84      0.81      0.80       200



### ADE Classifier with BioBert

In [None]:
# Annotator that transforms a text column from dataframe into an Annotation ready for NLP
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("sentence")
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level embeddings from the pretrained "biobert_pubmed_base_cased" model,
# with the maximum sentence length set to 512.
bert_embeddings = (
    BertEmbeddings.pretrained("biobert_pubmed_base_cased")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setMaxSentenceLength(512)
)

# Pool the token embeddings with the AVERAGE strategy; the storage reference
# is set to the name of the embeddings model above.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
    .setStorageRef('biobert_pubmed_base_cased')
)

# Pretrained ADE classifier reading the pooled embeddings.
classsifierdl = (
    ClassifierDLModel.pretrained("classifierdl_ade_biobert", "en", "clinical/models")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
)

biobert_stages = [
    documentAssembler,
    tokenizer,
    bert_embeddings,
    embeddingsSentence,
    classsifierdl,
]
ade_clf_pipeline = Pipeline(stages=biobert_stages)

# Fit on a one-row frame holding an empty string to obtain the pipeline model.
empty_data = spark.createDataFrame([[""]]).toDF("text")
ade_clf_model = ade_clf_pipeline.fit(empty_data)


biobert_pubmed_base_cased download started this may take some time.
Approximate size to download 386.4 MB
[OK!]
classifierdl_ade_biobert download started this may take some time.
Approximate size to download 21.8 MB
[OK!]


In [None]:
# Score the test set with the pipeline model that was already fitted on
# `empty_data`. Refitting the whole pipeline on `testData` repeated that work;
# the later sections also transform with their pre-fitted model.
preds = ade_clf_model.transform(testData)
preds_df = preds.select("category","text","class.result").toPandas()
# Each `result` value is a sequence of labels; keep its first element.
preds_df["result"] = preds_df["result"].apply(lambda x : x[0])

In [None]:
# Peek at the first five rows of gold labels and predictions.
preds_df.head(n=5)

Unnamed: 0,category,text,result
0,False,No objective data have been presented examini...,False
1,False,Both patients subsequently developed markedly...,False
2,False,"In turn, it may cause inflammation in such ra...",False
3,False,"She was stabilized with prednisone, azathiopr...",False
4,False,MATERIALS AND METHODS: Between March 1997 and...,False


In [None]:
# Build the report as a dict so the positive-class ("True") scores can be stored.
clf_res = classification_report(
    preds_df["category"],
    preds_df["result"],
    output_dict=True,
)
true_scores = clf_res['True']
del true_scores["support"]  # drop the row count so only scores remain
result["classifierdl_ade_biobert"] = true_scores
true_scores["accuracy-general"] = clf_res["accuracy"]

# Last expression: show the comparison table collected so far.
pd.DataFrame(result)

Unnamed: 0,biobert_clinical_base_cased,classifierdl_ade_biobert
precision,0.942029,0.935897
recall,0.65,0.73
f1-score,0.769231,0.820225
accuracy-general,0.805,0.84


In [None]:
# Plain-text report of precision / recall / f1-score per label.
report_text = classification_report(preds_df["category"], preds_df["result"])
print(report_text)

              precision    recall  f1-score   support

       False       0.78      0.95      0.86       100
        True       0.94      0.73      0.82       100

    accuracy                           0.84       200
   macro avg       0.86      0.84      0.84       200
weighted avg       0.86      0.84      0.84       200



### ADE Classifier trained with conversational (short) sentences

In [None]:
# Classifier trained on conversational (short) sentences; it reads the same
# "sentence_embeddings" column as the BioBERT classifier.
conv_classsifierdl = ClassifierDLModel.pretrained("classifierdl_ade_conversational_biobert", "en", "clinical/models")\
            .setInputCols(["sentence_embeddings"]) \
            .setOutputCol("class")

# Reuses the assembler, tokenizer and embedding stages of the BioBERT pipeline;
# only the final classifier stage differs.
conv_ade_clf_pipeline = Pipeline(
    stages=[documentAssembler, 
            tokenizer,
            bert_embeddings,
            embeddingsSentence,
            conv_classsifierdl])
empty_data = spark.createDataFrame([[""]]).toDF("text")

# Fit the conversational pipeline itself. Fitting `ade_clf_pipeline` here would
# bind `conv_ade_clf_model` to the BioBERT classifier and reproduce its scores.
conv_ade_clf_model = conv_ade_clf_pipeline.fit(empty_data)

classifierdl_ade_conversational_biobert download started this may take some time.
Approximate size to download 21.8 MB
[OK!]


In [None]:
# Run the fitted conversational model over the test set and collect the
# columns needed for scoring into pandas.
preds = conv_ade_clf_model.transform(testData)
scored_columns = ["category", "text", "class.result"]
preds_df = preds.select(*scored_columns).toPandas()

# Each `result` value is a sequence of labels; keep its first element.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

In [None]:
# Build the report as a dict so the positive-class ("True") scores can be stored.
clf_res = classification_report(
    preds_df["category"],
    preds_df["result"],
    output_dict=True,
)
true_scores = clf_res['True']
del true_scores["support"]  # drop the row count so only scores remain
result["classifierdl_ade_conversational_biobert"] = true_scores
true_scores["accuracy-general"] = clf_res["accuracy"]

# Last expression: show the comparison table collected so far.
pd.DataFrame(result)

Unnamed: 0,biobert_clinical_base_cased,classifierdl_ade_biobert,classifierdl_ade_conversational_biobert
precision,0.942029,0.935897,0.935897
recall,0.65,0.73,0.73
f1-score,0.769231,0.820225,0.820225
accuracy-general,0.805,0.84,0.84


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Plain-text report of precision / recall / f1-score per label.
report_text = classification_report(preds_df["category"], preds_df["result"])
print(report_text)

              precision    recall  f1-score   support

       False       0.78      0.95      0.86       100
        True       0.94      0.73      0.82       100

    accuracy                           0.84       200
   macro avg       0.86      0.84      0.84       200
weighted avg       0.86      0.84      0.84       200



### ADE Sequence Classifier

MedicalDistilBertForSequenceClassification

In [None]:
# Wrap the raw "text" column into a "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Tokenize the document; the sequence classifier reads both columns.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained DistilBERT sequence classifier for ADE; it takes the document and
# its tokens directly, so this pipeline has no separate embedding stages.
sequenceClassifier = (
    MedicalDistilBertForSequenceClassification.pretrained("distilbert_sequence_classifier_ade", "en", "clinical/models")
    .setInputCols(["document","token"])
    .setOutputCol("class")
)

distilbert_stages = [
    document_assembler,
    tokenizer,
    sequenceClassifier,
]
ade_clf_pipeline = Pipeline(stages=distilbert_stages)

# Fit on a one-row frame holding an empty string to obtain the pipeline model.
empty_data = spark.createDataFrame([[""]]).toDF("text")
ade_clf_model = ade_clf_pipeline.fit(empty_data)

distilbert_sequence_classifier_ade download started this may take some time.
[OK!]


In [None]:
# Run the fitted sequence-classifier model over the test set and collect the
# columns needed for scoring into pandas.
preds = ade_clf_model.transform(testData)
scored_columns = ["category", "text", "class.result"]
preds_df = preds.select(*scored_columns).toPandas()

# Each `result` value is a sequence of labels; keep its first element.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

# Plain-text report of precision / recall / f1-score per label.
report_text = classification_report(preds_df["category"], preds_df["result"])
print(report_text)

              precision    recall  f1-score   support

       False       0.82      0.97      0.89       100
        True       0.96      0.78      0.86       100

    accuracy                           0.88       200
   macro avg       0.89      0.88      0.87       200
weighted avg       0.89      0.88      0.87       200



In [None]:
# Build the report as a dict so the positive-class ("True") scores can be stored.
clf_res = classification_report(
    preds_df["category"],
    preds_df["result"],
    output_dict=True,
)
true_scores = clf_res['True']
del true_scores["support"]  # drop the row count so only scores remain
result["distilbert_sequence_classifier_ade"] = true_scores
true_scores["accuracy-general"] = clf_res["accuracy"]

# Last expression: show the comparison table collected so far.
pd.DataFrame(result)

Unnamed: 0,biobert_clinical_base_cased,classifierdl_ade_biobert,classifierdl_ade_conversational_biobert,distilbert_sequence_classifier_ade
precision,0.942029,0.935897,0.935897,0.962963
recall,0.65,0.73,0.73,0.78
f1-score,0.769231,0.820225,0.820225,0.861878
accuracy-general,0.805,0.84,0.84,0.875


MedicalBertForSequenceClassification

In [None]:
# This section previously scored whatever `preds_df` was left over from the
# DistilBERT section (labels "True"/"False"), so the 'ADE' lookup could not work
# on a fresh run. Build and run the augmented BERT sequence classifier here so
# the cell is self-contained.
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

bert_sequence_classifier = MedicalBertForSequenceClassification.pretrained("bert_sequence_classifier_ade_augmented", "en", "clinical/models")\
    .setInputCols(["document","token"])\
    .setOutputCol("class")

bert_ade_clf_pipeline = Pipeline(stages=[
                    document_assembler,
                    tokenizer,
                    bert_sequence_classifier])

# Fit on a one-row frame holding an empty string to obtain the pipeline model.
empty_data = spark.createDataFrame([[""]]).toDF("text")
bert_ade_clf_model = bert_ade_clf_pipeline.fit(empty_data)

preds = bert_ade_clf_model.transform(testData)
preds_df = preds.select("category","text","class.result").toPandas()
# Keep the first label of each prediction, then map this model's ADE / noADE
# labels onto the "True" / "False" strings used in `category` so that the
# report compares like with like.
preds_df["result"] = preds_df["result"].apply(lambda x : x[0]).replace({"ADE": "True", "noADE": "False"})

# Store the positive-class scores plus overall accuracy, as for the other models.
clf_res = classification_report(preds_df["category"], preds_df["result"],output_dict=True)
clf_res['True'].pop("support")  # drop the row count so only scores remain
result["bert_sequence_classifier_ade_augmented"] = clf_res['True']
result["bert_sequence_classifier_ade_augmented"]["accuracy-general"] = clf_res["accuracy"]
pd.DataFrame(result)

Unnamed: 0,biobert_clinical_base_cased,classifierdl_ade_biobert,classifierdl_ade_conversational_biobert,distilbert_sequence_classifier_ade,bert_sequence_classifier_ade_augmented
precision,0.942029,0.935897,0.935897,0.962963,0.515464
recall,0.65,0.73,0.73,0.78,1.0
f1-score,0.769231,0.820225,0.820225,0.861878,0.680272
accuracy-general,0.805,0.84,0.84,0.875,0.53


In [None]:
# Persist the collected score table (one column per model) as CSV.
scores_path = "scores_ad_df200rows.csv"
df_result = pd.DataFrame(data=result)
df_result.to_csv(scores_path)