#  Adverse Drug Event Classifier


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/clinical_text_classification/2.Adverse_Effect_Classification.ipynb)

In [1]:
import json

with open('workshop_license_keys_365.json') as f:
    license_keys = json.load(f)

license_keys.keys()


dict_keys(['PUBLIC_VERSION', 'JSL_VERSION', 'SECRET', 'SPARK_NLP_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET'])

In [None]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

secret = license_keys['SECRET']

os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']
version = license_keys['PUBLIC_VERSION']
jsl_version = license_keys['JSL_VERSION']

! pip install --ignore-installed -q pyspark==2.4.4

! python -m pip install --upgrade spark-nlp-jsl==$jsl_version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

! pip install --ignore-installed -q spark-nlp==$version

import sparknlp

print (sparknlp.version())

import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp.base import *

In [3]:
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.common import *
spark = sparknlp.start()

In [4]:
import pandas as pd

ade_df = pd.read_csv('ade_binary.csv')
ade_df.columns=['PMID','description','category']

ade_df.head()

Unnamed: 0,PMID,description,category
0,20178709,A case of aseptic pleuropericarditis in a pati...,AE
1,20178709,"Methotrexate may rarely provoke serositis, eve...",AE
2,20178709,We report here a rare case of pleuropericardit...,AE
3,20178709,The effusion resolved after the withdrawal of ...,Neg
4,7957364,Teratogenic effects in a case of maternal trea...,Neg


In [7]:
ade_df[ade_df['description'].str.contains('BACKGROUND: External beam radiation therapy of.')]

Unnamed: 0,PMID,description,category
15002,8402502,BACKGROUND: External beam radiation therapy of...,Neg


In [8]:
ade_df['description'][15002]

'BACKGROUND: External beam radiation therapy often is avoided in the treatment of rhabdomyosarcoma (RMS) in young children because of the long-term sequelae.'

In [None]:
ade_df.category.value_counts()

Neg    16815
AE      4262
Name: category, dtype: int64

In [9]:
ade_df2['text'][2]

'BACKGROUND: External beam radiation therapy often is avoided in the treatment of rhabdomyosarcoma (RMS) in young children because of the long-term sequelae.'

In [None]:
ade_df2 = pd.read_csv('ADE_classification_dataset_0908.csv')
#ade_df2.columns=['PMID','description','category']

ade_df2.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,Clioquinol intoxication occurring in the treat...,positive
1,1,"""Retinoic acid syndrome"" was prevented with sh...",positive
2,2,BACKGROUND: External beam radiation therapy of...,positive
3,3,"Although the enuresis ceased, she developed th...",positive
4,4,A 42-year-old woman had uneventful bilateral l...,positive


In [6]:
ade_df2.label.value_counts()

positive    16625
negative     4271
Name: label, dtype: int64

In [10]:
spark_df = spark.createDataFrame(ade_df)

(trainingData, testData) = spark_df.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 16862
Test Dataset Count: 4215


## Text cleaning in Spark NLP and CV + LogReg with Spark ML

In [None]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
           countVectors,
           label_stringIdx])

nlp_model = nlp_pipeline.fit(spark_df)

processed = nlp_model.transform(spark_df)

processed.count()

In [None]:
(trainingData, testData) = processed.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 1) \
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Prolonged responses were ac...|      AE|[0.4991510707783345,0.50084...|  1.0|       1.0|
|Renal failure associated wi...|      AE|[0.4979399562669633,0.50206...|  1.0|       1.0|
|These case reports provide ...|      AE|[0.49751709783971115,0.5024...|  1.0|       1.0|
|We describe a patient with ...|      AE|[0.49728770031579433,0.5027...|  1.0|       1.0|
|An adverse drug reaction (A...|     Neg|[0.4969665704009515,0.50303...|  0.0|       1.0|
|Insulin-induced lipoatrophy...|      AE|[0.49694014629365413,0.5030...|  1.0|       1.0|
|Insulin-induced cardiac fai...|      AE|[0.49693765166754705,0.5030...|  1.0|       1.0|
|The incidence of RS among c...|     Neg|[0.4964544530237935,0.50354...|  0.0|       1.0|
|We descri

In [None]:
preds_df = predictions.select('category','description',"prediction",'label').toPandas()

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.83      0.99      0.90      3341
         1.0       0.83      0.21      0.34       881

    accuracy                           0.83      4222
   macro avg       0.83      0.60      0.62      4222
weighted avg       0.83      0.83      0.78      4222



## Text cleaning in Spark NLP and TfIDF + LogReg with Spark ML


In [None]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
           idf,
           label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(spark_df)

processed_tf = nlp_model_tf.transform(spark_df)

processed_tf.count()


21077

In [None]:
(trainingData, testData) = processed_tf.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 16855
Test Dataset Count: 4222


In [None]:

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Final treatment plans for t...|     Neg|[0.9950398737908789,0.00496...|  0.0|       0.0|
|To know the histochemical e...|     Neg|[0.9923850017908431,0.00761...|  0.0|       0.0|
|By searching for the key wo...|     Neg|[0.9882074032992472,0.01179...|  0.0|       0.0|
|The diagnosis was made on t...|     Neg|[0.9850888810925544,0.01491...|  0.0|       0.0|
|Moreover, this case illustr...|     Neg|[0.9840292455069913,0.01597...|  0.0|       0.0|
|Pediatric and cardiovascula...|     Neg|[0.9818585325650826,0.01814...|  0.0|       0.0|
|In conclusion, the in vitro...|     Neg|[0.9810434185311695,0.01895...|  0.0|       0.0|
|Calcimimetics are a new cla...|     Neg|[0.9809039513660356,0.01909...|  0.0|       0.0|
|She was p

In [None]:
preds_df = predictions.select('category','description',"prediction",'label').toPandas()

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.82      0.99      0.90      3341
         1.0       0.82      0.16      0.27       881

    accuracy                           0.82      4222
   macro avg       0.82      0.58      0.58      4222
weighted avg       0.82      0.82      0.77      4222



## Text cleaning and featurizing in Spark NLP and LogReg with Spark ML


In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
          .setInputCols(["document", "cleanTokens"])\
          .setOutputCol("embeddings")\
        .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")
    
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_w2v = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            clinical_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
           label_stringIdx])

nlp_model_w2v = nlp_pipeline_w2v.fit(spark_df)

processed_w2v = nlp_model_w2v.transform(spark_df)

processed_w2v.count()


21077

In [None]:
(trainingData, testData) = processed_w2v.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 16855
Test Dataset Count: 4222


In [None]:
from pyspark.sql.functions import udf

@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

testData = testData.where(num_nonzeros("features") != 0)
testData.count()

4218

In [None]:
trainingData = trainingData.where(num_nonzeros("features") != 0)
trainingData.count()

16837

In [None]:

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions\
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Stool specimen was streaked...|     Neg|[0.9936647913503851,0.00633...|  0.0|       0.0|
|The remainder of the proced...|     Neg|[0.9873591734885967,0.01264...|  0.0|       0.0|
| Phone followup was attempted.|     Neg|[0.9855274628487091,0.01447...|  0.0|       0.0|
|According to American Socie...|     Neg|[0.9848900941325706,0.01510...|  0.0|       0.0|
|The catheter was removed an...|     Neg|[0.9839529110339125,0.01604...|  0.0|       0.0|
|All of the studies were car...|     Neg|[0.9829156330527493,0.01708...|  0.0|       0.0|
|Chest drainage was required...|     Neg|[0.982723987355742,0.017276...|  0.0|       0.0|
|In December 1998, he applie...|     Neg|[0.9824635580152413,0.01753...|  0.0|       0.0|
|Diagnosis

In [None]:
preds_df = predictions.select('category','description',"prediction",'label').toPandas()

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.82      0.98      0.89      3375
         1.0       0.63      0.11      0.19       843

    accuracy                           0.81      4218
   macro avg       0.72      0.55      0.54      4218
weighted avg       0.78      0.81      0.75      4218



## Text cleaning, featurizing and classification in Spark NLP


In [None]:
!wget https://raw.githubusercontent.com/kavgan/clinical-concepts/master/clinical-stopwords.txt

--2020-04-28 01:47:52--  https://raw.githubusercontent.com/kavgan/clinical-concepts/master/clinical-stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.36.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6582 (6.4K) [text/plain]
Saving to: ‘clinical-stopwords.txt’


2020-04-28 01:47:52 (10.9 MB/s) - ‘clinical-stopwords.txt’ saved [6582/6582]



In [None]:
with open ('clinical-stopwords.txt', 'r') as f:
    stops = f.readlines()

stops = [s.strip() for s in stops[1:]]
stops[:3]

['x', 'y', 'your']

In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setStopWords(stops)\
      .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
          .setInputCols(["document", "lemma"])\
          .setOutputCol("embeddings")\
.setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(10)\
  .setBatchSize(8)\
  .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            clinical_embeddings,
            embeddingsSentence,
            classsifierdl])



lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [None]:
%%time

clf_pipelineModel = clf_pipeline.fit(trainingData)

CPU times: user 133 ms, sys: 32.8 ms, total: 166 ms
Wall time: 1min 53s


In [None]:
!cd ~/annotator_logs/ && ls -lt

In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_65b53e5737e5.log 


Training started - total epochs: 10 - learning rate: 0.005 - batch size: 8 - training examples: 16855
Epoch 0/10 - 11.451147705%.2fs - loss: 1093.3795 - accuracy: 0.7997812 - batches: 2107
Epoch 1/10 - 10.773950661%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 2/10 - 10.779547923%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 3/10 - 10.760296186%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 4/10 - 10.870541052%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 5/10 - 9.632701774%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 6/10 - 9.310860832%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 7/10 - 9.241400512%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 8/10 - 9.453594857%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107
Epoch 9/10 - 8.986367391%.2fs - loss: 1093.0107 - accuracy: 0.7997812 - batches: 2107


In [None]:
preds = clf_pipelineModel.transform(testData)

preds.select('category','description',"class.result").show(10, truncate=80)

+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     Neg|It can be caused by infection of the donor or by contamination of the organ d...| [Neg]|
|     Neg|We report the clinical outcome of liver, heart, and kidney recipients from a ...| [Neg]|
|     Neg|With current donor evaluation protocols, the risk of transmitting infections ...| [Neg]|
|     Neg|Alertness returned over the 24 hr following by the discontinuation of BH-AC a...| [Neg]|
|     Neg|                              She was discharged without any neurologic sequela.| [Neg]|
|     Neg|                                                    Jarisch-Herxheimer reaction.| [Neg]|
|     Neg|The most common findings are fever, malaise, headache, and exacerbation of cu...| [Neg]|
|     Neg|

In [None]:

preds_df = preds.select('category','description',"class.result").toPandas()

In [None]:

# The result is an array since in Spark NLP you can have multiple sentences.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [None]:
preds_df.sample(10)

Unnamed: 0,category,description,result
4177,Neg,CONCLUSIONS: Confocal microscopy can be a usef...,Neg
1215,Neg,"BCG therapy prevents, or at least delays, tumo...",Neg
3000,Neg,No severe side effects were observed in the li...,Neg
1002,Neg,RESULTS: Thirteen patients had died; 1 of thes...,Neg
940,AE,Anterior ischemic optic neuropathy secondary t...,Neg
3249,Neg,Both phenotypic and genotypic virologic analys...,Neg
1662,AE,This is a rare case of ARDS associated with li...,Neg
613,Neg,Granulocyte colony stimulating factor (GCSF) w...,Neg
3278,Neg,No systemic antibiotics would be given.,Neg
102,Neg,Cultures became negative 10-37 days after the ...,Neg


In [None]:

from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

          AE       0.00      0.00      0.00       881
         Neg       0.79      1.00      0.88      3341

    accuracy                           0.79      4222
   macro avg       0.40      0.50      0.44      4222
weighted avg       0.63      0.79      0.70      4222



  'precision', 'predicted', average, warn_for)


## ClassifierDL with Universal Sentence Embeddings

In [None]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")
    
# we can also use sentece detector here if we want to train on and get predictions for each sentence

use = UniversalSentenceEncoder.pretrained()\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
    .setBatchSize(8)\
    .setMaxEpochs(200)\
    .setLr(0.001)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
%%time

use_clf_pipelineModel = use_clf_pipeline.fit(trainingData)

CPU times: user 217 ms, sys: 130 ms, total: 347 ms
Wall time: 40min 9s


In [None]:
!cd ~/annotator_logs/ && ls -lt

In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_c36fd6c33b58.log
# with 10 epochs

Training started - total epochs: 10 - learning rate: 0.005 - batch size: 64 - training examples: 16855
Epoch 0/10 - 3.99512087%.2fs - loss: 135.09169 - accuracy: 0.80234593 - batches: 264
Epoch 1/10 - 4.766252183%.2fs - loss: 135.70131 - accuracy: 0.80240536 - batches: 264
Epoch 2/10 - 4.469659096%.2fs - loss: 135.69331 - accuracy: 0.80240536 - batches: 264
Epoch 3/10 - 4.600433395%.2fs - loss: 134.59521 - accuracy: 0.802333 - batches: 264
Epoch 4/10 - 3.730208099%.2fs - loss: 130.32632 - accuracy: 0.8104594 - batches: 264
Epoch 5/10 - 3.557576017%.2fs - loss: 126.97921 - accuracy: 0.8159716 - batches: 264
Epoch 6/10 - 3.795263992%.2fs - loss: 124.651344 - accuracy: 0.81983334 - batches: 264
Epoch 7/10 - 3.359740149%.2fs - loss: 123.539375 - accuracy: 0.823695 - batches: 264
Epoch 8/10 - 3.303243652%.2fs - loss: 122.89158 - accuracy: 0.8252397 - batches: 264
Epoch 9/10 - 3.300559809%.2fs - loss: 122.46075 - accuracy: 0.8275567 - batches: 264


In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_c38c29ef6024.log
# with 50 epochs

Epoch 40/50 - 7.241297522%.2fs - loss: 509.01236 - accuracy: 0.85082585 - batches: 1054
Epoch 41/50 - 7.430544655%.2fs - loss: 508.6808 - accuracy: 0.85147876 - batches: 1054
Epoch 42/50 - 7.475286196%.2fs - loss: 508.36176 - accuracy: 0.8517755 - batches: 1054
Epoch 43/50 - 7.275063993%.2fs - loss: 508.0474 - accuracy: 0.85213166 - batches: 1054
Epoch 44/50 - 7.053398349%.2fs - loss: 507.73767 - accuracy: 0.8527252 - batches: 1054
Epoch 45/50 - 7.019040194%.2fs - loss: 507.4303 - accuracy: 0.85331875 - batches: 1054
Epoch 46/50 - 6.971686473%.2fs - loss: 507.12305 - accuracy: 0.85355616 - batches: 1054
Epoch 47/50 - 7.245352824%.2fs - loss: 506.8202 - accuracy: 0.8542684 - batches: 1054
Epoch 48/50 - 7.451507109%.2fs - loss: 506.52145 - accuracy: 0.85474324 - batches: 1054
Epoch 49/50 - 7.284613728%.2fs - loss: 506.2306 - accuracy: 0.8551587 - batches: 1054


In [None]:
preds = use_clf_pipelineModel.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

          AE       0.59      0.34      0.43       881
         Neg       0.84      0.94      0.89      3341

    accuracy                           0.81      4222
   macro avg       0.72      0.64      0.66      4222
weighted avg       0.79      0.81      0.79      4222



In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_917e76dbeb16.log
# with 100 epochs

Epoch 90/100 - 7.107793742%.2fs - loss: 490.2443 - accuracy: 0.8437627 - batches: 1054
Epoch 91/100 - 7.449995695%.2fs - loss: 490.06836 - accuracy: 0.84411883 - batches: 1054
Epoch 92/100 - 7.232593671%.2fs - loss: 489.8934 - accuracy: 0.8442969 - batches: 1054
Epoch 93/100 - 6.972248433%.2fs - loss: 489.71902 - accuracy: 0.8442969 - batches: 1054
Epoch 94/100 - 7.115250338%.2fs - loss: 489.5486 - accuracy: 0.844475 - batches: 1054
Epoch 95/100 - 7.230228216%.2fs - loss: 489.37936 - accuracy: 0.844653 - batches: 1054
Epoch 96/100 - 7.105980567%.2fs - loss: 489.2125 - accuracy: 0.8449498 - batches: 1054
Epoch 97/100 - 7.289306271%.2fs - loss: 489.0479 - accuracy: 0.84483105 - batches: 1054
Epoch 98/100 - 7.182141736%.2fs - loss: 488.88452 - accuracy: 0.84524655 - batches: 1054
Epoch 99/100 - 7.612609609%.2fs - loss: 488.72177 - accuracy: 0.8453059 - batches: 1054


In [None]:
preds = use_clf_pipelineModel.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

          AE       0.60      0.32      0.42       881
         Neg       0.84      0.94      0.89      3341

    accuracy                           0.81      4222
   macro avg       0.72      0.63      0.65      4222
weighted avg       0.79      0.81      0.79      4222



In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_5c1c2243eac0.log
# with 50 epochs, bathc 8

Epoch 40/50 - 9.369026172%.2fs - loss: 930.22797 - accuracy: 0.8653253 - batches: 2107
Epoch 41/50 - 9.31812987%.2fs - loss: 929.064 - accuracy: 0.8658001 - batches: 2107
Epoch 42/50 - 9.304147095%.2fs - loss: 927.9366 - accuracy: 0.86633426 - batches: 2107
Epoch 43/50 - 10.075942242%.2fs - loss: 926.831 - accuracy: 0.8663936 - batches: 2107
Epoch 44/50 - 9.685581395%.2fs - loss: 925.7427 - accuracy: 0.86716527 - batches: 2107
Epoch 45/50 - 9.554822793%.2fs - loss: 924.67346 - accuracy: 0.8675807 - batches: 2107
Epoch 46/50 - 9.912348859%.2fs - loss: 923.62415 - accuracy: 0.8680556 - batches: 2107
Epoch 47/50 - 10.988392763%.2fs - loss: 922.58044 - accuracy: 0.86835235 - batches: 2107
Epoch 48/50 - 10.980015198%.2fs - loss: 921.5505 - accuracy: 0.86912394 - batches: 2107
Epoch 49/50 - 9.983640328%.2fs - loss: 920.54 - accuracy: 0.8697175 - batches: 2107


In [None]:
preds = use_clf_pipelineModel.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

          AE       0.62      0.33      0.44       881
         Neg       0.84      0.95      0.89      3341

    accuracy                           0.82      4222
   macro avg       0.73      0.64      0.66      4222
weighted avg       0.80      0.82      0.80      4222



In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_5ba60b31eed5.log
# with 100 epochs, bathc 8

Epoch 90/100 - 9.787622493%.2fs - loss: 895.67804 - accuracy: 0.8953585 - batches: 2107
Epoch 91/100 - 9.489642055%.2fs - loss: 895.2891 - accuracy: 0.89553654 - batches: 2107
Epoch 92/100 - 9.617070758%.2fs - loss: 894.90814 - accuracy: 0.8955959 - batches: 2107
Epoch 93/100 - 9.910288144%.2fs - loss: 894.5366 - accuracy: 0.895774 - batches: 2107
Epoch 94/100 - 9.540695458%.2fs - loss: 894.17566 - accuracy: 0.8958927 - batches: 2107
Epoch 95/100 - 9.49872604%.2fs - loss: 893.8174 - accuracy: 0.8960708 - batches: 2107
Epoch 96/100 - 9.478969646%.2fs - loss: 893.4732 - accuracy: 0.8962488 - batches: 2107
Epoch 97/100 - 9.4266269%.2fs - loss: 893.12933 - accuracy: 0.89660496 - batches: 2107
Epoch 98/100 - 9.133465759%.2fs - loss: 892.7991 - accuracy: 0.8966643 - batches: 2107
Epoch 99/100 - 9.811887598%.2fs - loss: 892.47284 - accuracy: 0.8967236 - batches: 2107


In [None]:
preds = use_clf_pipelineModel.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

          AE       0.62      0.30      0.40       881
         Neg       0.84      0.95      0.89      3341

    accuracy                           0.82      4222
   macro avg       0.73      0.62      0.65      4222
weighted avg       0.79      0.82      0.79      4222



In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_79a3a02f42b1.log
# with 200 epochs, bathc 8

Epoch 190/200 - 9.844052978%.2fs - loss: 882.0651 - accuracy: 0.9003358 - batches: 2107
Epoch 191/200 - 9.282441487%.2fs - loss: 881.87054 - accuracy: 0.90039515 - batches: 2107
Epoch 192/200 - 21.047282111%.2fs - loss: 881.67017 - accuracy: 0.9004545 - batches: 2107
Epoch 193/200 - 9.818631247%.2fs - loss: 881.4752 - accuracy: 0.9005138 - batches: 2107
Epoch 194/200 - 10.841587139%.2fs - loss: 881.2806 - accuracy: 0.9005732 - batches: 2107
Epoch 195/200 - 9.234538668%.2fs - loss: 881.0885 - accuracy: 0.90063256 - batches: 2107
Epoch 196/200 - 8.885203833%.2fs - loss: 880.89844 - accuracy: 0.90063256 - batches: 2107
Epoch 197/200 - 9.499566853%.2fs - loss: 880.71045 - accuracy: 0.9007513 - batches: 2107
Epoch 198/200 - 9.331700172%.2fs - loss: 880.5219 - accuracy: 0.9007513 - batches: 2107
Epoch 199/200 - 9.764149199%.2fs - loss: 880.3384 - accuracy: 0.9008106 - batches: 2107


In [None]:
preds = use_clf_pipelineModel.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

          AE       0.60      0.34      0.43       881
         Neg       0.84      0.94      0.89      3341

    accuracy                           0.82      4222
   macro avg       0.72      0.64      0.66      4222
weighted avg       0.79      0.82      0.79      4222



## ClassifierDL with BERT Sentence Embeddings

In [12]:
from pyspark.sql.functions import col

trainingData.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()


testData.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+
|category|count|
+--------+-----+
|     Neg|13430|
|      AE| 3432|
+--------+-----+

+--------+-----+
|category|count|
+--------+-----+
|     Neg| 3385|
|      AE|  830|
+--------+-----+



In [13]:
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")
    
bert_sent = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_768")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setBatchSize(8)\
  .setLr()
  .setEnableOutputLogs(True)

bert_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])

sent_small_bert_L2_768 download started this may take some time.
Approximate size to download 139.6 MB
[OK!]


In [14]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)
# 5 epochs takes around 10

CPU times: user 95 ms, sys: 16 ms, total: 111 ms
Wall time: 8min 16s


In [15]:
preds = bert_clf_pipeline.transform(testData)

preds.select('category','description',"class.result").show(10, truncate=80)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evalute the results on test dataset

from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['category']))


+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     Neg|Emphasis is placed on the significance of the possible relationship and simil...| [Neg]|
|     Neg|Two case reports and data from literature on the subject are used by the auth...| [Neg]|
|     Neg|                            Two case reports and their theoretical implications.| [Neg]|
|     Neg|A patient is described with polycythemia vera who was taking anticoagulants a...| [Neg]|
|     Neg|It is suggested that the long-term use of androgenic-anabolic steroids is the...| [Neg]|
|     Neg|Paradoxical lithium neurotoxicity: a report of five cases and a hypothesis ab...| [Neg]|
|     Neg|The patients who developed neurotoxicity had markedly higher global ratings o...| [Neg]|
|     Neg|

  _warn_prf(average, modifier, msg_start, len(result))


### Lets do some tuning

In [27]:
bert_sent = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_768")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setBatchSize(8)\
  .setLr(1e-4)\
  .setEnableOutputLogs(True)

bert_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])

sent_small_bert_L2_768 download started this may take some time.
Approximate size to download 139.6 MB
[OK!]


In [28]:
classsifierdl.getDropout()

0.5

In [25]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)


CPU times: user 93 ms, sys: 29.7 ms, total: 123 ms
Wall time: 9min 14s


In [26]:
preds = bert_clf_pipeline.transform(testData)

preds.select('category','description',"class.result").show(10, truncate=80)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evalute the results on test dataset

from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['category']))


+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     Neg|Emphasis is placed on the significance of the possible relationship and simil...| [Neg]|
|     Neg|Two case reports and data from literature on the subject are used by the auth...| [Neg]|
|     Neg|                            Two case reports and their theoretical implications.| [Neg]|
|     Neg|A patient is described with polycythemia vera who was taking anticoagulants a...| [Neg]|
|     Neg|It is suggested that the long-term use of androgenic-anabolic steroids is the...| [Neg]|
|     Neg|Paradoxical lithium neurotoxicity: a report of five cases and a hypothesis ab...|  [AE]|
|     Neg|The patients who developed neurotoxicity had markedly higher global ratings o...|  [AE]|
|     Neg|

In [30]:
# 15 epochs

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(15)\
  .setBatchSize(8)\
  .setLr(1e-4)\
  .setEnableOutputLogs(True)

bert_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])

In [31]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)


CPU times: user 121 ms, sys: 17.5 ms, total: 138 ms
Wall time: 11min 48s


In [32]:
preds = bert_clf_pipeline.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evalute the results on test dataset

from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['category']))


              precision    recall  f1-score   support

          AE       0.64      0.72      0.68       737
         Neg       0.94      0.91      0.93      3478

    accuracy                           0.88      4215
   macro avg       0.79      0.82      0.80      4215
weighted avg       0.89      0.88      0.88      4215



In [36]:
from sklearn.metrics import accuracy_score

print (accuracy_score(preds_df['result'], preds_df['category']))


0.8773428232502966


In [None]:
## 

### Upsampling the training set

In [33]:
trainingData_aug = trainingData.union(trainingData).union(trainingData)


In [34]:
# 15 epochs

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(15)\
  .setBatchSize(8)\
  .setLr(1e-4)\
  .setEnableOutputLogs(True)


bert_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData_aug)


In [35]:
preds = bert_clf_pipeline.transform(testData)

preds_df = preds.select('category','description',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evalute the results on test dataset

from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['category']))


              precision    recall  f1-score   support

          AE       0.65      0.70      0.68       771
         Neg       0.93      0.92      0.92      3444

    accuracy                           0.88      4215
   macro avg       0.79      0.81      0.80      4215
weighted avg       0.88      0.88      0.88      4215



### with ade_df2 (almost same as ade_df, just for testing)

In [40]:

spark_df2 = spark.createDataFrame(ade_df2)

(trainingData, testData) = spark_df2.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 16716
Test Dataset Count: 4180


In [41]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("label")\
  .setMaxEpochs(10)\
  .setBatchSize(8)\
  .setLr(1e-4)\
  .setEnableOutputLogs(True)


bert_clf_pipeline = Pipeline(
    stages = [
        document,
        bert_sent,
        classsifierdl
    ])

bert_clf_pipeline2 = bert_clf_pipeline.fit(trainingData)


In [43]:
preds = bert_clf_pipeline2.transform(testData)

preds_df = preds.select('label','text',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# We are going to use sklearn to evalute the results on test dataset

from sklearn.metrics import classification_report

print (classification_report(preds_df['result'], preds_df['label']))
print (accuracy_score(preds_df['result'], preds_df['label']))


              precision    recall  f1-score   support

    negative       0.62      0.65      0.63       775
    positive       0.92      0.91      0.91      3405

    accuracy                           0.86      4180
   macro avg       0.77      0.78      0.77      4180
weighted avg       0.86      0.86      0.86      4180

0.8614832535885167


In [47]:
ade_df2.label.value_counts(normalize=True)

positive    0.795607
negative    0.204393
Name: label, dtype: float64

In [48]:
ade_df2.label.value_counts()

positive    16625
negative     4271
Name: label, dtype: int64