#  Adverse Drug Event Classifier


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/clinical_text_classification/2.Drug_Adverse_Event_Classification.ipynb)

In [None]:
import json
import os

from google.colab import files

# Ask for the license file through the Colab upload widget unless a file
# named 'spark_jsl.json' is already present in the working directory.
if 'spark_jsl.json' not in os.listdir():
    license_keys = files.upload()
    # Store the uploaded file under the expected name, whatever it was called.
    os.rename(list(license_keys)[0], 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Publish every license entry as a notebook-level variable and as an
# environment variable; later cells refer to names such as PUBLIC_VERSION,
# JSL_VERSION and SECRET.
locals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [None]:
# if you want to start the session with custom params, use the start function below
from pyspark.sql import SparkSession

def start(SECRET):
    """Create (or reuse) a local SparkSession set up for licensed Spark NLP.

    Args:
        SECRET: license secret placed in the download URL of the
            spark-nlp-jsl jar.

    Returns:
        The session produced by ``getOrCreate()`` on the configured builder.

    Note:
        Reads the notebook-level variables ``PUBLIC_VERSION`` and
        ``JSL_VERSION`` to select the package and jar versions.
    """
    builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed")
        .master("local[*]")
        .config("spark.driver.memory", "16G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        # Open-source Spark NLP is resolved as a package...
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:"+PUBLIC_VERSION)
        # ...while the licensed jar is fetched from a URL containing the secret.
        .config("spark.jars", "https://pypi.johnsnowlabs.com/"+SECRET+"/spark-nlp-jsl-"+JSL_VERSION+".jar")
    )

    return builder.getOrCreate()

#spark = start(SECRET)

In [3]:
import json
import os

from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

import sparknlp_jsl
import sparknlp

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
from sparknlp.common import *

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Spark settings handed to the licensed session starter.
params = {
    # Amount of memory to use for the driver process, i.e. where SparkContext is initialized
    "spark.driver.memory": "16G",
    # Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified.
    "spark.kryoserializer.buffer.max": "2000M",
    # Limit of total size of serialized results of all partitions for each Spark action
    # (e.g. collect) in bytes. Should be at least 1M, or 0 for unlimited.
    "spark.driver.maxResultSize": "2000M",
}

spark = sparknlp_jsl.start(license_keys['SECRET'], params=params, gpu=True)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Leave the session as the last expression so the notebook displays it.
spark

Spark NLP Version : 4.0.2
Spark NLP_JSL Version : 4.0.2


### Data Preprocessing

In [None]:
#downloading sample datasets
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/ADE_Corpus_V2/ADE-NEG.txt
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/ADE_Corpus_V2/DRUG-AE.rel

**ADE-Negative Dataset**

In [None]:
# The download cell saves ADE-NEG.txt into the current working directory, so
# read it from there rather than from the Colab-only absolute path /content/.
# With header=None and names=["col1"], each record is loaded into the single
# column "col1" (see the preview below).
df_1 = pd.read_csv("ADE-NEG.txt", header=None, delimiter="\t", names=["col1"])
df_1.head()

Unnamed: 0,col1
0,6460590 NEG Clioquinol intoxication occurring ...
1,"8600337 NEG ""Retinoic acid syndrome"" was preve..."
2,8402502 NEG BACKGROUND: External beam radiatio...
3,"8700794 NEG Although the enuresis ceased, she ..."
4,17662448 NEG A 42-year-old woman had uneventfu...


In [None]:
# Each record looks like "<id> NEG <sentence>". Split only on the FIRST "NEG"
# marker (n=1): without the limit, a sentence that itself contains "NEG"
# would be cut off at that point and lose the rest of its text.
df_1['description'] = df_1.col1.str.split('NEG', n=1).str[1]
# Every row of this file is a negative (no adverse drug event) example.
df_1["category"] = "neg"
# Keep only the two columns used by the rest of the notebook.
df_1 = df_1[["description", "category"]]
df_1.head()

Unnamed: 0,description,category
0,Clioquinol intoxication occurring in the trea...,neg
1,"""Retinoic acid syndrome"" was prevented with s...",neg
2,BACKGROUND: External beam radiation therapy o...,neg
3,"Although the enuresis ceased, she developed t...",neg
4,A 42-year-old woman had uneventful bilateral ...,neg


In [None]:
df_1.count()

description    16695
category       16695
dtype: int64

**ADE Positive Dataset**

In [None]:
# The download cell saves DRUG-AE.rel into the current working directory, so
# read it from there rather than from the Colab-only absolute path /content/.
# The file is pipe-delimited and has no header row, so columns are numbered.
df_2 = pd.read_csv("DRUG-AE.rel", header=None, delimiter="|")
df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,10030778,Intravenous azithromycin-induced ototoxicity.,ototoxicity,43,54,azithromycin,22,34
1,10048291,"Immobilization, while Paget's bone disease was...",increased calcium-release,960,985,dihydrotachysterol,908,926
2,10048291,Unaccountable severe hypercalcemia in a patien...,hypercalcemia,31,44,dihydrotachysterol,94,112
3,10082597,METHODS: We report two cases of pseudoporphyri...,pseudoporphyria,620,635,naproxen,646,654
4,10082597,METHODS: We report two cases of pseudoporphyri...,pseudoporphyria,620,635,oxaprozin,659,668


In [None]:
# Tag every row of the drug/adverse-event file as a positive example.
df_2["category"] = "pos"
# Column 1 holds the sentence text; expose it as "description" and keep only
# the two columns shared with the negative frame.
df_2 = df_2.rename(columns={1: "description"})[["description", "category"]]
df_2.head()

Unnamed: 0,description,category
0,Intravenous azithromycin-induced ototoxicity.,pos
1,"Immobilization, while Paget's bone disease was...",pos
2,Unaccountable severe hypercalcemia in a patien...,pos
3,METHODS: We report two cases of pseudoporphyri...,pos
4,METHODS: We report two cases of pseudoporphyri...,pos


In [None]:
df_2.count()

description    6821
category       6821
dtype: int64

**Merging Positive and Negative dataset**

In [None]:
ade_df= pd.concat([df_1, df_2])
ade_df.head()

Unnamed: 0,description,category
0,Clioquinol intoxication occurring in the trea...,neg
1,"""Retinoic acid syndrome"" was prevented with s...",neg
2,BACKGROUND: External beam radiation therapy o...,neg
3,"Although the enuresis ceased, she developed t...",neg
4,A 42-year-old woman had uneventful bilateral ...,neg


In [None]:
ade_df[["category"]].value_counts()

category
neg         16695
pos          6821
dtype: int64

In [None]:
ade_df.count()

description    23516
category       23516
dtype: int64

In [None]:
# Move the merged pandas frame into Spark, then make a seeded 80/20 split.
spark_df = spark.createDataFrame(ade_df)

trainingData, testData = spark_df.randomSplit([0.8, 0.2], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 18758
Test Dataset Count: 4758


## Text cleaning in Spark NLP and CV + LogReg with Spark ML

In [None]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
           countVectors,
           label_stringIdx])

# NOTE(review): the whole pipeline -- including the CountVectorizer vocabulary
# and the StringIndexer -- is fitted on the full `spark_df`; the train/test
# split is only made afterwards on `processed`, so the features are built with
# knowledge of rows that end up in the test set.
nlp_model = nlp_pipeline.fit(spark_df)

# Apply the fitted stages to every row of the full frame.
processed = nlp_model.transform(spark_df)

# Row count of the transformed frame.
processed.count()

23516

In [None]:
(trainingData, testData) = processed.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 18855
Test Dataset Count: 4661


In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 1) \
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|CONCLUSION: Alendronate led...|     pos|[0.49974193018407226,0.5002...|  1.0|       1.0|
| The effects of lorcainide,...|     neg|[0.49933435423839734,0.5006...|  0.0|       1.0|
|Drug induced polymyositis s...|     pos|[0.49923930906950914,0.5007...|  1.0|       1.0|
|Prolonged prostate-specific...|     pos|[0.49883713639022553,0.5011...|  1.0|       1.0|
|Rebound hyperglycemia follo...|     pos|[0.49807055264010996,0.5019...|  1.0|       1.0|
|It was restarted 6 weeks la...|     pos|[0.49717020852263233,0.5028...|  1.0|       1.0|
|Generalized lichen nitidus ...|     pos|[0.49684918505672215,0.5031...|  1.0|       1.0|
|We report on a young adoles...|     pos|[0.49660345181915067,0.5033...|  1.0|       1.0|
| Severe 5

In [None]:
preds_df = predictions.select('category','description',"prediction",'label').toPandas()

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.83      0.97      0.90      3311
         1.0       0.89      0.53      0.66      1350

    accuracy                           0.84      4661
   macro avg       0.86      0.75      0.78      4661
weighted avg       0.85      0.84      0.83      4661



## Text cleaning in Spark NLP and TfIDF + LogReg with Spark ML


In [None]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
           idf,
           label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(spark_df)

processed_tf = nlp_model_tf.transform(spark_df)

processed_tf.count()


23516

In [None]:
(trainingData, testData) = processed_tf.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 18855
Test Dataset Count: 4661


In [None]:

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| For nearly half a century,...|     neg|[0.9946444041060146,0.00535...|  0.0|       0.0|
| Bronchoalveolar lavage (BA...|     neg|[0.9917729141098448,0.00822...|  0.0|       0.0|
| RESULTS: Extensive laborat...|     neg|[0.9902638495158449,0.00973...|  0.0|       0.0|
| RESULTS: Exposure to the v...|     neg|[0.9894311964125672,0.01056...|  0.0|       0.0|
| Careful donor evaluation b...|     neg|[0.9857071869866806,0.01429...|  0.0|       0.0|
| Cerebrospinal fluid smear ...|     neg|[0.982740508253554,0.017259...|  0.0|       0.0|
| CONCLUSION: An indium (111...|     neg|[0.9820077197993587,0.01799...|  0.0|       0.0|
| The cases, as compared wit...|     neg|[0.9817324634528142,0.01826...|  0.0|       0.0|
| These fi

In [None]:
preds_df = predictions.select('category','description',"prediction",'label').toPandas()

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.82      0.97      0.89      3311
         1.0       0.88      0.49      0.63      1350

    accuracy                           0.83      4661
   macro avg       0.85      0.73      0.76      4661
weighted avg       0.84      0.83      0.82      4661



## Text cleaning and featurizing in Spark NLP and LogReg with Spark ML


In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
          .setInputCols(["document", "cleanTokens"])\
          .setOutputCol("embeddings")\
        .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")
    
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_w2v = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            clinical_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
           label_stringIdx])

nlp_model_w2v = nlp_pipeline_w2v.fit(spark_df)

processed_w2v = nlp_model_w2v.transform(spark_df)

processed_w2v.count()


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


23516

In [None]:
(trainingData, testData) = processed_w2v.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 18855
Test Dataset Count: 4661


In [None]:
from pyspark.sql.functions import udf

@udf("long")
def num_nonzeros(v):
    """Spark UDF returning ``v.numNonzeros()`` for the given column value.

    Used below to filter out rows whose ``features`` value reports zero
    non-zero entries.
    """
    nonzero_count = v.numNonzeros()
    return nonzero_count

testData = testData.where(num_nonzeros("features") != 0)
testData.count()

4661

In [None]:
trainingData = trainingData.where(num_nonzeros("features") != 0)
trainingData.count()

18849

In [None]:

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions\
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| Blood cultures were negative.|     neg|[0.9930374804963886,0.00696...|  0.0|       0.0|
| A prompt and accurate diag...|     neg|[0.9870876807714636,0.01291...|  0.0|       0.0|
| Diagnosis was confirmed by...|     neg|[0.9857802488552496,0.01421...|  0.0|       0.0|
| Subsequently, the pedicle ...|     neg|[0.9851161423499767,0.01488...|  0.0|       0.0|
|  His recovery was uneventful.|     neg|[0.9839023068014584,0.01609...|  0.0|       0.0|
|  His recovery was uneventful.|     neg|[0.9839023068014584,0.01609...|  0.0|       0.0|
| Sensation of all modalitie...|     neg|[0.9827532044461809,0.01724...|  0.0|       0.0|
| DATA SOURCES: All relevant...|     neg|[0.9826784317594773,0.01732...|  0.0|       0.0|
| Culture 

In [None]:
preds_df = predictions.select('category','description',"prediction",'label').toPandas()

from sklearn.metrics import classification_report

print (classification_report(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.78      0.95      0.85      3313
         1.0       0.71      0.33      0.45      1348

    accuracy                           0.77      4661
   macro avg       0.75      0.64      0.65      4661
weighted avg       0.76      0.77      0.74      4661



## Text cleaning, featurizing and classification in Spark NLP


In [None]:
!wget https://raw.githubusercontent.com/kavgan/clinical-concepts/master/clinical-stopwords.txt

In [None]:
with open('clinical-stopwords.txt', 'r') as f:
    # Skip the first line of the file and trim surrounding whitespace
    # (including the trailing newline) from every remaining entry.
    stops = [line.strip() for line in f.readlines()[1:]]

# Peek at the first few stop words.
stops[:3]

['x', 'y', 'your']

In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setStopWords(stops)\
    .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
    .setInputCols(["document", "lemma"])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setMaxEpochs(10)\
    .setBatchSize(8)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            clinical_embeddings,
            embeddingsSentence,
            classsifierdl])


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [None]:
%%time

clf_pipelineModel = clf_pipeline.fit(trainingData)

CPU times: user 617 ms, sys: 118 ms, total: 735 ms
Wall time: 1min 19s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 4
-rw-r--r-- 1 root root 818 Jul 21 12:06 ClassifierDLApproach_37e6ad7df474.log


In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_37e6ad7df474.log

Training started - epochs: 10 - learning_rate: 0.005 - batch_size: 8 - training_examples: 18793 - classes: 2
Epoch 0/10 - 6.51s - loss: 1442.6681 - acc: 0.7096637 - batches: 2350
Epoch 1/10 - 4.45s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 2/10 - 4.28s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 3/10 - 4.21s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 4/10 - 4.21s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 5/10 - 4.20s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 6/10 - 4.24s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 7/10 - 4.28s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 8/10 - 4.23s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350
Epoch 9/10 - 4.21s - loss: 1442.1212 - acc: 0.70987654 - batches: 2350


In [None]:
preds = clf_pipelineModel.transform(testData)

preds.select('category','description',"class.result").show(10, truncate=80)

+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     neg| "Syndrome malin"-like symptoms probably due to interaction between neurolept...| [neg]|
|     neg|                                       (It is no longer contained in Correctol.)| [neg]|
|     neg| 2-Chlordeoxyadenosine (2-CdA) is an antineoplastic/immunosuppressive agent u...| [neg]|
|     neg|         5: Movement disorders I: parkinsonism and the akinetic-rigid syndromes.| [neg]|
|     neg| A 13-year-old boy was diagnosed as acute lymphoblastic leukemia following ra...| [neg]|
|     neg| A 16-year-old girl with pulmonary stenosis who underwent reconstruction of t...| [neg]|
|     neg| A 27-year-old man presented with corneal ectasia in his left eye 4 years aft...| [neg]|
|     neg|

In [None]:
preds_df = preds.select('category','description',"class.result").toPandas()

In [None]:
# The result is an array since you can have multiple sentences in Spark NLP.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [None]:
preds_df.sample(10)

Unnamed: 0,category,description,result
4618,pos,We present the cases of two female patients di...,neg
2706,neg,Intravenous phenytoin was urgently administer...,neg
3344,neg,"When catheter disconnection, kink, or dislodg...",neg
4601,pos,We describe two women who developed HUS after ...,neg
681,neg,Successful management was made possible by a ...,neg
4421,pos,The patient described feeling cold with worsen...,neg
1260,neg,Capillary leak syndrome (CLS) commonly occurs...,neg
1150,neg,An objective causality assessment revealed th...,neg
4612,pos,We present a case of ethylenediamine-induced d...,neg
2477,neg,Childhood dermatomyositis and polymyositis.,neg


In [None]:
from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

         neg       0.71      1.00      0.83      3362
         pos       0.00      0.00      0.00      1361

    accuracy                           0.71      4723
   macro avg       0.36      0.50      0.42      4723
weighted avg       0.51      0.71      0.59      4723



#### Cope With Imbalanced Data
We will decrease the number of "neg" examples to match the number of "pos" examples. Because this is done before the train/test split, both the training set and the test set end up roughly balanced.

In [None]:
#existing size of categories
print("Number of negative values: \n {}".format(df_1.count()))
print("Number of positive values: \n {}".format(df_2.count()))

Number of negative values: 
 description    16695
category       16695
dtype: int64
Number of positive values: 
 description    6821
category       6821
dtype: int64


In [None]:
# Downsample the negatives to the number of positives. The size is taken from
# df_2 rather than hard-coded as 6821, so the classes stay balanced if either
# input file changes. The first len(df_2) negative rows are kept.
df_1_limited = df_1[:len(df_2)]  # df_1 -> negative values

ade_df_balanced = pd.concat([df_1_limited, df_2])
ade_df_balanced.category.value_counts()

neg    6821
pos    6821
Name: category, dtype: int64

In [None]:
spark_df_balanced = spark.createDataFrame(ade_df_balanced)

(tr_balanced, test_balanced) = spark_df_balanced.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset size: " + str(tr_balanced.count()))
print("Test Dataset Count: " + str(test_balanced.count()))

Training Dataset size: 10894
Test Dataset Count: 2748


In [None]:
tr_balanced.groupBy("category").count().show()

+--------+-----+
|category|count|
+--------+-----+
|     pos| 5440|
|     neg| 5454|
+--------+-----+



In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setStopWords(stops)\
    .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
    .setInputCols(["document", "lemma"])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setMaxEpochs(10)\
    .setBatchSize(8)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            clinical_embeddings,
            embeddingsSentence,
            classsifierdl])


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [None]:
%%time

clf_pipelineModel = clf_pipeline.fit(tr_balanced)

CPU times: user 406 ms, sys: 88.8 ms, total: 495 ms
Wall time: 40 s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 8
-rw-r--r-- 1 root root 805 Jul 21 12:42 ClassifierDLApproach_88989fde256c.log
-rw-r--r-- 1 root root 818 Jul 21 12:06 ClassifierDLApproach_37e6ad7df474.log


In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_88989fde256c.log

Training started - epochs: 10 - learning_rate: 0.005 - batch_size: 8 - training_examples: 10894 - classes: 2
Epoch 0/10 - 2.71s - loss: 728.03687 - acc: 0.7819312 - batches: 1362
Epoch 1/10 - 2.46s - loss: 650.66187 - acc: 0.8348335 - batches: 1362
Epoch 2/10 - 2.46s - loss: 630.06805 - acc: 0.8535085 - batches: 1362
Epoch 3/10 - 2.47s - loss: 614.74176 - acc: 0.8675606 - batches: 1362
Epoch 4/10 - 2.46s - loss: 598.3864 - acc: 0.87885743 - batches: 1362
Epoch 5/10 - 2.47s - loss: 587.05133 - acc: 0.88473547 - batches: 1362
Epoch 6/10 - 2.47s - loss: 579.38 - acc: 0.89235854 - batches: 1362
Epoch 7/10 - 2.47s - loss: 570.0333 - acc: 0.89741 - batches: 1362
Epoch 8/10 - 2.49s - loss: 564.0628 - acc: 0.90236956 - batches: 1362
Epoch 9/10 - 2.50s - loss: 558.49133 - acc: 0.9041146 - batches: 1362


In [None]:
preds = clf_pipelineModel.transform(test_balanced)

preds.select('category','description',"class.result").show(10, truncate=80)

+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     neg| 'Bail-out' bivalirudin use in patients with thrombotic complications unrespo...| [neg]|
|     neg|         5: Movement disorders I: parkinsonism and the akinetic-rigid syndromes.| [pos]|
|     neg| A 17-year-old anuric female patient with end-stage renal failure received a ...| [pos]|
|     neg| A 28-year-old female patient complained of intractable pain in the left arm,...| [neg]|
|     neg| A 31-year-old female was found to have FIGO Stage IIB squamous cell carcinom...| [neg]|
|     neg| A 34-year-old male with lumbar disc disease and surgery was placed on gabape...| [neg]|
|     neg| A 42-year-old woman had uneventful bilateral laser-assisted subepithelial ke...| [neg]|
|     neg|

In [None]:
preds_df = preds.select('category','description',"class.result").toPandas()

In [None]:
# The result is an array since you can have multiple sentences in Spark NLP.
# Let's explode the array and get the item(s) inside of result column out
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [None]:
preds_df.sample(10)

Unnamed: 0,category,description,result
1339,neg,We report a case of a 72-year-old female who ...,neg
701,neg,As the disfiguring lesions had a marked adver...,neg
1460,pos,A 65-year-old man on warfarin therapy with a s...,pos
645,neg,AML/MDS occurred in 3/1374 CLL patients seen ...,neg
908,neg,"In 1984, a 56-year-old house painter develope...",neg
694,neg,Angioimmunoblastic lymphadenopathy (AILD) may...,pos
1463,pos,A 7-year-old girl developed diabetes mellitus ...,pos
1977,pos,We believe that the acute renal failure in our...,pos
1127,neg,Systemic complications associated with retina...,neg
262,neg,Monitoring of the long-term safety profile of...,neg


In [None]:
from sklearn.metrics import classification_report

print (classification_report(preds_df['category'], preds_df['result']))

              precision    recall  f1-score   support

         neg       0.83      0.86      0.85      1367
         pos       0.86      0.83      0.84      1381

    accuracy                           0.85      2748
   macro avg       0.85      0.85      0.85      2748
weighted avg       0.85      0.85      0.85      2748



## ClassifierDL with Universal Sentence Embeddings

In [None]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")
    
# we can also use a sentence detector here if we want to train on and get predictions for each sentence

use = UniversalSentenceEncoder.pretrained()\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setBatchSize(8)\
  .setMaxEpochs(10)\
  .setLr(0.001)\
  .setEnableOutputLogs(True)

use_clf_pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
%%time

use_clf_pipelineModel = use_clf_pipeline.fit(trainingData)

CPU times: user 568 ms, sys: 68.4 ms, total: 636 ms
Wall time: 1min 54s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 4
-rw-r--r-- 1 root root 823 Jul 22 10:25 ClassifierDLApproach_0d0188b956ca.log


In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_0d0188b956ca.log
# with 10 epochs

Training started - epochs: 10 - learning_rate: 0.001 - batch_size: 8 - training_examples: 18758 - classes: 2
Epoch 0/10 - 10.19s - loss: 1244.0745 - acc: 0.76171434 - batches: 2345
Epoch 1/10 - 10.18s - loss: 1172.0829 - acc: 0.79168445 - batches: 2345
Epoch 2/10 - 10.01s - loss: 1151.2114 - acc: 0.80330986 - batches: 2345
Epoch 3/10 - 9.99s - loss: 1136.3204 - acc: 0.81232226 - batches: 2345
Epoch 4/10 - 10.07s - loss: 1124.102 - acc: 0.8190949 - batches: 2345
Epoch 5/10 - 10.10s - loss: 1113.3309 - acc: 0.82433873 - batches: 2345
Epoch 6/10 - 10.22s - loss: 1103.3389 - acc: 0.8300448 - batches: 2345
Epoch 7/10 - 10.01s - loss: 1094.1273 - acc: 0.83425766 - batches: 2345
Epoch 8/10 - 10.07s - loss: 1085.757 - acc: 0.83900386 - batches: 2345
Epoch 9/10 - 10.17s - loss: 1078.0099 - acc: 0.8423635 - batches: 2345


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = use_clf_pipelineModel.transform(testData)

# Bring gold label, text and predicted label(s) into pandas.
preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is indexable; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))

              precision    recall  f1-score   support

         neg       0.82      0.92      0.87      3367
         pos       0.73      0.52      0.61      1391

    accuracy                           0.80      4758
   macro avg       0.77      0.72      0.74      4758
weighted avg       0.79      0.80      0.79      4758



In [None]:
# with 50 epochs
# Same classifier settings as before, only the epoch count changes; the gold
# labels are still read from the "category" column.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setBatchSize(8)
    .setMaxEpochs(50)
    .setLr(0.001)
    .setEnableOutputLogs(True)
)

# Rebuild the pipeline so it picks up the new classifier stage.
use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

In [None]:
%%time

# Fit every stage of the pipeline on the training split (50-epoch configuration).
use_clf_pipelineModel = use_clf_pipeline.fit(trainingData)

CPU times: user 2.35 s, sys: 214 ms, total: 2.56 s
Wall time: 8min 18s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 8
-rw-r--r-- 1 root root 3657 Jul 22 10:36 ClassifierDLApproach_df5632645d19.log
-rw-r--r-- 1 root root  823 Jul 22 10:25 ClassifierDLApproach_0d0188b956ca.log


In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_df5632645d19.log

Epoch 40/50 - 9.81s - loss: 987.13257 - acc: 0.89259815 - batches: 2345
Epoch 41/50 - 9.54s - loss: 985.7355 - acc: 0.8931847 - batches: 2345
Epoch 42/50 - 9.57s - loss: 984.3489 - acc: 0.89403796 - batches: 2345
Epoch 43/50 - 9.45s - loss: 983.00433 - acc: 0.8947312 - batches: 2345
Epoch 44/50 - 9.49s - loss: 981.68994 - acc: 0.89521116 - batches: 2345
Epoch 45/50 - 9.50s - loss: 980.44336 - acc: 0.8957978 - batches: 2345
Epoch 46/50 - 9.46s - loss: 979.24133 - acc: 0.8965977 - batches: 2345
Epoch 47/50 - 9.59s - loss: 978.0944 - acc: 0.89713097 - batches: 2345
Epoch 48/50 - 9.99s - loss: 977.00653 - acc: 0.8971843 - batches: 2345
Epoch 49/50 - 9.97s - loss: 975.9665 - acc: 0.89766425 - batches: 2345


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = use_clf_pipelineModel.transform(testData)

# Bring gold label, text and predicted label(s) into pandas.
preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is indexable; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))

              precision    recall  f1-score   support

         neg       0.85      0.91      0.88      3367
         pos       0.73      0.61      0.66      1391

    accuracy                           0.82      4758
   macro avg       0.79      0.76      0.77      4758
weighted avg       0.81      0.82      0.81      4758



In [None]:
# with 100 epochs
# Same classifier settings as before, only the epoch count changes; the gold
# labels are still read from the "category" column.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setBatchSize(8)
    .setMaxEpochs(100)
    .setLr(0.001)
    .setEnableOutputLogs(True)
)

# Rebuild the pipeline so it picks up the new classifier stage.
use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

In [None]:
%%time

# Fit every stage of the pipeline on the training split (100-epoch configuration).
use_clf_pipelineModel = use_clf_pipeline.fit(trainingData)

CPU times: user 4.43 s, sys: 489 ms, total: 4.92 s
Wall time: 15min 17s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 16
-rw-r--r-- 1 root root 7292 Jul 22 10:53 ClassifierDLApproach_64bc0aa43c09.log
-rw-r--r-- 1 root root 3657 Jul 22 10:36 ClassifierDLApproach_df5632645d19.log
-rw-r--r-- 1 root root  823 Jul 22 10:25 ClassifierDLApproach_0d0188b956ca.log


In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_64bc0aa43c09.log

Epoch 90/100 - 8.82s - loss: 941.815 - acc: 0.90577006 - batches: 2345
Epoch 91/100 - 8.92s - loss: 941.5295 - acc: 0.90582335 - batches: 2345
Epoch 92/100 - 9.09s - loss: 941.2418 - acc: 0.9058767 - batches: 2345
Epoch 93/100 - 8.37s - loss: 940.94806 - acc: 0.9058767 - batches: 2345
Epoch 94/100 - 8.34s - loss: 940.6439 - acc: 0.9059834 - batches: 2345
Epoch 95/100 - 8.02s - loss: 940.35126 - acc: 0.9060367 - batches: 2345
Epoch 96/100 - 8.10s - loss: 940.0267 - acc: 0.90614337 - batches: 2345
Epoch 97/100 - 8.19s - loss: 939.7086 - acc: 0.90630335 - batches: 2345
Epoch 98/100 - 8.80s - loss: 939.417 - acc: 0.90641 - batches: 2345
Epoch 99/100 - 8.97s - loss: 939.1222 - acc: 0.9064633 - batches: 2345


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = use_clf_pipelineModel.transform(testData)

# Bring gold label, text and predicted label(s) into pandas.
preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is indexable; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))

              precision    recall  f1-score   support

         neg       0.85      0.91      0.88      3367
         pos       0.74      0.60      0.66      1391

    accuracy                           0.82      4758
   macro avg       0.79      0.76      0.77      4758
weighted avg       0.82      0.82      0.82      4758



In [None]:
# with 200 epochs
# Same classifier settings as before, only the epoch count changes; the gold
# labels are still read from the "category" column.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setBatchSize(8)
    .setMaxEpochs(200)
    .setLr(0.001)
    .setEnableOutputLogs(True)
)

# Rebuild the pipeline so it picks up the new classifier stage.
use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

In [None]:
%%time

# Fit every stage of the pipeline on the training split (200-epoch configuration).
use_clf_pipelineModel = use_clf_pipeline.fit(trainingData)

CPU times: user 6.72 s, sys: 758 ms, total: 7.47 s
Wall time: 23min 4s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 16
-rw-r--r-- 1 root root 14593 Jul 22 13:44 ClassifierDLApproach_c65df720c29a.log


In [None]:
!tail -10 ~/annotator_logs/ClassifierDLApproach_c65df720c29a.log

Epoch 190/200 - 6.62s - loss: 979.1136 - acc: 0.9072277 - batches: 2345
Epoch 191/200 - 6.46s - loss: 979.102 - acc: 0.90728104 - batches: 2345
Epoch 192/200 - 6.77s - loss: 979.0946 - acc: 0.9073344 - batches: 2345
Epoch 193/200 - 6.80s - loss: 979.06445 - acc: 0.907441 - batches: 2345
Epoch 194/200 - 6.72s - loss: 979.03687 - acc: 0.90749437 - batches: 2345
Epoch 195/200 - 6.73s - loss: 979.0125 - acc: 0.90749437 - batches: 2345
Epoch 196/200 - 6.62s - loss: 978.9937 - acc: 0.90749437 - batches: 2345
Epoch 197/200 - 6.76s - loss: 978.976 - acc: 0.90749437 - batches: 2345
Epoch 198/200 - 6.65s - loss: 978.96173 - acc: 0.90754765 - batches: 2345
Epoch 199/200 - 6.62s - loss: 978.9468 - acc: 0.90754765 - batches: 2345


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = use_clf_pipelineModel.transform(testData)

# Bring gold label, text and predicted label(s) into pandas.
preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is indexable; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))

              precision    recall  f1-score   support

         neg       0.85      0.91      0.88      3367
         pos       0.74      0.60      0.67      1391

    accuracy                           0.82      4758
   macro avg       0.80      0.76      0.77      4758
weighted avg       0.82      0.82      0.82      4758



## ClassifierDL with BioBert Embeddings  (The best metric on this dataset)

In [None]:
# Move the pandas frame into Spark, then make a seeded 80/20 split.
spark_df = spark.createDataFrame(ade_df)

trainingData, testData = spark_df.randomSplit([0.8, 0.2], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 18793
Test Dataset Count: 4723


In [None]:
# Column names of the Spark DataFrame, displayed as the cell's output.
spark_df.columns

['description', 'category']

In [None]:
from pyspark.sql.functions import col

# Rows per label for each split (training first, then test), largest class first.
for split_df in (trainingData, testData):
    (
        split_df.groupBy("category")
        .count()
        .orderBy(col("count").desc())
        .show()
    )

+--------+-----+
|category|count|
+--------+-----+
|     neg|13333|
|     pos| 5460|
+--------+-----+

+--------+-----+
|category|count|
+--------+-----+
|     neg| 3362|
|     pos| 1361|
+--------+-----+



In [None]:
# Wrap the raw "description" text as a document annotation named "sentence".
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level embeddings from the pretrained BioBERT (PubMed, cased) model.
bert_embeddings = (
    BertEmbeddings.pretrained("biobert_pubmed_base_cased", "en", "public/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    # .setMaxSentenceLength(512)  # optional setter, left disabled
)

# Pool the token embeddings with the AVERAGE strategy into "sentence_embeddings".
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained for 50 epochs; gold labels come from "category" and the
# training log is written under the current directory ('./').
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setBatchSize(8)
    .setMaxEpochs(50)
    .setLr(0.0003)
    .setOutputLogsPath('./')
    .setEnableOutputLogs(True)
)

bert_clf_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        bert_embeddings,
        embeddingsSentence,
        classifierdl,
    ]
)

biobert_pubmed_base_cased download started this may take some time.
Approximate size to download 386.4 MB
[OK!]


In [None]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)
# 50 epochs takes around 5 min

CPU times: user 1.75 s, sys: 183 ms, total: 1.93 s
Wall time: 5min 59s


In [None]:
!tail -n 5 /content/ClassifierDLApproach_3f720fb5d32a.log

Epoch 45/50 - 4.00s - loss: 924.207 - acc: 0.9266709 - batches: 2350
Epoch 46/50 - 4.07s - loss: 923.63983 - acc: 0.92688376 - batches: 2350
Epoch 47/50 - 4.07s - loss: 923.08655 - acc: 0.92720306 - batches: 2350
Epoch 48/50 - 4.05s - loss: 922.5467 - acc: 0.92752236 - batches: 2350
Epoch 49/50 - 3.99s - loss: 922.018 - acc: 0.927682 - batches: 2350


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = bert_clf_pipeline.transform(testData)

# Peek at the first 10 rows: gold label, text (cut at 80 chars), predicted label(s).
preds.select("category", "description", "class.result").show(10, truncate=80)

preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is an array; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

# Evaluate the predictions on the test dataset with sklearn.
print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))


+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     neg| "Syndrome malin"-like symptoms probably due to interaction between neurolept...| [neg]|
|     neg|                                       (It is no longer contained in Correctol.)| [neg]|
|     neg| 2-Chlordeoxyadenosine (2-CdA) is an antineoplastic/immunosuppressive agent u...| [neg]|
|     neg|         5: Movement disorders I: parkinsonism and the akinetic-rigid syndromes.| [neg]|
|     neg| A 13-year-old boy was diagnosed as acute lymphoblastic leukemia following ra...| [neg]|
|     neg| A 16-year-old girl with pulmonary stenosis who underwent reconstruction of t...| [neg]|
|     neg| A 27-year-old man presented with corneal ectasia in his left eye 4 years aft...| [neg]|
|     neg|

## ClassifierDL with BertSentenceEmbeddings

In [None]:
# Wrap the raw "description" text as a document annotation named "sentence".
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("sentence")
)

# Pretrained BERT sentence embeddings computed directly from "sentence".
bert_sent = (
    BertSentenceEmbeddings.pretrained("sent_small_bert_L2_768")
    .setInputCols(["sentence"])
    .setOutputCol("sentence_embeddings")
)

# Classifier trained for 5 epochs; gold labels come from "category".
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(5)
    .setBatchSize(8)
    .setLr(1e-4)
    .setEnableOutputLogs(True)
)

bert_clf_pipeline = Pipeline(stages=[document_assembler, bert_sent, classsifierdl])

sent_small_bert_L2_768 download started this may take some time.
Approximate size to download 139.6 MB
[OK!]


In [None]:
# Show the classifier's current dropout value; the configuration cell above
# does not call a dropout setter, so this is whatever the annotator starts with.
classsifierdl.getDropout()

0.5

In [None]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)


CPU times: user 284 ms, sys: 32.5 ms, total: 317 ms
Wall time: 52.7 s


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = bert_clf_pipeline.transform(testData)

# Peek at the first 10 rows: gold label, text (cut at 80 chars), predicted label(s).
preds.select("category", "description", "class.result").show(10, truncate=80)

preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is an array; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

# Evaluate the predictions on the test dataset with sklearn.
print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))


+--------+--------------------------------------------------------------------------------+------+
|category|                                                                     description|result|
+--------+--------------------------------------------------------------------------------+------+
|     neg| "Syndrome malin"-like symptoms probably due to interaction between neurolept...| [pos]|
|     neg|                                       (It is no longer contained in Correctol.)| [neg]|
|     neg| 2-Chlordeoxyadenosine (2-CdA) is an antineoplastic/immunosuppressive agent u...| [neg]|
|     neg|         5: Movement disorders I: parkinsonism and the akinetic-rigid syndromes.| [neg]|
|     neg| A 13-year-old boy was diagnosed as acute lymphoblastic leukemia following ra...| [neg]|
|     neg| A 16-year-old girl with pulmonary stenosis who underwent reconstruction of t...| [pos]|
|     neg| A 27-year-old man presented with corneal ectasia in his left eye 4 years aft...| [neg]|
|     neg|

In [None]:
# 15 epochs

# Same classifier settings as the 5-epoch run, only the epoch count changes.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(15)
    .setBatchSize(8)
    .setLr(1e-4)
    .setEnableOutputLogs(True)
)

# Rebuild the (unfitted) pipeline so it picks up the new classifier stage.
bert_clf_pipeline = Pipeline(stages=[document_assembler, bert_sent, classsifierdl])

In [None]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)


CPU times: user 458 ms, sys: 53.5 ms, total: 512 ms
Wall time: 1min 30s


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = bert_clf_pipeline.transform(testData)

# Bring gold label, text and predicted label(s) into pandas.
preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is indexable; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

# Evaluate the predictions on the test dataset with sklearn.
print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))


              precision    recall  f1-score   support

         neg       0.86      0.90      0.88      3362
         pos       0.73      0.65      0.69      1361

    accuracy                           0.83      4723
   macro avg       0.80      0.78      0.79      4723
weighted avg       0.83      0.83      0.83      4723



In [None]:
from sklearn.metrics import accuracy_score

# Overall accuracy of the predictions held in `preds_df`.
print(accuracy_score(y_true=preds_df["category"], y_pred=preds_df["result"]))


0.8310395934787211


### Upsampling the training set

In [None]:
# Stack three copies of the training split. Every row (both classes) appears
# three times, so the row count triples while the class ratio stays the same.
trainingData_aug = trainingData.union(trainingData).union(trainingData)

In [None]:
# Row counts of the replicated training set and the untouched test set.
print(f"Training size: {trainingData_aug.count()}")
print(f"Test size: {testData.count()}")

Training size: 56379
Test size: 4723


In [None]:
# 15 epochs

# Same 15-epoch classifier settings as the previous run.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(15)
    .setBatchSize(8)
    .setLr(1e-4)
    .setEnableOutputLogs(True)
)

# Build the pipeline and fit it on the replicated training set in one
# expression; the name ends up bound to the fitted model, as before.
bert_clf_pipeline = Pipeline(
    stages=[document_assembler, bert_sent, classsifierdl]
).fit(trainingData_aug)


In [None]:
from sklearn.metrics import classification_report

# Run the fitted pipeline over the held-out split.
preds = bert_clf_pipeline.transform(testData)

# Bring gold label, text and predicted label(s) into pandas.
preds_df = preds.select("category", "description", "class.result").toPandas()

# Each `result` value is indexable; its first element is used as the prediction.
preds_df["result"] = preds_df["result"].apply(lambda labels: labels[0])

# Evaluate the predictions on the test dataset with sklearn.
print(classification_report(y_true=preds_df["category"], y_pred=preds_df["result"]))


              precision    recall  f1-score   support

         neg       0.90      0.92      0.91      3362
         pos       0.78      0.74      0.76      1361

    accuracy                           0.86      4723
   macro avg       0.84      0.83      0.83      4723
weighted avg       0.86      0.86      0.86      4723

