# Annotating Data & Training Custom NER model for Suicide Risk Prediction

In this tutorial, we'll see how to annotate data easily using pre-defined vocabulary (keyword) matching, and how to upload the results as pre-annotations to Annotation Lab (NLP Lab).

We'll be using the AnnotationLab module to create, configure, export and import projects with minimal code.

Note: The Annotation Lab module is available in Spark NLP for Healthcare 4.2.1+

## 0. Initial Configurations

In [0]:
import pandas as pd
import os

import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.base import *
from sparknlp_jsl.annotator import *

from pyspark.ml import Pipeline, PipelineModel
from sparknlp.training import CoNLL

# Print the version reported by each library.
for label, library in (
    ('sparknlp.version : ', sparknlp),
    ('sparknlp_jsl.version : ', sparknlp_jsl),
):
    print(label, library.version())

# Bare expression kept as the final statement of the cell.
spark

For this exercise, we define two different layers of solutions:
1. Bronze: rudimentary Text Matcher-based results that will be uploaded to Annotation Lab and refined.
2. Silver: after the documents are properly annotated in Annotation Lab, train an NER model on them and get its results.

In [0]:

# DBFS directories for the two layers used in this notebook.
delta_bronze_path = '/FileStore/HLS/nlp/delta/bronze/'
delta_silver_path = '/FileStore/HLS/nlp/delta/silver/'

# For each layer: create the directory, then export its '/dbfs'-prefixed
# form as an environment variable (the %sh cell reads $delta_bronze_path).
for env_name, dbfs_dir in (
    ('delta_bronze_path', delta_bronze_path),
    ('delta_silver_path', delta_silver_path),
):
    dbutils.fs.mkdirs(dbfs_dir)
    os.environ[env_name] = f'/dbfs{dbfs_dir}'


Download Vocabulary files for Suicide Detection

In [0]:
%sh

# Download and unpack the vocabulary archive into the bronze directory.
# $delta_bronze_path is the environment variable exported by the Python cell above.
# Stop here if the directory is missing, so nothing is downloaded elsewhere.
cd "$delta_bronze_path" || exit 1

# -O pins the output file name, so re-running the cell overwrites the archive
# instead of saving numbered copies (.zip.1, .zip.2, ...).
wget -O suicide_pred_vocab_data.zip https://github.com/JohnSnowLabs/spark-nlp-workshop/raw/master/databricks/python/healthcare_case_studies/data/suicide_pred_vocab_data.zip

# -o overwrites previously extracted files without prompting, so re-runs do
# not stall on unzip's interactive "replace?" question.
unzip -o suicide_pred_vocab_data.zip

In [0]:
# Folder holding the vocabulary files (presumably produced by unzipping the
# archive downloaded in the previous cell).
data_vocab_path = delta_bronze_path + "suicide_pred_vocab_data/"

# List the folder contents.
dbutils.fs.ls(data_vocab_path)

### Running Spark NLP Pipeline

In [0]:
def _vocab_matcher(output_col, entity_label, vocab_file):
    """Build one TextMatcher over the sentence/token columns.

    output_col   -- name of the chunk column the matcher writes.
    entity_label -- entity value attached to every match.
    vocab_file   -- file name inside ``data_vocab_path`` passed to setEntities.
    """
    return (
        TextMatcher()
        .setInputCols("sentence", "token")
        .setOutputCol(output_col)
        .setEntityValue(entity_label)
        .setEntities(data_vocab_path + vocab_file)
        .setCaseSensitive(False)
        .setMergeOverlapping(True)
        .setBuildFromTokens(True)
    )


# text -> document -> sentence -> token
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

# One matcher per vocabulary file, each with its own output column and label.
text_matcher1 = _vocab_matcher("textmatch_chunk1", "SUICIDE_BEHAVIOR", "suicide_behavior_vocab.txt")
text_matcher2 = _vocab_matcher("textmatch_chunk2", "SUICIDE_PSYCHACHE", "suicide_psychache_vocab.txt")
text_matcher3 = _vocab_matcher("textmatch_chunk3", "SUICIDE_TRAUMA", "suicide_trauma_vocab.txt")
text_matcher4 = _vocab_matcher("textmatch_chunk4", "SUICIDE_OTHERS", "suicide_others_vocab.txt")

# Combine the four matcher outputs into the single "all_chunks" column.
chunk_merger = (
    ChunkMergeApproach()
    .setInputCols(['textmatch_chunk1', 'textmatch_chunk2', 'textmatch_chunk3', 'textmatch_chunk4'])
    .setOutputCol("all_chunks")
)

pipeline_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    text_matcher1,
    text_matcher2,
    text_matcher3,
    text_matcher4,
    chunk_merger,
]
pipeline = Pipeline(stages=pipeline_stages)

# Fit on a one-row DataFrame containing an empty string to obtain the model.
empty_df = spark.createDataFrame(pd.DataFrame({'text': ['']}))
p_model = pipeline.fit(empty_df)

l_model = LightPipeline(p_model)

In [0]:
# Read the CSV from the vocabulary folder and name its first column 'text',
# the input column the document assembler reads.
raw_csv = spark.read.csv(data_vocab_path + 'suicide_det_data.csv')
data = raw_csv.withColumnRenamed('_c0', 'text')

# Row count, then a three-row preview.
print(data.count())
data.show(3)


In [0]:
# Run the fitted pipeline over the data and collect every row to the driver.
annotated_df = p_model.transform(data)
results = annotated_df.collect()

### Results

In [0]:
from sparknlp_display import NerVisualizer

# Render the merged chunks of the first collected row as HTML and display it.
visualizer = NerVisualizer()
first_row_html = visualizer.display(results[0], 'all_chunks', return_html=True)
displayHTML(first_row_html)

## Upload Pre-annotations to Annotation Lab

In [0]:
# annotation lab module
from sparknlp_jsl.alab import AnnotationLab

alab = AnnotationLab()

# Set Credentials (fill in all four values before running this cell)
username = ''
password = ''
# Must be a plain string: a trailing comma after the literal would silently
# turn this into a one-element tuple.
client_secret = ''
annotationlab_url = ''

alab.set_credentials(

  # required: username
  username=username,

  # required: password
  password=password,

  # required: secret for your alab instance (every alab installation has a different secret)
  client_secret=client_secret,

  # required: http(s) url for your annotation lab
  annotationlab_url=annotationlab_url
)

Convert Predictions to Pre-annotations

In [0]:
# Turn the collected pipeline rows into pre-annotations plus a summary,
# using the 'document' column and the merged 'all_chunks' NER column.
pre_annotations, summary = alab.generate_preannotations(
    all_results=results,
    document_column='document',
    ner_columns=['all_chunks'],
)

In [0]:
# The summary shows which types of entities, assertions, and relations are
# present in the results - useful when setting the project configuration
# (its 'ner_labels' entry is what the configuration cell passes on).
summary

Create Project, Set Configuration & Upload

In [0]:
# Create a new Annotation Lab project named 'suicide_detection'; the
# configuration, upload and export cells all refer to it by this name.
alab.create_project('suicide_detection')

In [0]:
# Configure the project with the NER labels found in the pre-annotation summary.
ner_label_names = summary['ner_labels']
alab.set_project_config(
    project_name='suicide_detection',
    ner_labels=ner_label_names,
)

In [0]:
# Upload documents to Alab.
# Only the first 5 pre-annotations are sent while testing.
sample_preannotations = pre_annotations[:5]

alab.upload_preannotations(
    project_name='suicide_detection',
    preannotations=sample_preannotations,
)

# Annotate Documents on Annotation Lab and import

Import Annotations

In [0]:
# Export the project's annotations as JSON into the silver directory.
# save_dir uses the '/dbfs' mount form of the path, matching the
# '/dbfs...result.json' location the CoNLL conversion cell reads from.
# NOTE(review): assumes get_annotations writes with local file APIs - confirm.
alab.get_annotations(
  project_name = 'suicide_detection',
  output_name='result.json',
  save_dir=f'/dbfs{delta_silver_path}')


In [0]:
# List the silver directory on DBFS.
dbutils.fs.ls(delta_silver_path)

## Convert Export to CoNLL file

In [0]:
# Convert the exported JSON into a CoNLL file saved next to it.
# delta_silver_path already starts with '/', so it is appended directly to
# '/dbfs' to avoid a doubled slash ('/dbfs//FileStore/...').
silver_local_dir = f"/dbfs{delta_silver_path}"

alab.get_conll_data(
    spark,
    f"{silver_local_dir}result.json",
    output_name='conll_demo',
    save_dir=silver_local_dir,
)

In [0]:
# List the silver directory on DBFS again, after the CoNLL conversion.
dbutils.fs.ls(delta_silver_path)

## Train NER Model

Load Data

In [0]:
# Load the CoNLL file from the silver directory as a training DataFrame.
conll_path = f"{delta_silver_path}conll_demo.conll"
conll_reader = CoNLL()
conll_data = conll_reader.readDataset(spark, conll_path)

# Three-row preview.
conll_data.show(3)

In [0]:
# Pretrained 'embeddings_clinical' word embeddings, reading the sentence and
# token columns and writing the "embeddings" column used by the NER stages.
clinical_embeddings = (
    WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

In [0]:
# Folder the graph builder writes to ('/dbfs' mount form of the path).
graph_folder_path = "/dbfs/ner/medical_ner_graphs"

# Graph builder stage; the graph file name is left on "auto".
ner_graph_builder = (
    TFGraphBuilder()
    .setModelName("ner_dl")
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setGraphFolder(graph_folder_path)
    .setGraphFile("auto")
    .setHiddenUnitsNumber(20)
    .setIsMedical(True)  # False -> if you want to use TFGraphBuilder with NerDLApproach
)

In [0]:
# NER trainer. The validation split is set exactly once (0.2); a second,
# earlier setValidationSplit call would simply be overwritten by the later one.
nerTagger = (
    MedicalNerApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(20)
    .setBatchSize(64)
    .setRandomSeed(0)
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    .setOutputLogsPath('dbfs:/ner/ner_logs')
    .setUseBestModel(True)
    .setGraphFolder('dbfs:/ner/medical_ner_graphs')
    .setValidationSplit(0.2)
)
# .setEnableMemoryOptimizer(True) #>> if you have a limited memory and a large conll file, you can set this True to train batch by batch

# Training pipeline: embeddings -> graph builder -> NER trainer.
ner_pipeline = Pipeline(stages=[
    clinical_embeddings,
    ner_graph_builder,
    nerTagger,
])

In [0]:
# Fit the training pipeline on the CoNLL data loaded above.
model = ner_pipeline.fit(conll_data)

In [0]:
import glob

# The suffix after 'MedicalNerApproach_' in the log file name differs from run
# to run, so look the file up instead of hard-coding one run's name.
log_candidates = glob.glob('/dbfs/ner/ner_logs/MedicalNerApproach_*.log')
if not log_candidates:
    raise FileNotFoundError(
        'No MedicalNerApproach_*.log file found in /dbfs/ner/ner_logs'
    )

# Pick the most recently modified log, i.e. the latest training run.
latest_log_path = max(log_candidates, key=os.path.getmtime)

with open(latest_log_path, 'r') as f_:
    lines = f_.read()
print (lines)