![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.4.Resume_RelationExtractionApproach_Training.ipynb)

# 10.4 Resume RelationExtractionApproach Model Training

Steps:
- Train a new model for a few epochs.
- Load the same model and train for more epochs on the same taxonomy, and evaluate.
- Take a model already trained on a different dataset and continue training it on this dataset.

## Colab Setup

In [None]:
# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
# `-q` keeps pip's output quiet.
! pip install -q johnsnowlabs

In [None]:
from google.colab import files
# Ask the user to upload the license file; whatever `files.upload()` returns is kept in `license_keys`.
print('Please Upload your John Snow Labs License using the button below')
license_keys = files.upload()

In [None]:
from johnsnowlabs import nlp, medical, visual

# After uploading your license, run this to install all licensed Python wheels
# and pre-download the jars for the Spark session JVM.
nlp.install()

In [None]:
from johnsnowlabs import nlp, medical, visual
import pandas as pd
import json
import string
import numpy as np
import warnings
# Silence Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Automatically load the license data and start a session with all jars the user has access to

spark = nlp.start()

In [None]:
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark.sql as SQL
from pyspark import keyword_only

## Download Data for Training (i2b2 Clinical Relation Dataset)

In [None]:
# Fetch the relation-extraction CSV from the spark-nlp-workshop repository (`-q` = quiet).
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/i2b2_clinical_rel_dataset.csv

In [None]:
# Only these columns are needed for training; the trailing 'dataset' column is optional.
required_columns = [
    'sentence',
    'firstCharEnt1', 'firstCharEnt2',
    'lastCharEnt1', 'lastCharEnt2',
    "chunk1", "chunk2",
    "label1", "label2",
    'rel',
    'dataset',
]

# Read the CSV using its header row for column names.
raw_relations = (
    spark.read
    .option("header", "true")
    .format("csv")
    .load("i2b2_clinical_rel_dataset.csv")
)

# Keep just the columns listed above, in that order.
data = raw_relations.select(*required_columns)

data.show(10)

+--------------------+-------------+-------------+------------+------------+--------------------+--------------------+---------+---------+-----+-------+
|            sentence|firstCharEnt1|firstCharEnt2|lastCharEnt1|lastCharEnt2|              chunk1|              chunk2|   label1|   label2|  rel|dataset|
+--------------------+-------------+-------------+------------+------------+--------------------+--------------------+---------+---------+-----+-------+
|VITAL SIGNS - Tem...|           49|           75|          64|          84|    respiratory rate|          saturation|     test|     test|    O|   test|
|No lotions , crea...|            3|           34|           9|          42|             lotions|           incisions|treatment|  problem|TrNAP|   test|
|Because of expect...|           11|           58|          54|          68|expected long ter...|         a picc line|treatment|treatment|    O|  train|
|She states this l...|           16|           82|          31|          92|    li

In [None]:
# Row count for each value of the 'dataset' column.
data.groupby('dataset').count().show()

+-------+-----+
|dataset|count|
+-------+-----+
|  train|  350|
|   test|  650|
+-------+-----+



In [None]:
# Row count for each relation label in the 'rel' column.
data.groupby('rel').count().show()

+-----+-----+
|  rel|count|
+-----+-----+
| TrIP|   14|
| TrAP|  164|
| TeCP|   26|
|    O|  414|
|TrNAP|   14|
| TrCP|   28|
|  PIP|  153|
| TrWP|   11|
| TeRP|  176|
+-----+-----+



In [None]:
# Struct layout of one annotation: type, begin/end offsets, text, metadata map and embedding vector.
# Every field is declared non-nullable.
annotationType = (
    T.StructType()
    .add('annotatorType', T.StringType(), False)
    .add('begin', T.IntegerType(), False)
    .add('end', T.IntegerType(), False)
    .add('result', T.StringType(), False)
    .add('metadata', T.MapType(T.StringType(), T.StringType()), False)
    .add('embeddings', T.ArrayType(T.FloatType()), False)
)

# UDF that converts one training row into its two named-entity chunk annotations

@F.udf(T.ArrayType(annotationType))
def createTrainAnnotations(begin1, end1, begin2, end2, chunk1, chunk2, label1, label2):
    """Build the pair of "chunk" annotations describing a row's two entities.

    Each annotation carries the entity's begin/end offsets, its text, and a
    metadata map with the upper-cased label under 'entity' and the constant
    '0' under 'sentence'. Embeddings are left empty.

    Returns:
        A two-element list: the annotation for entity 1, then entity 2.
    """
    annotations = []
    entity_specs = (
        (begin1, end1, chunk1, label1),
        (begin2, end2, chunk2, label2),
    )
    for begin, end, chunk, label in entity_specs:
        metadata = {'entity': label.upper(), 'sentence': '0'}
        annotation = nlp.annotation.Annotation("chunk", begin, end, chunk, metadata, [])
        # Presumably required so the object exposes an attribute named exactly like
        # the schema field `annotatorType` - TODO confirm before removing.
        annotation.annotatorType = "chunk"
        annotations.append(annotation)

    return annotations

# Relation labels treated as valid
rels = ["TrIP", "TrAP", "TeCP", "TrNAP", "TrCP", "PIP", "TrWP", "TeRP"]

# Optional filter that would keep only rows with a valid relation (currently disabled):
#valid_rel_query = "(" + " OR ".join(["rel = '{}'".format(rel) for rel in rels]) + ")"
#.where(valid_rel_query)\

# (new integer column, source offset column) pairs, in the order they are added.
offset_casts = [
    ("begin1i", "firstCharEnt1"),
    ("end1i", "lastCharEnt1"),
    ("begin2i", "firstCharEnt2"),
    ("end2i", "lastCharEnt2"),
]

# Add an integer version of every offset column.
for int_col, source_col in offset_casts:
    data = data.withColumn(int_col, F.expr(f"cast({source_col} AS Int)"))

# Drop rows where any of the integer offsets is NULL.
for int_col, source_col in offset_casts:
    data = data.where(f"{int_col} IS NOT NULL")

# Build the chunk-annotation column and tag it with the 'chunk' annotator type metadata.
train_chunks = createTrainAnnotations(
    "begin1i", "end1i", "begin2i", "end2i", "chunk1", "chunk2", "label1", "label2"
).alias("train_ner_chunks", metadata={'annotatorType': "chunk"})

data = data.withColumn("train_ner_chunks", train_chunks)
    
@F.udf(T.StringType())
def encodeRelationDirection(rel, begin1, begin2):
    """Encode the direction of a relation from the start offsets of its two entities.

    Returns "both" for the "O" label; otherwise "leftwards" when entity 1
    starts after entity 2, and "rightwards" in every other case.
    """
    if rel == "O":
        return "both"
    return "leftwards" if begin1 > begin2 else "rightwards"

# Relation direction derived from the label and the two entity start offsets.
direction_col = encodeRelationDirection("rel", "begin1i", "begin2i")
data = data.withColumn("rel_dir", direction_col)

# Partition the rows according to the value of the 'dataset' column.
train_data = data.filter("dataset='train'")
test_data = data.filter("dataset='test'")

## Split the test data into two parts:
- We keep the first part separate and use it to train the model further, as it is completely unseen data from the same taxonomy.

- The second part will be used for testing and evaluation.

In [None]:
# Seeded 50/50 split: the first half is used for further training, the second for evaluation.
test_data_1, test_data_2 = test_data.randomSplit(weights=[0.5, 0.5], seed=100)

## Train a new model, pause, and resume training on the same dataset.

### Create graph 

We will use the `TFGraphBuilder` annotator, which can create graphs automatically in the model training pipeline. 

`TFGraphBuilder` inspects the data and creates the proper graph if a suitable version of TensorFlow is available. The graph is stored in the defined folder and loaded by the approach.

You can also create a custom graph by using `tf_graph` module in Spark NLP for Healthcare.

In [None]:
# TensorFlow is pinned to 2.7.0.
# NOTE(review): tensorflow-addons is not pinned - consider pinning a version compatible with TF 2.7.
!pip install -q tensorflow==2.7.0 tensorflow-addons

In [None]:
#from sparknlp_jsl.annotator import TFGraphBuilder

# Folder that receives the generated graph file.
graph_folder = "./tf_graphs"

# Graph builder stage: two hidden layers (300, 200) with relu activation,
# L2 on the hidden activations only, and no batch normalisation.
re_graph_builder = (
    medical.TFGraphBuilder()
    .setModelName("relation_extraction")
    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"])
    .setLabelColumn("rel")
    .setGraphFolder(graph_folder)
    .setGraphFile("re_graph.pb")
    .setHiddenLayers([300, 200])
    .setHiddenAct("relu")
    .setHiddenActL2(True)
    .setHiddenWeightsL2(False)
    .setBatchNorm(False)
)

### Train for 30 epochs

In [None]:
# Turn the raw 'sentence' column into document annotations.
documenter = (
    nlp.DocumentAssembler()
    .setInputCol("sentence")
    .setOutputCol("sentences")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# Pretrained clinical word embeddings.
words_embedder = (
    nlp.WordEmbeddingsModel()
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("embeddings")
)

# Pretrained clinical part-of-speech tagger.
pos_tagger = (
    nlp.PerceptronModel()
    .pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("pos_tags")
)

dependency_parser = (
    nlp.DependencyParserModel()
    .pretrained("dependency_conllu", "en")
    .setInputCols(["sentences", "pos_tags", "tokens"])
    .setOutputCol("dependencies")
)

# Trainable relation-extraction stage; it loads the graph file named in `re_graph_builder`.
reApproach = (
    medical.RelationExtractionApproach()
    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"])
    .setOutputCol("relations")
    .setLabelColumn("rel")
    .setEpochsNumber(30)
    .setBatchSize(200)
    .setDropout(0.5)
    .setLearningRate(0.001)
    .setModelFile(f"{graph_folder}/re_graph.pb")
    .setFixImbalance(True)
    .setFromEntity("begin1i", "end1i", "label1")
    .setToEntity("begin2i", "end2i", "label2")
    .setOutputLogsPath('/content')
    .setRelationDirectionCol("rel_dir")
)

# Flatten the 'relations' annotations into a plain string column.
finisher = (
    nlp.Finisher()
    .setInputCols(["relations"])
    .setOutputCols(["relations_out"])
    .setCleanAnnotations(False)
    .setValueSplitSymbol(",")
    .setAnnotationSplitSymbol(",")
    .setOutputAsArray(False)
)

# Stage order matters: the graph builder runs right before the approach that loads its graph.
train_pipeline = nlp.Pipeline(
    stages=[
        documenter,
        tokenizer,
        words_embedder,
        pos_tagger,
        dependency_parser,
        re_graph_builder,
        reApproach,
        finisher,
    ]
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.5 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]


In [None]:
%%time 
# Fit all stages of the training pipeline on the 'train' split.
rel_model = train_pipeline.fit(train_data)

TF Graph Builder configuration:
Model name: relation_extraction
Graph folder: ./tf_graphs
Graph file name: re_graph.pb
Build params: {'input_dim': 1149, 'output_dim': 27, 'hidden_layers': [300, 200], 'hidden_act': 'relu', 'hidden_act_l2': True, 'hidden_weights_l2': False, 'batch_norm': False}
relation_extraction graph exported to ./tf_graphs/re_graph.pb
CPU times: user 3.14 s, sys: 625 ms, total: 3.76 s
Wall time: 15.8 s


In [None]:
# Apply the fitted pipeline to the evaluation half of the test split.
result = rel_model.transform(test_data_2)

### Evaluate

In [None]:
# Per-row 0/1 flags: does the first predicted relation match the gold label (acc1),
# and does the predicted first chunk match the gold chunk1, ignoring case (acc2)?
row_scores = result.selectExpr(
    "rel",
    "INT(rel == relations.result[0]) AS acc1",
    "INT(lower(chunk1) == lower(relations.metadata[0].chunk1)) AS acc2",
)

# Average both flags per gold relation and count the rows behind each average.
per_relation = row_scores.groupBy("rel").agg(
    F.avg("acc1").alias("mACC1"),
    F.avg("acc2").alias("mACC2"),
    F.count("rel").alias("support"),
)

# Round for display; ACC is the product of the two averaged accuracies.
results_without_dir = per_relation.selectExpr(
    "rel",
    "round(mACC1, 2) AS Rel_ACC",
    "round(mACC2, 2) AS Arg_ACC",
    "round(mACC1 * mACC2, 2) AS ACC",
    "support",
)

results_without_dir.show()

### Save to disk

In [None]:
# stages[-2] is the fitted relation-extraction stage: only the finisher follows it in the pipeline.
# Any existing 'RE_model_30e' folder is overwritten.
rel_model.stages[-2].write().overwrite().save('RE_model_30e')

### Train using the saved model on an unseen dataset

We use unseen data from the same taxonomy

In [None]:
# Same epochs, batch size, dropout and learning rate as the first run,
# but training starts from the model saved in "RE_model_30e".
reApproach_finetune = (
    medical.RelationExtractionApproach()
    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"])
    .setOutputCol("relations")
    .setLabelColumn("rel")
    .setEpochsNumber(30)
    .setBatchSize(200)
    .setDropout(0.5)
    .setLearningRate(0.001)
    .setFixImbalance(True)
    .setFromEntity("begin1i", "end1i", "label1")
    .setToEntity("begin2i", "end2i", "label2")
    .setRelationDirectionCol("rel_dir")
    .setPretrainedModelPath("RE_model_30e")
    # NOTE(review): the setter name below is copied verbatim; its leading letter may be a
    # non-ASCII look-alike of "O" - do not retype it, confirm against the installed API.
    .setОverrideExistingLabels(False)
)

# Reuses the feature stages, graph builder and finisher defined for the first training run.
finetune_pipeline = nlp.Pipeline(
    stages=[
        documenter,
        tokenizer,
        words_embedder,
        pos_tagger,
        dependency_parser,
        re_graph_builder,
        reApproach_finetune,
        finisher,
    ]
)

In [None]:
%%time 
# Continue training on the first half of the test split, which the saved model has not seen.
rel_model = finetune_pipeline.fit(test_data_1)

TF Graph Builder configuration:
Model name: relation_extraction
Graph folder: ./tf_graphs
Graph file name: re_graph.pb
Build params: {'input_dim': 1149, 'output_dim': 27, 'hidden_layers': [300, 200], 'hidden_act': 'relu', 'hidden_act_l2': True, 'hidden_weights_l2': False, 'batch_norm': False}
relation_extraction graph exported to ./tf_graphs/re_graph.pb
CPU times: user 901 ms, sys: 56.4 ms, total: 957 ms
Wall time: 8.78 s


In [None]:
# Apply the fine-tuned pipeline to the evaluation half of the test split.
result = rel_model.transform(test_data_2)

### Evaluate

In [None]:
# Per-row 0/1 flags: does the first predicted relation match the gold label (acc1),
# and does the predicted first chunk match the gold chunk1, ignoring case (acc2)?
row_scores = result.selectExpr(
    "rel",
    "INT(rel == relations.result[0]) AS acc1",
    "INT(lower(chunk1) == lower(relations.metadata[0].chunk1)) AS acc2",
)

# Average both flags per gold relation and count the rows behind each average.
per_relation = row_scores.groupBy("rel").agg(
    F.avg("acc1").alias("mACC1"),
    F.avg("acc2").alias("mACC2"),
    F.count("rel").alias("support"),
)

# Round for display; ACC is the product of the two averaged accuracies.
results_without_dir = per_relation.selectExpr(
    "rel",
    "round(mACC1, 2) AS Rel_ACC",
    "round(mACC2, 2) AS Arg_ACC",
    "round(mACC1 * mACC2, 2) AS ACC",
    "support",
)

results_without_dir.show()

### Save to disk

In [None]:
# stages[-2] is the fitted relation-extraction stage: only the finisher follows it in the pipeline.
# Any existing 'RE_model_finetuned' folder is overwritten.
rel_model.stages[-2].write().overwrite().save('RE_model_finetuned')

## Now let's take a model trained on a different dataset and train on this dataset

In [None]:
# Download the pretrained "re_clinical" relation-extraction model.
clinical_re_Model = (
    medical.RelationExtractionModel()
    .pretrained("re_clinical", "en", 'clinical/models')
    .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])
    .setOutputCol("relations")
)

# Show the relation labels this pretrained model was built with.
clinical_re_Model.getClasses()

re_clinical download started this may take some time.
Approximate size to download 6 MB
[OK!]


['TrWP', 'TrNAP', 'TrCP', 'PIP', 'TeCP', 'TeRP', 'TrIP', 'TrAP', 'O']

### Now train a model using this model as base

In [None]:
import glob
import os

# Version-stamped cache folder this notebook was originally written against.
re_clinical_default_path = "/root/cache_pretrained/re_clinical_en_2.5.5_2.4_1596928426753"

# Any "re_clinical" build present in the current user's pretrained-model cache.
# Assumes the default cache location (~/cache_pretrained) - adjust if it was reconfigured.
re_clinical_cached = [
    path
    for path in glob.glob(
        os.path.join(os.path.expanduser("~"), "cache_pretrained", "re_clinical_en_*")
    )
    if os.path.isdir(path)
]

# Prefer the original folder when it exists; otherwise take the most recently written
# cached build. With nothing cached, fall back to the original path so the failure
# (and its error message) is the same as before.
if os.path.isdir(re_clinical_default_path) or not re_clinical_cached:
    re_clinical_path = re_clinical_default_path
else:
    re_clinical_path = max(re_clinical_cached, key=os.path.getmtime)

# Fine-tune starting from the pretrained "re_clinical" weights instead of a locally saved model.
reApproach_finetune = medical.RelationExtractionApproach()\
    .setInputCols(["embeddings", "pos_tags", "train_ner_chunks", "dependencies"])\
    .setOutputCol("relations")\
    .setLabelColumn("rel")\
    .setEpochsNumber(30)\
    .setBatchSize(200)\
    .setDropout(0.5)\
    .setLearningRate(0.001)\
    .setFixImbalance(True)\
    .setFromEntity("begin1i", "end1i", "label1")\
    .setToEntity("begin2i", "end2i", "label2")\
    .setPretrainedModelPath(re_clinical_path)\
    .setОverrideExistingLabels(False)
# NOTE(review): the last setter's name is copied verbatim; its leading letter may be a
# non-ASCII look-alike of "O" - do not retype it, confirm against the installed API.

# Reuses the feature stages, graph builder and finisher defined for the first training run.
finetune_pipeline = nlp.Pipeline(stages=[
    documenter, 
    tokenizer, 
    words_embedder, 
    pos_tagger, 
    dependency_parser, 
    re_graph_builder,
    reApproach_finetune, 
    finisher
])

In [None]:
%%time
# Fine-tune the pretrained model on the 'train' split.
rel_model = finetune_pipeline.fit(train_data)

TF Graph Builder configuration:
Model name: relation_extraction
Graph folder: ./tf_graphs
Graph file name: re_graph.pb
Build params: {'input_dim': 1149, 'output_dim': 27, 'hidden_layers': [300, 200], 'hidden_act': 'relu', 'hidden_act_l2': True, 'hidden_weights_l2': False, 'batch_norm': False}
relation_extraction graph exported to ./tf_graphs/re_graph.pb
CPU times: user 751 ms, sys: 53.8 ms, total: 804 ms
Wall time: 6.82 s


In [None]:
# Apply the fine-tuned pipeline to the evaluation half of the test split.
result = rel_model.transform(test_data_2)

### Evaluate

In [None]:
# Per-row 0/1 flags: does the first predicted relation match the gold label (acc1),
# and does the predicted first chunk match the gold chunk1, ignoring case (acc2)?
row_scores = result.selectExpr(
    "rel",
    "INT(rel == relations.result[0]) AS acc1",
    "INT(lower(chunk1) == lower(relations.metadata[0].chunk1)) AS acc2",
)

# Average both flags per gold relation and count the rows behind each average.
per_relation = row_scores.groupBy("rel").agg(
    F.avg("acc1").alias("mACC1"),
    F.avg("acc2").alias("mACC2"),
    F.count("rel").alias("support"),
)

# Round for display; ACC is the product of the two averaged accuracies.
results_without_dir = per_relation.selectExpr(
    "rel",
    "round(mACC1, 2) AS Rel_ACC",
    "round(mACC2, 2) AS Arg_ACC",
    "round(mACC1 * mACC2, 2) AS ACC",
    "support",
)

results_without_dir.show()

### Save to disk

In [None]:
# stages[-2] is the fitted relation-extraction stage: only the finisher follows it in the pipeline.
# Any existing 'RE_pretrained_model_finetuned' folder is overwritten.
rel_model.stages[-2].write().overwrite().save('RE_pretrained_model_finetuned')