
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Finance Assertion


## Setup

In [2]:
%pip install -q tensorflow==2.7.0
%pip install -q tensorflow-addons

In [4]:
from johnsnowlabs import *

import json
import os

print("Spark NLP Version :", sparknlp.version())

spark = start_spark()

Spark NLP Version : 4.2.1
📋 Loading license number 0 from /home/ubuntu/.johnsnowlabs/licenses/license_number_0_for_.json


22/10/20 11:47:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


👌 Launched [92mcpu-Optimized JVM[39m SparkSession with Jars for: 🚀Spark-NLP==4.2.1, 💊Spark-Healthcare==4.2.0, 🕶Spark-OCR==4.1.0, running on ⚡ PySpark==3.1.2


## Data Prep 

In [6]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/assertion_fin.csv

In [7]:
import pandas as pd

training_df = pd.read_csv('assertion_fin.csv')

In [8]:
training_data = spark.createDataFrame(training_df)
training_data.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+--------------------+---------+-------+--------------------+------+---------------+
|task_id|            sentence|tkn_start|tkn_end|               chunk|entity|assertion_label|
+-------+--------------------+---------+-------+--------------------+------+---------------+
|      1|The Swedish East ...|        1|      4|Swedish East Indi...|   ORG|           PAST|
|      1|The Swedish East ...|        6|      8|Svenska Ostindisk...| ALIAS|           PAST|
|      1|The Swedish East ...|       10|     10|                SOIC| ALIAS|           PAST|
|      1|The Swedish East ...|       14|     14|          Gothenburg|   LOC|           PAST|
|      1|The Swedish East ...|       15|     15|              Sweden|   LOC|           PAST|
|      1|The Swedish East ...|       17|     17|                1731|  DATE|           PAST|
|      1|The Swedish East ...|       25|     25|               China|   LOC|           PAST|
|      1|The Swedish East ...|       28|     29|            Far East| 

                                                                                

In [9]:
training_data.printSchema()

root
 |-- task_id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- tkn_start: long (nullable = true)
 |-- tkn_end: long (nullable = true)
 |-- chunk: string (nullable = true)
 |-- entity: string (nullable = true)
 |-- assertion_label: string (nullable = true)



In [10]:
%time training_data.count()

CPU times: user 2.37 ms, sys: 854 µs, total: 3.22 ms
Wall time: 471 ms


8050

In [11]:
(train_data, test_data) = training_data.randomSplit([0.9, 0.1], seed = 100)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 8050
Test Dataset Count: 797


In [12]:
train_data.show()

+-------+--------------------+---------+-------+--------------------+------+---------------+
|task_id|            sentence|tkn_start|tkn_end|               chunk|entity|assertion_label|
+-------+--------------------+---------+-------+--------------------+------+---------------+
|      1|"Stockholms-varve...|        6|      6|           Stockholm|   LOC|           PAST|
|      1|"The funny busine...|        5|      8|Swedish East Indi...|   ORG|           PAST|
|      1|             (1998).|        0|      0|                1998|  DATE|           PAST|
|      1|2.5 tonnes) and t...|       34|     34|              Sweden|   LOC|           PAST|
|      1|37. Gothenburg: R...|        2|      7|Royal Society of ...|   ORG|           PAST|
|      1|= Decline and fal...|       11|     11|                1806|  DATE|           PAST|
|      1|= Early attempts ...|        9|     11|  Swedish East India|   ORG|           PAST|
|      1|= Early attempts ...|       19|     19|            merchant| 

## Using Bert Embeddings

In [13]:
bert_embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en") \
  .setInputCols("document", "token") \
  .setOutputCol("embeddings")\
  .setMaxSentenceLength(512)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[ | ]bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
Download done! Loading the resource.
[ — ]

2022-10-20 11:48:30.249640: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [14]:
document = nlp.DocumentAssembler()\
    .setInputCol("sentence")\
    .setOutputCol("document")

chunk = nlp.Doc2Chunk()\
    .setInputCols("document")\
    .setOutputCol("doc_chunk")\
    .setChunkCol("chunk")\
    .setStartCol("tkn_start")\
    .setStartColByTokenIndex(True)\
    .setFailOnMissing(False)\
    .setLowerCase(False)

token = nlp.Tokenizer()\
    .setInputCols(['document'])\
    .setOutputCol('token')


We save the test data in parquet format to use in `AssertionDLApproach()`. 

In [15]:
assertion_pipeline = Pipeline(
    stages = [
    document,
    chunk,
    token,
    bert_embeddings])

assertion_test_data = assertion_pipeline.fit(test_data).transform(test_data)

In [16]:
assertion_test_data.columns

['task_id',
 'sentence',
 'tkn_start',
 'tkn_end',
 'chunk',
 'entity',
 'assertion_label',
 'document',
 'doc_chunk',
 'token',
 'embeddings']

In [17]:
assertion_test_data.write.mode('overwrite').parquet('test_data.parquet')

                                                                                

In [18]:
assertion_train_data = assertion_pipeline.fit(training_data).transform(training_data)

assertion_train_data.write.mode('overwrite').parquet('train_data.parquet')

                                                                                

In [19]:
assertion_train_data.columns

['task_id',
 'sentence',
 'tkn_start',
 'tkn_end',
 'chunk',
 'entity',
 'assertion_label',
 'document',
 'doc_chunk',
 'token',
 'embeddings']

## Graph setup

We will use TFGraphBuilder annotator which can be used to create graphs in the model training pipeline. 

TFGraphBuilder inspects the data and creates the proper graph if a suitable version of TensorFlow (<= 2.7 ) is available. The graph is stored in the defined folder and loaded by the approach.

In [20]:
graph_folder= "./tf_graphs"

In [21]:
assertion_graph_builder =  finance.TFGraphBuilder()\
    .setModelName("assertion_dl")\
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("assertion_label")\
    .setGraphFolder(graph_folder)\
    .setGraphFile("assertion_graph.pb")\
    .setMaxSequenceLength(1200)\
    .setHiddenUnitsNumber(25)

**Setting the Scope Window (Target Area) Dynamically in Assertion Status Detection Models**


This parameter allows you to train the Assertion Status Models to focus on specific context windows when resolving the status of a NER chunk. The window is in format `[X,Y]` being `X` the number of tokens to consider on the left of the chunk, and `Y` the max number of tokens to consider on the right. Let’s take a look at what different windows mean:


*   By default, the window is `[-1,-1]` which means that the Assertion Status will look at all of the tokens in the sentence/document (up to a maximum of tokens set in `setMaxSentLen()` ).
*   `[0,0]` means “don’t pay attention to any token except the ner_chunk”, what basically is not considering any context for the Assertion resolution.
*   `[9,15]` is what empirically seems to be the best baseline, meaning that we look up to 9 tokens on the left and 15 on the right of the ner chunk to understand the context and resolve the status.


Check this [Scope Window Tuning Assertion Status Detection notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/2.1.Scope_window_tuning_assertion_status_detection.ipynb)  that illustrates the effect of the different windows and how to properly fine-tune your AssertionDLModels to get the best of them.

In our case, the best Scope Window is around [10,10]

In [22]:
scope_window = [50, 50]

assertionStatus = finance.AssertionDLApproach()\
    .setLabelCol("assertion_label")\
    .setInputCols("document", "doc_chunk", "embeddings")\
    .setOutputCol("assertion")\
    .setBatchSize(128)\
    .setLearningRate(0.001)\
    .setEpochs(2)\
    .setStartCol("tkn_start")\
    .setEndCol("tkn_end")\
    .setMaxSentLen(1200)\
    .setEnableOutputLogs(True)\
    .setOutputLogsPath('training_logs/')\
    .setGraphFolder(graph_folder)\
    .setGraphFile(f"{graph_folder}/assertion_graph.pb")\
    .setTestDataset(path="test_data.parquet", read_as='SPARK', options={'format': 'parquet'})\
    .setScopeWindow(scope_window)
    #.setValidationSplit(0.2)\    
    #.setDropout(0.1)\    

In [23]:
clinical_assertion_pipeline = Pipeline(
    stages = [
    #document,
    #chunk,
    #token,
    #embeddings,
    assertion_graph_builder,
    assertionStatus])

In [24]:
training_data.printSchema()

root
 |-- task_id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- tkn_start: long (nullable = true)
 |-- tkn_end: long (nullable = true)
 |-- chunk: string (nullable = true)
 |-- entity: string (nullable = true)
 |-- assertion_label: string (nullable = true)



In [25]:
assertion_train_data = spark.read.parquet('train_data.parquet')

In [26]:
%%time
assertion_model = clinical_assertion_pipeline.fit(assertion_train_data)

TF Graph Builder configuration:
Model name: assertion_dl
Graph folder: ./tf_graphs
Graph file name: assertion_graph.pb
Build params: {'n_classes': 4, 'feat_size': 768, 'max_seq_len': 1200, 'n_hidden': 25}


2022-10-20 11:52:18.991689: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/ubuntu/.jupyter_env/lib/python3.8/site-packages/cv2/../../lib64:
2022-10-20 11:52:18.991733: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
non-resource variables are not supported in the long term


Device mapping: no known devices.


2022-10-20 11:52:27.217409: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-20 11:52:27.249660: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-10-20 11:52:27.249696: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jsl-test): /proc/driver/nvidia/version does not exist
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype a

Device mapping: no known devices.
assertion_dl graph exported to ./tf_graphs/assertion_graph.pb


                                                                                

Quality on test dataset: 
time to finish evaluation: 43.08s
Total test loss: 2.1367	Avg test loss: 0.3052
label	 tp	 fp	 fn	 prec	 rec	 f1
PRESENT	 179	 17	 38	 0.9132653	 0.8248848	 0.866828
POSSIBLE	 154	 10	 23	 0.9390244	 0.8700565	 0.90322584
FUTURE	 109	 19	 14	 0.8515625	 0.88617885	 0.8685259
PAST	 266	 43	 14	 0.86084145	 0.95	 0.90322584
tp: 708 fp: 89 fn: 89 labels: 4
Macro-average	 prec: 0.8911734, rec: 0.88278, f1: 0.8869569
Micro-average	 prec: 0.88833123, rec: 0.88833123, f1: 0.88833123
Quality on test dataset: 
time to finish evaluation: 42.70s
Total test loss: 1.5315	Avg test loss: 0.2188
label	 tp	 fp	 fn	 prec	 rec	 f1
PRESENT	 200	 25	 17	 0.8888889	 0.921659	 0.9049774
POSSIBLE	 156	 0	 21	 1.0	 0.88135594	 0.9369369
FUTURE	 118	 12	 5	 0.9076923	 0.9593496	 0.9328064
PAST	 261	 25	 19	 0.9125874	 0.93214285	 0.9222615
tp: 735 fp: 62 fn: 62 labels: 4
Macro-average	 prec: 0.92729217, rec: 0.9236269, f1: 0.925456
Micro-average	 prec: 0.9222083, rec: 0.9222083, f1: 0.

Checking the results saved in the log file

In [27]:
import os

log_files = os.listdir("training_logs")
log_files

['AssertionDLApproach_deacaf05b4cf.log']

In [28]:
with open("training_logs/"+log_files[0]) as log_file:
    print(log_file.read())

Name of the selected graph: ./tf_graphs/assertion_graph.pb
Training started, trainExamples: 8050


Epoch: 0 started, learning rate: 0.001, dataset size: 8050
Done, 488.44619858 total training loss: 51.923687, avg training loss: 0.8241855, batches: 63
Quality on test dataset: 
time to finish evaluation: 43.08s
Total test loss: 2.1367	Avg test loss: 0.3052
label	 tp	 fp	 fn	 prec	 rec	 f1
PRESENT	 179	 17	 38	 0.9132653	 0.8248848	 0.866828
POSSIBLE	 154	 10	 23	 0.9390244	 0.8700565	 0.90322584
FUTURE	 109	 19	 14	 0.8515625	 0.88617885	 0.8685259
PAST	 266	 43	 14	 0.86084145	 0.95	 0.90322584
tp: 708 fp: 89 fn: 89 labels: 4
Macro-average	 prec: 0.8911734, rec: 0.88278, f1: 0.8869569
Micro-average	 prec: 0.88833123, rec: 0.88833123, f1: 0.88833123


Epoch: 1 started, learning rate: 9.5E-4, dataset size: 8050
Done, 485.647417029 total training loss: 18.909857, avg training loss: 0.30015644, batches: 63
Quality on test dataset: 
time to finish evaluation: 42.70s
Total test loss: 1.5315	A

In [29]:
assertion_test_data = spark.read.parquet('test_data.parquet')

In [30]:
preds = assertion_model.transform(assertion_test_data).select('assertion_label','assertion.result')

preds.show()

[Stage 20:>                                                         (0 + 1) / 1]

+---------------+---------+
|assertion_label|   result|
+---------------+---------+
|           PAST|[PRESENT]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|[PRESENT]|
|        PRESENT|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|[PRESENT]|
|        PRESENT|[PRESENT]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|[PRESENT]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|   [PAST]|
+---------------+---------+
only showing top 20 rows



                                                                                

In [31]:
preds_df = preds.toPandas()

                                                                                

In [32]:
preds_df["result"] = preds_df["result"].apply(lambda x: x[0] if len(x) else pd.NA)
preds_df.dropna(inplace=True)

preds_df

Unnamed: 0,assertion_label,result
0,PAST,PRESENT
1,PAST,PAST
2,PAST,PAST
3,PAST,PAST
4,PAST,PAST
...,...,...
792,POSSIBLE,PRESENT
793,POSSIBLE,FUTURE
794,POSSIBLE,POSSIBLE
795,POSSIBLE,POSSIBLE


In [33]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report( preds_df['assertion_label'], preds_df['result']))

              precision    recall  f1-score   support

      FUTURE       0.91      0.96      0.93       123
        PAST       0.91      0.93      0.92       280
    POSSIBLE       1.00      0.88      0.94       177
     PRESENT       0.89      0.92      0.90       217

    accuracy                           0.92       797
   macro avg       0.93      0.92      0.92       797
weighted avg       0.92      0.92      0.92       797



## Saving the trained model

In [34]:
assertion_model.stages

[TFGraphBuilderModel_6bab2a80f9b1, FINANCE-ASSERTION_DL_1b89e8669aaf]

In [35]:
# Save a Spark NLP model
assertion_model.stages[-1].write().overwrite().save('Assertion')

import shutil
shutil.make_archive('Assertion', 'zip', 'Assertion')

'/home/ubuntu/notebooks/examples/finance/Assertion.zip'