
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Finance Assertion


## Setup

In [None]:
%pip install -q tensorflow==2.7.0
%pip install -q tensorflow-addons

In [None]:
from johnsnowlabs import *

import pandas as pd
import json
import os

spark = start_spark()

## Data Prep 

In [2]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings_JSL/Finance/data/assertion_fin.csv

In [3]:
training_df = pd.read_csv('assertion_fin.csv')

In [4]:
training_data = spark.createDataFrame(training_df)
training_data.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+--------------------+---------+-------+--------------------+------+---------------+
|task_id|            sentence|tkn_start|tkn_end|               chunk|entity|assertion_label|
+-------+--------------------+---------+-------+--------------------+------+---------------+
|      1|The Swedish East ...|        1|      4|Swedish East Indi...|   ORG|           PAST|
|      1|The Swedish East ...|        6|      8|Svenska Ostindisk...| ALIAS|           PAST|
|      1|The Swedish East ...|       10|     10|                SOIC| ALIAS|           PAST|
|      1|The Swedish East ...|       14|     14|          Gothenburg|   LOC|           PAST|
|      1|The Swedish East ...|       15|     15|              Sweden|   LOC|           PAST|
|      1|The Swedish East ...|       17|     17|                1731|  DATE|           PAST|
|      1|The Swedish East ...|       25|     25|               China|   LOC|           PAST|
|      1|The Swedish East ...|       28|     29|            Far East| 

                                                                                

In [6]:
training_data.printSchema()

root
 |-- task_id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- tkn_start: long (nullable = true)
 |-- tkn_end: long (nullable = true)
 |-- chunk: string (nullable = true)
 |-- entity: string (nullable = true)
 |-- assertion_label: string (nullable = true)



In [7]:
%time training_data.count()

CPU times: user 1.38 ms, sys: 483 µs, total: 1.87 ms
Wall time: 449 ms


8050

In [8]:
(train_data, test_data) = training_data.randomSplit([0.9, 0.1], seed = 100)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 8050
Test Dataset Count: 797


In [9]:
train_data.show()

+-------+--------------------+---------+-------+--------------------+------+---------------+
|task_id|            sentence|tkn_start|tkn_end|               chunk|entity|assertion_label|
+-------+--------------------+---------+-------+--------------------+------+---------------+
|      1|"Stockholms-varve...|        6|      6|           Stockholm|   LOC|           PAST|
|      1|"The funny busine...|        5|      8|Swedish East Indi...|   ORG|           PAST|
|      1|             (1998).|        0|      0|                1998|  DATE|           PAST|
|      1|2.5 tonnes) and t...|       34|     34|              Sweden|   LOC|           PAST|
|      1|37. Gothenburg: R...|        2|      7|Royal Society of ...|   ORG|           PAST|
|      1|= Decline and fal...|       11|     11|                1806|  DATE|           PAST|
|      1|= Early attempts ...|        9|     11|  Swedish East India|   ORG|           PAST|
|      1|= Early attempts ...|       19|     19|            merchant| 

## Using Bert Embeddings

In [10]:
bert_embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en") \
  .setInputCols("document", "token") \
  .setOutputCol("embeddings")\
  .setMaxSentenceLength(512)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[ | ]bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
Download done! Loading the resource.
[ \ ]

2022-12-12 17:06:12.257565: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [11]:
document = nlp.DocumentAssembler()\
    .setInputCol("sentence")\
    .setOutputCol("document")

chunk = nlp.Doc2Chunk()\
    .setInputCols("document")\
    .setOutputCol("doc_chunk")\
    .setChunkCol("chunk")\
    .setStartCol("tkn_start")\
    .setStartColByTokenIndex(True)\
    .setFailOnMissing(False)\
    .setLowerCase(False)

token = nlp.Tokenizer()\
    .setInputCols(['document'])\
    .setOutputCol('token')


We save the test data in parquet format to use in `AssertionDLApproach()`. 

In [12]:
assertion_pipeline = nlp.Pipeline(
    stages = [
    document,
    chunk,
    token,
    bert_embeddings])

assertion_train_data = assertion_pipeline.fit(training_data).transform(training_data)
assertion_test_data = assertion_pipeline.fit(test_data).transform(test_data)

In [13]:
assertion_test_data.columns

['task_id',
 'sentence',
 'tkn_start',
 'tkn_end',
 'chunk',
 'entity',
 'assertion_label',
 'document',
 'doc_chunk',
 'token',
 'embeddings']

In [14]:
assertion_train_data.write.mode('overwrite').parquet('train_data.parquet')
assertion_test_data.write.mode('overwrite').parquet('test_data.parquet')

                                                                                

In [16]:
assertion_train_data.columns

['task_id',
 'sentence',
 'tkn_start',
 'tkn_end',
 'chunk',
 'entity',
 'assertion_label',
 'document',
 'doc_chunk',
 'token',
 'embeddings']

## Graph setup

We will use TFGraphBuilder annotator which can be used to create graphs in the model training pipeline. 

TFGraphBuilder inspects the data and creates the proper graph if a suitable version of TensorFlow (<= 2.7 ) is available. The graph is stored in the defined folder and loaded by the approach.

In [17]:
graph_folder= "./tf_graphs"

In [18]:
assertion_graph_builder =  finance.TFGraphBuilder()\
    .setModelName("assertion_dl")\
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("assertion_label")\
    .setGraphFolder(graph_folder)\
    .setGraphFile("assertion_graph.pb")\
    .setMaxSequenceLength(1200)\
    .setHiddenUnitsNumber(25)

**Setting the Scope Window (Target Area) Dynamically in Assertion Status Detection Models**


This parameter allows you to train the Assertion Status Models to focus on specific context windows when resolving the status of a NER chunk. The window is in format `[X,Y]` being `X` the number of tokens to consider on the left of the chunk, and `Y` the max number of tokens to consider on the right. Let’s take a look at what different windows mean:


*   By default, the window is `[-1,-1]` which means that the Assertion Status will look at all of the tokens in the sentence/document (up to a maximum of tokens set in `setMaxSentLen()` ).
*   `[0,0]` means “don’t pay attention to any token except the ner_chunk”, what basically is not considering any context for the Assertion resolution.
*   `[9,15]` is what empirically seems to be the best baseline, meaning that we look up to 9 tokens on the left and 15 on the right of the ner chunk to understand the context and resolve the status.


Check this [Scope Window Tuning Assertion Status Detection notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/2.1.Scope_window_tuning_assertion_status_detection.ipynb)  that illustrates the effect of the different windows and how to properly fine-tune your AssertionDLModels to get the best of them.

In our case, the best Scope Window is around [10,10]

In [19]:
scope_window = [50, 50]

assertionStatus = finance.AssertionDLApproach()\
    .setLabelCol("assertion_label")\
    .setInputCols("document", "doc_chunk", "embeddings")\
    .setOutputCol("assertion")\
    .setBatchSize(128)\
    .setLearningRate(0.001)\
    .setEpochs(2)\
    .setStartCol("tkn_start")\
    .setEndCol("tkn_end")\
    .setMaxSentLen(1200)\
    .setEnableOutputLogs(True)\
    .setOutputLogsPath('training_logs/')\
    .setGraphFolder(graph_folder)\
    .setGraphFile(f"{graph_folder}/assertion_graph.pb")\
    .setTestDataset(path="test_data.parquet", read_as='SPARK', options={'format': 'parquet'})\
    .setScopeWindow(scope_window)
    #.setValidationSplit(0.2)\    
    #.setDropout(0.1)\    

In [20]:
assertion_pipeline = nlp.Pipeline(
    stages = [
    #document,
    #chunk,
    #token,
    #embeddings,
    assertion_graph_builder,
    assertionStatus])

In [21]:
training_data.printSchema()

root
 |-- task_id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- tkn_start: long (nullable = true)
 |-- tkn_end: long (nullable = true)
 |-- chunk: string (nullable = true)
 |-- entity: string (nullable = true)
 |-- assertion_label: string (nullable = true)



In [22]:
assertion_train_data = spark.read.parquet('train_data.parquet')

In [None]:
%%time
assertion_model = assertion_pipeline.fit(assertion_train_data)

Checking the results saved in the log file

In [24]:
log_files = os.listdir("training_logs")
log_files

['.ipynb_checkpoints', 'AssertionDLApproach_fed7d5b19726.log']

In [26]:
with open("training_logs/"+log_files[1]) as log_file:
    print(log_file.read())

Name of the selected graph: ./tf_graphs/assertion_graph.pb
Training started, trainExamples: 8050


Epoch: 0 started, learning rate: 0.001, dataset size: 8050
Done, 438.977498886 total training loss: 69.66348, avg training loss: 1.1057695, batches: 63
Quality on test dataset: 
time to finish evaluation: 38.62s
Total test loss: 3.1915	Avg test loss: 0.4559
label	 tp	 fp	 fn	 prec	 rec	 f1
PRESENT	 172	 29	 45	 0.8557214	 0.79262674	 0.8229665
POSSIBLE	 158	 31	 19	 0.83597887	 0.8926554	 0.863388
FUTURE	 71	 18	 52	 0.7977528	 0.57723576	 0.6698113
PAST	 259	 59	 21	 0.8144654	 0.925	 0.8662207
tp: 660 fp: 137 fn: 137 labels: 4
Macro-average	 prec: 0.8259796, rec: 0.7968795, f1: 0.8111686
Micro-average	 prec: 0.8281054, rec: 0.8281054, f1: 0.8281054


Epoch: 1 started, learning rate: 9.5E-4, dataset size: 8050
Done, 433.920165312 total training loss: 25.254559, avg training loss: 0.400866, batches: 63
Quality on test dataset: 
time to finish evaluation: 38.13s
Total test loss: 1.8017	Avg

In [27]:
assertion_test_data = spark.read.parquet('test_data.parquet')

In [None]:
preds = assertion_model.transform(assertion_test_data).select('assertion_label','assertion.result')


In [31]:
preds.show()

[Stage 22:>                                                         (0 + 1) / 1]

+---------------+---------+
|assertion_label|   result|
+---------------+---------+
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|   [PAST]|
|           PAST|[PRESENT]|
|        PRESENT|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|[PRESENT]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|[PRESENT]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|           PAST|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|   [PAST]|
|        PRESENT|   [PAST]|
+---------------+---------+
only showing top 20 rows



                                                                                

In [32]:
preds_df = preds.toPandas()

                                                                                

In [33]:
preds_df["result"] = preds_df["result"].apply(lambda x: x[0] if len(x) else pd.NA)
preds_df.dropna(inplace=True)

preds_df

Unnamed: 0,assertion_label,result
0,PAST,PAST
1,PAST,PAST
2,PAST,PAST
3,PAST,PAST
4,PAST,PAST
...,...,...
792,POSSIBLE,POSSIBLE
793,POSSIBLE,POSSIBLE
794,POSSIBLE,POSSIBLE
795,POSSIBLE,POSSIBLE


In [34]:
# We are going to use sklearn to evalute the results on test dataset
from sklearn.metrics import classification_report

print (classification_report( preds_df['assertion_label'], preds_df['result']))

              precision    recall  f1-score   support

      FUTURE       0.94      0.83      0.88       123
        PAST       0.86      0.97      0.91       280
    POSSIBLE       0.96      0.92      0.94       177
     PRESENT       0.92      0.86      0.89       217

    accuracy                           0.91       797
   macro avg       0.92      0.90      0.91       797
weighted avg       0.91      0.91      0.91       797



## Saving the trained model

In [35]:
assertion_model.stages

[TFGraphBuilderModel_9a578490229e, FINANCE-ASSERTION_DL_f538aa3e83c3]

In [32]:
# Save a Spark NLP model
assertion_model.stages[-1].write().overwrite().save('Assertion')

import shutil
shutil.make_archive('Assertion', 'zip', 'Assertion')

'/home/ubuntu/notebooks/examples/finance/Assertion.zip'