
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)





# Train Finance Assertion


## Setup

In [None]:
from johnsnowlabs import *

import pandas as pd
import json
import os

# Start the Spark session; `spark` is used below to create and read DataFrames.
spark = start_spark()

## Data Prep 

In [None]:
# Download the assertion training CSV into the working directory (-q: no progress output).
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/assertion_df.csv

In [None]:
# Load the downloaded CSV into a pandas DataFrame.
training_df = pd.read_csv('./assertion_df.csv')

In [None]:
# Convert the pandas frame into a Spark DataFrame and preview its first rows.
training_data = spark.createDataFrame(training_df)
training_data.show()

In [None]:
# Print the column names and types of the Spark DataFrame.
training_data.printSchema()

In [None]:
# Count the rows of the full dataset; %time reports how long the call takes.
%time training_data.count()

In [None]:
# Hold out roughly 10% of the rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = training_data.randomSplit([0.9, 0.1], seed=100)

# Report the size of each split (not of the full dataset).
print(f"Training Dataset Count: {train_data.count()}")
print(f"Test Dataset Count: {test_data.count()}")

In [None]:
# Preview the first rows of the training split.
train_data.show()

## Using Bert Embeddings

In [None]:
# Pretrained `bert_embeddings_sec_bert_base` (English) model: reads the
# `document` and `token` columns and writes token vectors to `embeddings`.
bert_embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols("document", "token")
    .setOutputCol("embeddings")
    .setMaxSentenceLength(512)
)

In [None]:
# Wrap the raw `text` column into a `document` annotation.
document = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Build the chunk to classify from the `target` column, positioned inside the
# document via the `start` column (interpreted as a token index, not a
# character offset). Rows whose chunk cannot be located do not fail the job.
chunk = (
    nlp.Doc2Chunk()
    .setInputCols("document")
    .setOutputCol("doc_chunk")
    .setChunkCol("target")
    .setStartCol("start")
    .setStartColByTokenIndex(True)
    .setFailOnMissing(False)
    .setLowerCase(False)
)

# Split each document into tokens for the embeddings stage.
token = (
    nlp.Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

     

We save the test data in parquet format to use in `AssertionDLApproach()`. 

In [None]:
# Feature pipeline: document -> chunk -> tokens -> BERT embeddings.
assertion_pipeline = nlp.Pipeline(
    stages=[
        document,
        chunk,
        token,
        bert_embeddings,
    ]
)

# Fit once on the training split and reuse the fitted pipeline for both
# splits. The training features must come from `train_data` only: building
# them from the full `training_data` would put the held-out test rows into
# the training set and inflate the evaluation scores.
feature_model = assertion_pipeline.fit(train_data)
assertion_train_data = feature_model.transform(train_data)
assertion_test_data = feature_model.transform(test_data)

In [None]:
# List the columns of the transformed test split.
assertion_test_data.columns

In [None]:
# Persist both transformed splits as parquet, replacing any previous output.
# The test parquet path is the one passed to `setTestDataset` further below.
assertion_train_data.write.mode('overwrite').parquet('train_data.parquet')
assertion_test_data.write.mode('overwrite').parquet('test_data.parquet')

In [None]:
# List the columns of the transformed training split.
assertion_train_data.columns

## Graph setup

We will use the `TFGraphBuilder` annotator, which creates the required graph as part of the model training pipeline.

TFGraphBuilder inspects the data and creates the proper graph if a suitable version of TensorFlow (<= 2.7) is available. The graph is stored in the defined folder and loaded by the approach.

In [None]:
# Folder where the generated TensorFlow graph is written and later read from.
graph_folder= "./tf_graphs"

In [None]:
# Graph builder for the `assertion_dl` model; writes `assertion_graph.pb`
# into `graph_folder`.
# NOTE(review): the feature pipeline in this notebook outputs a `document`
# column, not `sentence` - confirm that "sentence" is the intended input col.
assertion_graph_builder = (
    finance.TFGraphBuilder()
    .setModelName("assertion_dl")
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setGraphFolder(graph_folder)
    .setGraphFile("assertion_graph.pb")
    .setMaxSequenceLength(1200)
    .setHiddenUnitsNumber(25)
)

**Setting the Scope Window (Target Area) Dynamically in Assertion Status Detection Models**


This parameter allows you to train the Assertion Status Models to focus on specific context windows when resolving the status of a NER chunk. The window has the format `[X,Y]`, where `X` is the maximum number of tokens to consider to the left of the chunk and `Y` is the maximum number of tokens to consider to the right. Let’s take a look at what different windows mean:


*   By default, the window is `[-1,-1]`, which means that the Assertion Status will look at all of the tokens in the sentence/document (up to the maximum number of tokens set in `setMaxSentLen()`).
*   `[0,0]` means “don’t pay attention to any token except the ner_chunk”, which basically means that no context is considered when resolving the assertion status.
*   `[9,15]` is what empirically seems to be the best baseline, meaning that we look up to 9 tokens on the left and 15 on the right of the ner chunk to understand the context and resolve the status.


Check this [Scope Window Tuning Assertion Status Detection notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/2.1.Scope_window_tuning_assertion_status_detection.ipynb)  that illustrates the effect of the different windows and how to properly fine-tune your AssertionDLModels to get the best of them.

In our case, the best Scope Window is around [10,10]

In [None]:
# Context window as [tokens to the left, tokens to the right] of the chunk.
# NOTE(review): the notebook text reports the best window as around [10,10],
# while [50, 50] is used here - confirm which value is intended.
scope_window = [50, 50]

# Trainer for the assertion status model. It reads the precomputed
# `document`, `doc_chunk` and `embeddings` columns, loads the graph from
# `graph_folder`, evaluates against the saved test parquet, and writes its
# training logs to `training_logs/`.
assertionStatus = (
    finance.AssertionDLApproach()
    .setLabelCol("label")
    .setInputCols("document", "doc_chunk", "embeddings")
    .setOutputCol("assertion")
    .setBatchSize(128)
    .setLearningRate(0.001)
    .setEpochs(2)
    .setStartCol("start")
    .setEndCol("end")
    .setMaxSentLen(1200)
    .setEnableOutputLogs(True)
    .setOutputLogsPath('training_logs/')
    .setGraphFolder(graph_folder)
    .setGraphFile(f"{graph_folder}/assertion_graph.pb")
    .setTestDataset(path="test_data.parquet", read_as='SPARK', options={'format': 'parquet'})
    .setScopeWindow(scope_window)
)
# Disabled options kept for reference: .setValidationSplit(0.2), .setDropout(0.1)

In [None]:
# Training pipeline. The feature stages (document, chunk, token, embeddings)
# are intentionally left out: their output is read back from parquet, so only
# graph building and assertion training run here.
assertion_pipeline = nlp.Pipeline(
    stages=[
        assertion_graph_builder,
        assertionStatus,
    ]
)

In [None]:
# Print the schema of the raw (untransformed) dataset for reference.
training_data.printSchema()

In [None]:
# Reload the precomputed training features from parquet.
assertion_train_data = spark.read.parquet('train_data.parquet')

In [None]:
%%time
# Run the graph builder and train the assertion model on the precomputed
# features (the cell magic above must remain the first line of the cell).
assertion_model = assertion_pipeline.fit(assertion_train_data)

Checking the results saved in the log file

In [None]:
# List the files written to the training log folder (os.listdir returns them
# in arbitrary order).
log_files = os.listdir("training_logs")
log_files

In [None]:
# os.listdir gives no ordering guarantee, so after several training runs
# log_files[0] may be a stale log. Pick the most recently modified file to
# show the log of the latest run.
latest_log_path = max(
    (os.path.join("training_logs", log_name) for log_name in log_files),
    key=os.path.getmtime,
)

with open(latest_log_path, encoding="utf-8") as log_file:
    print(log_file.read())

In [None]:
# Reload the precomputed test features from parquet.
assertion_test_data = spark.read.parquet('test_data.parquet')

In [None]:
# Run the trained model on the test features and keep only the gold label and
# the predicted assertion result.
preds = assertion_model.transform(assertion_test_data).select('label','assertion.result')


In [None]:
# Preview the first rows of gold labels next to the predictions.
preds.show()

In [None]:
# Collect the predictions to the driver as a pandas DataFrame for evaluation.
preds_df = preds.toPandas()

In [None]:
def _first_result(values):
    """Return the first element of *values*, or pd.NA when it is empty."""
    return values[0] if len(values) else pd.NA


# Each `result` cell holds a sequence; keep its first entry and mark rows
# without any prediction as missing so they can be dropped.
preds_df["result"] = preds_df["result"].apply(_first_result)
preds_df.dropna(inplace=True)

preds_df

In [None]:
# Evaluate the predictions on the test dataset with scikit-learn's
# per-label precision / recall / F1 report.
from sklearn.metrics import classification_report

print(classification_report(y_true=preds_df["label"], y_pred=preds_df["result"]))

## Saving the trained model

In [None]:
# Inspect the fitted pipeline stages; the last one is saved in the next cell.
assertion_model.stages

In [None]:
# Save a Spark NLP model
assertion_model.stages[-1].write().overwrite().save('Assertion')

import shutil
shutil.make_archive('Assertion', 'zip', 'Assertion')