# Financial Assertion Status Model 

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Finance/6.AssertionStatus.ipynb)

## Colab Setup

In [None]:
import json
import os
from google.colab import files

# Upload the license JSON through the Colab file picker. The returned mapping
# is keyed by uploaded filename; only the first uploaded file is used.
uploaded_files = files.upload()

if not uploaded_files:
    raise ValueError("No license file was uploaded.")

license_filename = next(iter(uploaded_files))

with open(license_filename) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables
# (globals() is what locals() resolves to at cell scope; being explicit keeps
# this working if the code is ever moved inside a function).
globals().update(license_keys)

# Adding license key-value pairs to environment variables

os.environ.update(license_keys)

In [None]:
# Installing pyspark (pinned to 3.1.2) and spark-nlp (version taken from PUBLIC_VERSION)
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing the licensed spark-nlp-jsl package (version taken from JSL_VERSION)
# from the John Snow Labs package index, whose URL path is the license SECRET
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

## Start Spark Session

In [None]:
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Report the library versions this notebook is running against
spark_nlp_version = sparknlp.version()
spark_nlp_jsl_version = sparknlp_jsl.version()
print("Spark NLP Version :", spark_nlp_version)
print("Spark NLP_JSL Version :", spark_nlp_jsl_version)

# Start the Spark session through sparknlp_jsl, passing the SECRET entry
# read from the uploaded license file
license_secret = license_keys['SECRET']
spark = sparknlp_jsl.start(license_secret)

spark

## Identify COMPETITORS in a Text with Assertion Status

This model uses Assertion Status to identify whether a **PRODUCT** or an **ORG** is mentioned as a competitor. If the context does not indicate that the entity is a competitor, the model returns **NO_COMPETITOR**.

In [None]:
# Wraps the raw "text" column into a "document" annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Sentence Detector annotator: splits each document into sentences
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: splits each sentence into tokens
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token embeddings consumed by both the NER model and the assertion model
embeddings = (
    BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model that tags the tokens of each sentence
ner_model = (
    FinanceNerModel.pretrained("finner_orgs_prods_alias", "en", "finance/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Groups the token-level NER tags into entity chunks
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assigns an assertion status to every NER chunk
assertion = (
    AssertionDLModel.pretrained("finassertion_competitors", "en", "finance/models")
    .setInputCols(["sentence", "ner_chunk", "embeddings"])
    .setOutputCol("assertion")
)

pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
    assertion,
])

# The pipeline is fitted on a placeholder DataFrame holding a single empty
# string, only to obtain a PipelineModel that can transform real data.
empty_df = spark.createDataFrame([[""]]).toDF("text")

model = pipeline.fit(empty_df)

# LightPipeline wrapper used below to annotate plain Python strings
light_model = LightPipeline(model)

### Getting Result 

In [None]:
# Example sentence that explicitly names two companies as competitors
sample_text = """Our competitors include the following by general category: legacy antivirus product providers, such as McAfee LLC and Broadcom Inc."""

In [None]:
# Wrap the sample text in a one-row DataFrame with the "text" column the pipeline reads
data = spark.createDataFrame([[sample_text]]).toDF("text")

In [None]:
# Run the fitted pipeline over the sample DataFrame
result = model.transform(data)

In [None]:
# Zip the chunk texts, chunk metadata and assertion labels element-wise and
# explode them into one row per chunk. The zipped fields are then read by
# position: '0' = chunk text, '1' = chunk metadata, '2' = assertion label.
result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata, result.assertion.result)).alias("cols"))\
      .select(F.expr("cols['1']['sentence']").alias("sent_id"),
              F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label"),
              F.expr("cols['2']").alias("assertion")).show(truncate=False)

+-------+------------+---------+----------+
|sent_id|chunk       |ner_label|assertion |
+-------+------------+---------+----------+
|0      |McAfee LLC  |ORG      |COMPETITOR|
|0      |Broadcom Inc|ORG      |COMPETITOR|
+-------+------------+---------+----------+



                                                                                

### Getting Result with LightPipeline

In [None]:
# Annotate the single sample string; fullAnnotate returns a list, so take its first item
light_result = light_model.fullAnnotate(sample_text)[0]

# Pair each NER chunk with the assertion annotation at the same position
paired_annotations = list(zip(light_result['ner_chunk'], light_result['assertion']))

chunks = [chunk_ann.result for chunk_ann, _ in paired_annotations]
entities = [chunk_ann.metadata['entity'] for chunk_ann, _ in paired_annotations]
status = [assertion_ann.result for _, assertion_ann in paired_annotations]

# One row per chunk: text, NER label and assertion status
df = pd.DataFrame({'chunks': chunks, 'entities': entities, 'assertion': status})

df

Unnamed: 0,chunks,entities,assertion
0,McAfee LLC,ORG,COMPETITOR
1,Broadcom Inc,ORG,COMPETITOR


### Visualization of Assertion Status

In [None]:
from sparknlp_display import AssertionVisualizer

# Render the annotated result, pointing the visualizer at the chunk column
# and the assertion column produced by the pipeline
vis = AssertionVisualizer()
vis.display(
    light_result,
    label_col='ner_chunk',
    assertion_col='assertion',
)

## Writing a Generic Assertion + NER Function

In [None]:
from pyspark.sql.functions import monotonically_increasing_id

def get_base_pipeline(embeddings):
    """Build the shared preprocessing pipeline: document -> sentence -> token -> embeddings.

    Args:
        embeddings: Name of the pretrained embeddings model, passed to
            ``BertEmbeddings.pretrained(embeddings, "en")``.

    Returns:
        An unfitted ``Pipeline`` containing the four preprocessing stages.
    """
    # Every stage is created locally so the function does not depend on
    # annotator objects that happen to exist in the notebook's global scope.
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    # Separate name so the `embeddings` argument (a model name) is not
    # shadowed by the annotator built from it.
    embeddings_model = (
        BertEmbeddings.pretrained(embeddings, "en")
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
    )

    base_pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        embeddings_model,
    ])

    return base_pipeline


def get_assertion (embeddings, ner_model, assertion_model):
    """Build a LightPipeline that runs NER followed by assertion status.

    Args:
        embeddings: Name of the pretrained embeddings model, forwarded to
            ``get_base_pipeline``.
        ner_model: Name of the pretrained NER model, loaded from
            "finance/models".
        assertion_model: Name of the pretrained assertion model, loaded from
            "finance/models".

    Returns:
        A ``LightPipeline`` wrapping the fitted pipeline; its output contains
        the "ner_chunk" and "assertion" columns.
    """
    # Same finance NER annotator class and chunk converter as the explicit
    # pipeline earlier in this notebook, so both code paths load the
    # "finance/models" artifacts the same way.
    ner = FinanceNerModel.pretrained(ner_model, "en", "finance/models")\
        .setInputCols(["sentence", "token", "embeddings"]) \
        .setOutputCol("ner")

    ner_converter = NerConverterInternal() \
        .setInputCols(["sentence", "token", "ner"]) \
        .setOutputCol("ner_chunk")

    assertion = AssertionDLModel.pretrained(assertion_model, "en", "finance/models")\
        .setInputCols(["sentence", "ner_chunk", "embeddings"])\
        .setOutputCol("assertion")

    # The preprocessing pipeline is nested as the first stage
    base_model = get_base_pipeline(embeddings)

    nlpPipeline = Pipeline(stages=[
        base_model,
        ner,
        ner_converter,
        assertion])

    # Fit on a placeholder DataFrame holding a single empty string, only to
    # obtain a PipelineModel
    empty_data = spark.createDataFrame([[""]]).toDF("text")

    model = nlpPipeline.fit(empty_data)

    light_model = LightPipeline(model)

    return light_model
    

### Getting Result with LightPipeline

In [None]:
sample_text = """EDH combines our Cloudera Data Warehouse, Cloudera Operational DB, and Cloudera Data Science with our SDX technology."""

# Names of the pretrained models handed to get_assertion
embeddings = "bert_embeddings_sec_bert_base"
ner_model = "finner_orgs_prods_alias"
assertion_model = "finassertion_competitors"

# Build the LightPipeline and annotate the single sample string
competitor_light_pipeline = get_assertion(embeddings, ner_model, assertion_model)
light_result = competitor_light_pipeline.fullAnnotate(sample_text)[0]

# Pair each NER chunk with the assertion annotation at the same position
paired_annotations = list(zip(light_result['ner_chunk'], light_result['assertion']))

chunks = [chunk_ann.result for chunk_ann, _ in paired_annotations]
entities = [chunk_ann.metadata['entity'] for chunk_ann, _ in paired_annotations]
status = [assertion_ann.result for _, assertion_ann in paired_annotations]

df = pd.DataFrame({'chunks': chunks, 'entities': entities, 'assertion': status})

2022-09-09 13:25:44.274233: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
ScatterAdd: CPU 
RealDiv: CPU 
VariableV2: CPU 
Identity: CPU 
AddV2: CPU 
Unique: CPU 
Shape: CPU 
Sqrt: CPU 
AssignSub: CPU 
Sub: CPU 
Const: CPU 
Fill: CPU 
RandomUniform: CPU 
Mul: CPU 
Assign: CPU 
NoOp: CPU 
GatherV2: CPU 
UnsortedSegmentSum: CPU 
Cast: CPU 
Stride

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]


In [None]:
# Display the chunk / entity / assertion table built in the previous cell
df

Unnamed: 0,chunks,entities,assertion
0,EDH,ORG,NO_COMPETITOR
1,Cloudera Data Warehouse,PRODUCT,NO_COMPETITOR
2,Cloudera Operational DB,PRODUCT,NO_COMPETITOR
3,Cloudera Data Science,PRODUCT,NO_COMPETITOR
4,SDX,PRODUCT,NO_COMPETITOR


### Visualization of Assertion Status

In [None]:
# Render the annotated result, pointing the visualizer at the chunk column
# and the assertion column produced by the pipeline
vis = AssertionVisualizer()
vis.display(
    light_result,
    label_col='ner_chunk',
    assertion_col='assertion',
)

## Identify PAST Situations in a Text with Assertion Status

In [None]:
# Wraps the raw "text" column into a "document" annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# No sentence detector in this pipeline: tokens are produced straight from the document
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Token embeddings, wired into the assertion model below
embeddings = (
    BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Token classifier that writes the "ner" column
tokenClassifier = (
    FinanceBertForTokenClassification.pretrained("finner_bert_roles", "en", "finance/models")
    .setInputCols("token", "document")
    .setOutputCol("ner")
    .setCaseSensitive(True)
)

# Groups token tags into chunks and keeps only the ROLE entities
ner_converter = (
    NerConverterInternal()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["ROLE"])
)

# Assigns an assertion status to every ROLE chunk
assertion = (
    AssertionDLModel.pretrained("finassertion_past_roles", "en", "finance/models")
    .setInputCols(["document", "ner_chunk", "embeddings"])
    .setOutputCol("assertion")
    .setMaxSentLen(1200)
)

stages = [
    document_assembler,
    tokenizer,
    embeddings,
    tokenClassifier,
    ner_converter,
    assertion,
]
nlpPipeline = Pipeline(stages=stages)

# Fit on a placeholder DataFrame holding a single empty string, only to
# obtain a PipelineModel
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

light_model = LightPipeline(model)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]


### Getting Result with LightPipeline

In [None]:
sample_texts = ["""From January 2009 to November 2017, Mr. Tan worked as the Managing Director of Cadence""",
                """Jane S. Smith works as a Computer Engineer and Product Lead at Globalize Cloud Services""",
                """Mrs. Johansson has been apointed CEO and President of Mileways""",
                """Tom Martin worked as Cadence's CTO until 2010""",
                """Mrs. Charles was before Managing Director at a big consultancy company""",
                """We are happy to announce that Mary Leigh joins Elephant as Web Designer and UX/UI Developer"""]

# Annotate the whole list in a single call (one result per input text)
# instead of re-annotating every text on each loop iteration.
light_results = light_model.fullAnnotate(sample_texts)

chunks=[]
entities=[]
status=[]

for light_result in light_results:

    # Pair each NER chunk with the assertion annotation at the same position
    for n,m in zip(light_result['ner_chunk'],light_result['assertion']):

        chunks.append(n.result)
        entities.append(n.metadata['entity']) 
        status.append(m.result)

# One row per chunk across all sample texts
df = pd.DataFrame({'chunks':chunks, 'entities':entities, 'assertion':status})

In [None]:
# Display the chunk / entity / assertion table built in the previous cell
df

Unnamed: 0,chunks,entities,assertion
0,Director,ROLE,PAST
1,Computer Engineer,ROLE,NO_PAST
2,Product Lead,ROLE,NO_PAST
3,CEO,ROLE,NO_PAST
4,President,ROLE,NO_PAST
5,Cadence's CTO,ROLE,PAST
6,Managing Director,ROLE,PAST
7,Web Designer,ROLE,NO_PAST
8,UX/UI Developer,ROLE,NO_PAST


### Visualization of Assertion Status

In [None]:
# A single visualizer instance is reused for every text
vis = AssertionVisualizer()

# Annotate the whole list once (one result per input text) rather than
# re-running the pipeline over all texts on each iteration.
for light_result in light_model.fullAnnotate(sample_texts):

    vis.display(light_result, 'ner_chunk', 'assertion')