# Financial Data Augmentation with Chunk Mappers

# Colab Setup

In [2]:
# Load the license keys from the JSON file and expose them to this session.
import sys
import json
import os

# The path is relative to the notebook's working directory.
with open('../../spark_nlp_for_healthcare_spark_ocr_4.0.2.json') as f:
    license_keys = json.load(f)

# Publish every license entry as a notebook-level variable. `globals()` is
# used rather than `locals()`: writing to the mapping returned by `locals()`
# is not a supported way to create variables, whereas at cell (module) scope
# `globals()` is the real namespace dictionary.
globals().update(license_keys)
# Also export the same entries as environment variables.
os.environ.update(license_keys)
# Presumably set to quiet TensorFlow's native (C++) logging -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
import sparknlp
import sparknlp_jsl

# Show which versions of the two libraries are installed in this kernel.
print(f"sparknlp version: {sparknlp.version()}")
print(f"sparknlp_jsl version: {sparknlp_jsl.version()}")

sparknlp version: 4.0.2
sparknlp_jsl version: 4.0.2


# Start Spark Session

In [4]:
# Imports used to build the Spark ML / Spark NLP pipelines below, followed by
# the Spark session start-up.
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# NOTE(review): the wildcard imports below make it hard to trace where later
# names (annotators, LightPipeline, ...) come from, and their relative order
# decides which module wins on a name clash -- keep the order unchanged.
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import pandas as pd

import warnings
# Silence all Python warnings from this point on.
warnings.filterwarnings('ignore')

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the Spark session, passing the SECRET entry loaded from the license JSON.
spark = sparknlp_jsl.start(license_keys['SECRET'])

# Bare expression so the notebook displays the session object.
spark

Spark NLP Version : 4.0.2
Spark NLP_JSL Version : 4.0.2
:: loading settings :: url = jar:file:/home/jovyan/work/persistent/.local/share/jupyter/venvs/spanishdeid/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ef109a49-aa02-45fb-a4cd-6f0a75765576;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.0.2 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lom

# About Data Augmentation

__Data Augmentation__ is the process of enriching an extracted data point with information from external sources.

For example, let's suppose I work with a document which mentions the company _Amazon_. We could be talking about stock prices, or some legal litigations, or just a commercial agreement with a provider, among others.

In the document, we can extract `Amazon` using NER as an Organization, but that's all the information available about `Amazon` in that document.

Well, with __Data Augmentation__, we can use external sources, such as _SEC Edgar, Crunchbase, Nasdaq_ or even _Wikipedia_, to enrich `Amazon` with much more information, allowing us to make better decisions.

Let's see how to do it.

# Step 1: Named Entity Recognition

Let's suppose we get this news from scraping the Internet, or from Twitter.

In [24]:
# Sample sentence used as input for the NER and resolution steps below.
text = "We have entered into a definitive merger agreement with Amazon."

We use NER to extract the company's name, in this case, Amazon.

In [25]:
# NER pipeline: raw text -> document -> sentences -> tokens -> embeddings
# -> NER tags -> NER chunks.

# Wraps the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained sentence splitter ("xx" language code).
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level embeddings consumed by the NER model.
embeddings = (
    BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

ner_model = (
    LegalNerModel.pretrained("legner_orgs_prods_alias", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Groups the per-token NER tags into entity chunks.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
    ]
)

# Every stage is pretrained or rule-based, so the pipeline is fitted on a
# single empty row just to obtain a PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]


## We use LightPipelines to get the result

In [26]:
# Wrap the fitted NER pipeline in a LightPipeline so it can annotate plain strings.
lp_ner = LightPipeline(model)

In [27]:
# Annotate the sample sentence; the result is a dict keyed by output column name.
ner_result = lp_ner.annotate(text)
ner_result

{'document': ['We have entered into a definitive merger agreement with Amazon.'],
 'ner_chunk': ['Amazon'],
 'token': ['We',
  'have',
  'entered',
  'into',
  'a',
  'definitive',
  'merger',
  'agreement',
  'with',
  'Amazon',
  '.'],
 'ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O'],
 'embeddings': ['We',
  'have',
  'entered',
  'into',
  'a',
  'definitive',
  'merger',
  'agreement',
  'with',
  'Amazon',
  '.'],
 'sentence': ['We have entered into a definitive merger agreement with Amazon.']}

Alright! Amazon has been detected as an organization. 

Now, let's augment `Amazon` with more information about the company, given that there are no more details in the tweet I can use.

But before __augmenting__, there is a very important step we need to carry out: `Company Name Normalization`

# Step 2: Company Names Normalization

Let's suppose we want to manually get information about Amazon.

Since it's a public US company, we can go to [SEC Edgar's database](https://www.sec.gov/edgar/searchedgar/companysearch) and look for it.

Unfortunately, `Amazon` is not the official name of the company, which means no entry for `Amazon` is available. That's where __Company Names Normalization__ comes in handy.

`Company Name Normalization` is the process of obtaining the name of the company used by data providers, usually the "official" name of the company.

Sometimes, some data providers may have different versions of the name with different punctuation. For example, for Meta:
- Meta Platforms, Inc.
- Meta Platforms Inc.
- Meta Platforms, Inc
- etc

So, it's mandatory we do `Company Normalization` taking into account the database / datasource provider we want to extract data from. The data providers we have are:
- SEC Edgar
- Crunchbase until 2015
- Wikidata (in progress)

Let's normalize `Amazon` to the official name in _SEC Edgar_.

In [28]:
# Company-name normalization pipeline: document -> sentence embeddings -> resolver.

# NOTE: this rebinds `embeddings`, which earlier held the BERT token-embeddings
# stage of the NER pipeline.
embeddings = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Resolves the embedded text to a company name from the pretrained Edgar model.
resolver = (
    SentenceEntityResolverModel.pretrained("legel_edgar_company_name", "en", "legal/models")
    .setInputCols(["text", "sentence_embeddings"])
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")
)

# All stages are already-fitted transformers, so a PipelineModel is assembled
# directly without calling fit().
pipelineModel = PipelineModel(stages=[documentAssembler, embeddings, resolver])

lp_res = LightPipeline(pipelineModel)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [29]:
# The organization chunks extracted in Step 1; these are the inputs to normalize.
ner_result['ner_chunk']

['Amazon']

In [30]:
# annotate() receives a list of chunk strings here, so it returns a list with
# one result dict per chunk.
el_res = lp_res.annotate(ner_result['ner_chunk'])
el_res

[{'document': ['Amazon'],
  'sentence_embeddings': ['Amazon'],
  'resolution': ['AMAZON COM INC']}]

Here is our normalized name for Amazon: `AMAZON COM INC`.

Now, let's see which information is available in Edgar database for `AMAZON COM INC`

# Steps 1 and 2 in the same pipeline

In [31]:
# Single pipeline covering Step 1 (NER) and Step 2 (company-name normalization):
# text -> document -> sentences -> tokens -> embeddings -> NER chunks
# -> chunk documents -> sentence embeddings -> resolved company name.

# Wraps the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token-level embeddings consumed by the NER model. This must be redefined
# here because `embeddings` was rebound to the sentence encoder in Step 2.
embeddings = (
    BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Loaded through LegalNerModel, matching the Step 1 pipeline: the model lives
# in the "legal/models" repository, so both cells now use the same annotator.
ner_model = (
    LegalNerModel.pretrained("legner_orgs_prods_alias", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Groups the per-token NER tags into entity chunks.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Turns each NER chunk into its own document so it can be embedded on its own.
chunk2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

sentence_embeddings = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols("ner_chunk_doc")
    .setOutputCol("sentence_embeddings")
)

# Resolves each embedded chunk to a company name from the pretrained Edgar model.
resolver = (
    SentenceEntityResolverModel.pretrained("legel_edgar_company_name", "en", "legal/models")
    .setInputCols(["text", "sentence_embeddings"])
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        chunk2doc,
        sentence_embeddings,
        resolver,
    ]
)

# Every stage is pretrained or rule-based, so the pipeline is fitted on a
# single empty row just to obtain a PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [32]:
# Wrap the combined NER + resolution pipeline in a LightPipeline for string input.
lp_model = LightPipeline(model)

In [33]:
# Annotate the sample sentence; the 'resolution' key of the resulting dict
# holds the normalized company name(s).
el_res = lp_model.annotate(text)
el_res

{'document': ['We have entered into a definitive merger agreement with Amazon.'],
 'ner_chunk': ['Amazon'],
 'sentence_embeddings': ['Amazon'],
 'resolution': ['AMAZON COM INC'],
 'token': ['We',
  'have',
  'entered',
  'into',
  'a',
  'definitive',
  'merger',
  'agreement',
  'with',
  'Amazon',
  '.'],
 'ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O'],
 'embeddings': ['We',
  'have',
  'entered',
  'into',
  'a',
  'definitive',
  'merger',
  'agreement',
  'with',
  'Amazon',
  '.'],
 'ner_chunk_doc': ['Amazon'],
 'sentence': ['We have entered into a definitive merger agreement with Amazon.']}

# Step 3: Data Augmentation with Chunk Mappers

The component which carries out __Data Augmentation__ is called `ChunkMapper`.

Its name comes from the way it works: it uses a _Ner Chunk_ to map it to an external data source.

As a result, you will get a JSON with a dictionary of additional fields and their values. 

Let's take a look at how it works.

In [34]:
# Chunk-mapper pipeline: text -> document -> chunk -> mapped fields.

# Converts the whole document into a single chunk (setIsArray(False)).
chunkAssembler = (
    Doc2Chunk()
    .setInputCols("document")
    .setOutputCol("chunk")
    .setIsArray(False)
)

# `pretrained` is called on the class, like every other pretrained model in
# this notebook, instead of on a throwaway `ChunkMapperModel()` instance.
CM = (
    ChunkMapperModel.pretrained("legmapper_edgar_companyname", "en", "legal/models")
    .setInputCols(["chunk"])
    .setOutputCol("mappings")
)

# Reuses `documentAssembler` and `empty_data` defined in the earlier cells.
cm_pipeline = Pipeline(stages=[documentAssembler, chunkAssembler, CM])
fit_cm_pipeline = cm_pipeline.fit(empty_data)

                                                                                

In [35]:
# LightPipelines don't support Doc2Chunk, so we use a regular DataFrame
# transform here instead.

# Build one row per normalized company name. Wrapping the whole list as a
# single row (`[el_res['resolution']]`) only works when exactly one company was
# resolved: with several names it yields one row with several columns, which
# cannot be named by a single "text" column. The explicit schema also lets an
# empty resolution list produce an empty DataFrame instead of an error.
df = spark.createDataFrame(
    [[name] for name in el_res['resolution']],
    schema="text: string",
)
df.show()

+--------------+
|          text|
+--------------+
|AMAZON COM INC|
+--------------+



                                                                                

In [36]:
# Run the chunk-mapper pipeline on the normalized names and preview the output columns.
res = fit_cm_pipeline.transform(df)
res.show()

+--------------+--------------------+--------------------+--------------------+
|          text|            document|               chunk|            mappings|
+--------------+--------------------+--------------------+--------------------+
|AMAZON COM INC|[{document, 0, 13...|[{chunk, 0, 13, A...|[{labeled_depende...|
+--------------+--------------------+--------------------+--------------------+



In [37]:
# Collect the rows to the driver so the mapping annotations can be read in Python.
r = res.collect()
r

[Row(text='AMAZON COM INC', document=[Row(annotatorType='document', begin=0, end=13, result='AMAZON COM INC', metadata={'sentence': '0'}, embeddings=[])], chunk=[Row(annotatorType='chunk', begin=0, end=13, result='AMAZON COM INC', metadata={'sentence': '0', 'chunk': '0'}, embeddings=[])], mappings=[Row(annotatorType='labeled_dependency', begin=0, end=13, result='AMAZON COM INC', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMAZON COM INC', 'relation': 'name', 'all_relations': ''}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=13, result='RETAIL-CATALOG & MAIL-ORDER HOUSES [5961]', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMAZON COM INC', 'relation': 'sic', 'all_relations': '[5961'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0, end=13, result='5961', metadata={'sentence': '0', 'chunk': '0', 'entity': 'AMAZON COM INC', 'relation': 'sic_code', 'all_relations': '0'}, embeddings=[]), Row(annotatorType='labeled_dependency', begin=0

In [38]:
# Flatten the first row's mapping annotations into {relation name: value}.
# If two annotations share a relation, the later one wins.
json_dict = {
    mapping.metadata['relation']: str(mapping.result)
    for mapping in r[0]['mappings']
}

In [39]:
# NOTE(review): `json` is already imported in the first cell; this repeat is redundant.
import json
# Pretty-print the augmented company record with keys in alphabetical order.
print(json.dumps(json_dict, indent=4, sort_keys=True))

{
    "business_city": "SEATTLE",
    "business_phone": "2062661000",
    "business_state": "WA",
    "business_street": "410 TERRY AVENUE NORTH",
    "business_zip": "98109",
    "company_id": "1018724",
    "date": "2017-02-10",
    "fiscal_year_end": "1231",
    "former_name": "ABX Holdings, Inc.",
    "former_name_date": "20080102",
    "irs_number": "911646860",
    "name": "AMAZON COM INC",
    "sic": "RETAIL-CATALOG & MAIL-ORDER HOUSES [5961]",
    "sic_code": "5961",
    "state_incorporation": "DE",
    "state_location": "WA"
}
