# Financial Solution Accelerator: Drawing a Company Ecosystem
This accelerator will help you process Financial Annual Reports (10K filings) or even Wikipedia data about companies, using John Snow Labs Finance NLP, to extract the following information about companies:
- Information about the Company itself (`Trading Symbol`, `State`, `Address`, Contact Information) and other names the Company is known by (`alias`, `former name`).
- Other Companies mentioned in the report as `competitors`: we will also run a "Competitor check" to determine whether another company is merely part of the company's ecosystem / supply chain or is really a competitor
- People (usually management and C-level) working in that company and their past experiences, including roles and companies
- `Acquisitions` events, including the acquisition dates. `Subsidiaries` mentioned.
- Temporality (`past`, `present`, `future`) and Certainty (`possible`) of events described, including `Forward-looking statements`.

The final aim of this accelerator is to help you analyze company information...

<img src="https://github.com/JohnSnowLabs/spark-nlp-workshop/raw/master/tutorials/Certification_Trainings_JSL/Finance/data/im1.png" alt="drawing" width="600"/>

... create a graph...

<img src="https://github.com/JohnSnowLabs/spark-nlp-workshop/raw/master/tutorials/Certification_Trainings_JSL/Finance/data/img6.png" alt="drawing" width="400"/>

... and even run Graph Embeddings on top of the graph you extract (for example, to infer new relations to the green nodes given the grey ones in the picture).

<img src="https://github.com/JohnSnowLabs/spark-nlp-workshop/raw/master/tutorials/Certification_Trainings_JSL/Finance/data/im4.png" alt="drawing" width="400"/>

# Get Started with Databricks Partner Connect with John Snow Labs
John Snow Labs Spark Finance NLP Library, integrated in Databricks.
Ask for your license [here](https://docs.databricks.com/integrations/ml/john-snow-labs.html)

# Starting a session

In Databricks you will already have a Spark session started for you. 

If it's not the case, you only need to do:
`jsl.start(license_json_path=[your_path_to_json_license])`

# Imports

In [0]:
from johnsnowlabs import * 

In [0]:
import os
import sys
import time
import json
import neo4j
import functools 
import graphistry
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import spatial
from neo4j import GraphDatabase
from neo4j import GraphDatabase, Driver

# Report the installed versions of the two graph libraries used below.
for _lib_name, _lib in (('neo4j', neo4j), ('graphistry', graphistry)):
    print(_lib_name, _lib.__version__)

### Neo4j and pygraphistry aux functions

In [0]:
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The driver is created in the constructor. A failure to create it is
    printed rather than raised, which leaves the driver unset; ``query``
    then fails its "Driver not initialized!" assertion.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Args:
            uri: Address handed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            pwd: Password used for authentication.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None

        try:
            credentials = (self.__user, self.__pwd)
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run a query and return its records as a list.

        Args:
            query: Query text passed to ``session.run``.
            parameters: Optional query parameters passed to ``session.run``.
            db: Optional database name; when ``None`` the session is opened
                without a ``database`` argument.

        Returns:
            The list of records, or ``None`` when opening the session or
            running the query raised (the error is printed, not re-raised).
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None

        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            # Always release the session, whether or not the query succeeded.
            if session is not None:
                session.close()
        return response


In [0]:
def update_data(query, rows, batch_size = 10000):
    """Run ``query`` against Neo4j once per batch of ``rows``.

    Uses the module-level ``conn`` connection. Each batch is sent as the
    ``rows`` query parameter (a list of records), and the ``total`` field of
    the first returned record is accumulated.

    Args:
        query: Query text that consumes a ``$rows`` parameter and returns a
            ``total`` column.
        rows: Object supporting ``len()`` and slicing whose slices expose
            ``to_dict('records')`` (presumably a pandas DataFrame).
        batch_size: Maximum number of rows sent per query; must be positive.

    Returns:
        A dict with keys ``total``, ``batches`` and ``time`` describing the
        last completed batch, or ``None`` when ``rows`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive (it would never advance
            through ``rows``).
        RuntimeError: If a batch returns no records, e.g. because
            ``conn.query`` printed an error and returned ``None``.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total = 0
    batch = 0
    start = time.time()
    result = None

    while batch * batch_size < len(rows):

        chunk = rows[batch*batch_size:(batch+1)*batch_size]
        res = conn.query(query, parameters={'rows': chunk.to_dict('records')})
        if not res:
            # Fail with a clear message instead of an opaque error on res[0].
            raise RuntimeError(
                f"Neo4j query returned no result for batch {batch}; "
                "see the 'Query failed' message above, if any."
            )
        total += res[0]['total']
        batch += 1
        result = {"total":total, "batches":batch, "time":time.time()-start}
        print(result)

    return result

In [0]:
def add_ners_rels(rows, batch_size=10000):
    """Write relation rows to Neo4j as NER nodes joined by LINKS edges.

    For every row, a node is merged for ``chunk1`` and for ``chunk2`` (their
    ``type`` is set from ``entity1`` / ``entity2`` on creation only) and a
    ``LINKS`` relationship carrying ``row.relation`` is merged between them.
    The work is delegated to ``update_data``, whose result is returned.
    """
    cypher = '''
    //chunk1 NERs
    UNWIND $rows as row
    MERGE(n1:NER{name:row.chunk1}) ON CREATE SET n1.type=row.entity1
    
    //chunk2 NERs
    MERGE(n2:NER{name:row.chunk2}) ON CREATE SET n2.type=row.entity2

    //connect NERs
    WITH row, n1, n2
    MERGE (n1)-[:LINKS{relation:row.relation}]->(n2)

    WITH n1
    MATCH (n1)
    RETURN count(*) as total  
    '''
    return update_data(cypher, rows, batch_size)

In [0]:
# Connection settings for the Neo4j instance used by this notebook.
# NOTE(review): the host and password are hard-coded in the notebook. Each
# value can be overridden through an environment variable so secrets do not
# have to be edited into (or committed with) the source; the literals remain
# only as fallbacks so the notebook behaves as before when nothing is set.
uri = os.environ.get('NEO4J_URI', 'bolt://44.212.39.47:7687')
pwd = os.environ.get('NEO4J_PASSWORD', 'november-totals-staplers')
user = os.environ.get('NEO4J_USER', 'neo4j')

conn = Neo4jConnection(uri=uri, user=user , pwd=pwd)

# Relation Extraction Pipeline

### Create Generic Base Pipeline

In [0]:
def generic_base_pipeline():
    """Build the preprocessing pipeline shared by the pipelines in this notebook.

    Stages, in order: document assembly ("text" -> "document"), sentence
    detection ("sentence"), tokenization ("token") and BERT embeddings
    ("embeddings").

    Returns:
        An unfitted ``Pipeline`` holding the four stages.
    """
    stages = [
        # Raw text column -> document annotation.
        (
            nlp.DocumentAssembler()
            .setInputCol("text")
            .setOutputCol("document")
        ),
        # Document -> sentences.
        (
            nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl","en")
            .setInputCols(["document"])
            .setOutputCol("sentence")
        ),
        # Sentences -> tokens.
        (
            nlp.Tokenizer()
            .setInputCols(["sentence"])
            .setOutputCol("token")
        ),
        # Token embeddings consumed by the downstream NER/assertion models.
        (
            nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en")
            .setInputCols(["sentence", "token"])
            .setOutputCol("embeddings")
        ),
    ]

    return Pipeline(stages=stages)
    

In [0]:
# --- NER models, each followed by a converter that produces chunks ---------
# Note: no chain below ends with a trailing "\"; a dangling continuation
# would silently glue the next physical line onto the statement.
ner_model_role = finance.NerModel.pretrained("finner_org_per_role_date", "en", "finance/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_role")

ner_converter_role = nlp.NerConverter()\
    .setInputCols(["sentence","token","ner_role"])\
    .setOutputCol("ner_chunk_role")

ner_model_alias = finance.NerModel.pretrained("finner_orgs_prods_alias","en","finance/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_alias")

ner_converter_alias = nlp.NerConverter()\
    .setInputCols(["sentence","token","ner_alias"])\
    .setOutputCol("ner_chunk_alias")

ner_model_ticker = finance.NerModel.pretrained("finner_ticker", "en", "finance/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_ticker")

ner_converter_ticker = nlp.NerConverter()\
    .setInputCols(["sentence", "token", "ner_ticker"])\
    .setOutputCol("ner_chunk_ticker")

# Merge the three chunk columns into the single "ner_chunk" column.
chunk_merger = finance.ChunkMergeApproach()\
    .setInputCols("ner_chunk_alias", "ner_chunk_role", "ner_chunk_ticker")\
    .setOutputCol('ner_chunk')\
    .setMergeOverlapping(True)

# --- Syntax: POS tags and dependency parse, inputs of the chunk filters ----
pos = PerceptronModel.pretrained()\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("pos")

# pretrained() is called on the class, as for the other models in this
# notebook, instead of on a throwaway instance.
dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en")\
    .setInputCols(["sentence", "pos", "token"])\
    .setOutputCol("dependencies")

# --- Candidate pair filters: one per relation model ------------------------
# Each filter keeps only the listed entity-label pairs, with a maximum
# syntactic distance of 5.
re_ner_chunk_filter_ticker = finance.RENerChunksFilter()\
    .setInputCols(["ner_chunk", "dependencies"])\
    .setOutputCol("re_ner_chunk_ticker")\
    .setRelationPairs(["ORG-TICKER"])\
    .setMaxSyntacticDistance(5)

re_ner_chunk_filter_alias = finance.RENerChunksFilter()\
    .setInputCols(["ner_chunk", "dependencies"])\
    .setOutputCol("re_ner_chunk_alias")\
    .setRelationPairs(["ORG-ALIAS"])\
    .setMaxSyntacticDistance(5)

re_ner_chunk_filter_role = finance.RENerChunksFilter()\
    .setInputCols(["ner_chunk", "dependencies"])\
    .setOutputCol("re_ner_chunk_role")\
    .setRelationPairs(["PERSON-ROLE", "ORG-ROLE", "DATE-ROLE"])\
    .setMaxSyntacticDistance(5)

re_ner_chunk_filter_acq = finance.RENerChunksFilter()\
    .setInputCols(["ner_chunk", "dependencies"])\
    .setOutputCol("re_ner_chunk_acq")\
    .setRelationPairs(["DATE-ORG", "DATE-ALIAS", "DATE-PRODUCT", "ORG-ORG"])\
    .setMaxSyntacticDistance(5)

# --- Relation extraction models, one per filtered chunk column -------------
re_model_exp = finance.RelationExtractionDLModel.pretrained("finre_work_experience", "en", "finance/models")\
    .setInputCols(["re_ner_chunk_role", "sentence"])\
    .setOutputCol("relations_role")\
    .setPredictionThreshold(0.5)

re_model_ticker = finance.RelationExtractionDLModel.pretrained("finre_has_ticker", "en", "finance/models")\
    .setInputCols(["re_ner_chunk_ticker", "sentence"])\
    .setOutputCol("relations_ticker")\
    .setPredictionThreshold(0.2)

re_model_acq = finance.RelationExtractionDLModel.pretrained("finre_acquisitions_subsidiaries", "en", "finance/models")\
    .setInputCols(["re_ner_chunk_acq", "sentence"])\
    .setOutputCol("relations_acq")\
    .setPredictionThreshold(0.5)

re_model_alias = finance.RelationExtractionDLModel.pretrained("finre_org_prod_alias", "en", "finance/models")\
    .setPredictionThreshold(0.5)\
    .setInputCols(["re_ner_chunk_alias", "sentence"])\
    .setOutputCol("relations_alias")

# Combine the four relation columns into the single "relations" column.
annotation_merger = finance.AnnotationMerger()\
    .setInputCols("relations_alias", "relations_role", "relations_acq", "relations_ticker")\
    .setInputType("ner_chunk")\
    .setOutputCol("relations")

nlpPipeline = Pipeline(stages=[
    generic_base_pipeline(),
    ner_model_role,
    ner_converter_role,
    ner_model_alias,
    ner_converter_alias,
    ner_model_ticker,
    ner_converter_ticker,
    chunk_merger,
    pos,
    dependency_parser,
    re_ner_chunk_filter_ticker,
    re_ner_chunk_filter_alias,
    re_ner_chunk_filter_role,
    re_ner_chunk_filter_acq,
    re_model_exp,
    re_model_ticker,
    re_model_acq,
    re_model_alias,
    annotation_merger
])

# Fit on a one-row DataFrame holding an empty string.
# NOTE(review): presumably no stage learns from data here — TODO confirm.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)


### Get Relations As a Dataframe

In [0]:
import pandas as pd

def get_relations_df(results, col='relations'):
    """Flatten the relation annotations of the first result into a DataFrame.

    Args:
        results: Sequence of annotation results; only ``results[0]`` is read.
        col: Key under which the relation annotations are stored.

    Returns:
        A DataFrame with one row per annotation: its ``result`` in the
        ``relation`` column, followed by the entity, offset, chunk and
        confidence values taken from the annotation's ``metadata``.
    """
    # Metadata keys, in output-column order (right after 'relation').
    metadata_fields = (
        'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
        'entity2', 'entity2_begin', 'entity2_end', 'chunk2',
        'confidence',
    )

    records = [
        (annotation.result, *(annotation.metadata[field] for field in metadata_fields))
        for annotation in results[0][col]
    ]

    return pd.DataFrame(records, columns=['relation', *metadata_fields])

### Sample Texts

In [0]:
sample_text = ["""On January 15, 2020, Cadence acquired all of the outstanding equity of AWR Corporation ("AWR"). On February 6, 2020, Cadence also acquired all of the outstanding equity of Integrand Software Inc.""",
               
"""Davin W. Cushman has been a director since November 2018. Mr. Cushman currently serves as CEO of Brightrose Software, a private, acquisition-focused growth company launched in 2021 from Cushman Management Company.""",
              
"""MTH - Meritage Homes Corporation reports disappointing revenue. RECN - Resources Connection Inc. shareholder Raymond James Trust has decreased holding."""]
            

### Get Result

In [0]:
light_model = LightPipeline(model)

# Annotate every sample text and gather one relation table per text. The
# tables are concatenated once at the end rather than growing a DataFrame
# (starting from an empty one) inside the loop.
relation_frames = []
for text in sample_text:
    result = light_model.fullAnnotate(text)
    relation_frames.append(get_relations_df(result))

# pd.concat rejects an empty list, so fall back to an empty DataFrame.
rel_df = pd.concat(relation_frames, axis=0, ignore_index=True) if relation_frames else pd.DataFrame()

# Displayed only: this filtered view is not assigned back to rel_df.
rel_df[rel_df["relation"] != "no_rel"]

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence
0,has_alias,ORG,71,85,AWR Corporation,ALIAS,89,91,AWR,0.94383454
1,was_acquired,DATE,3,18,"January 15, 2020",ORG,21,27,Cadence,0.9389733
2,was_acquired,DATE,99,114,"February 6, 2020",ORG,117,123,Cadence,0.9178721
3,was_acquired,DATE,99,114,"February 6, 2020",ORG,172,193,Integrand Software Inc,0.8110341
4,was_acquired_by,ORG,117,123,Cadence,ORG,172,193,Integrand Software Inc,0.8877777
5,has_role,PERSON,0,15,Davin W. Cushman,ROLE,28,35,director,0.93061197
6,has_role_from,ROLE,28,35,director,DATE,43,55,November 2018,0.93675715
7,has_role,PERSON,58,68,Mr. Cushman,ROLE,90,92,CEO,0.75338876
8,has_role_in_company,ROLE,90,92,CEO,ORG,97,115,Brightrose Software,0.9716381
9,has_role_from,ROLE,90,92,CEO,DATE,176,179,2021,0.87006676


### Visualize Results

In [0]:
from sparknlp_display import RelationExtractionVisualizer

re_vis = viz.RelationExtractionVisualizer()

# Render the relations found in each sample text, leaving out "no_rel" pairs.
for text in sample_text:
    result = light_model.fullAnnotate(text)
    relations_html = re_vis.display(
        result = result[0],
        relation_col = "relations",
        document_col = "document",
        exclude_relations = ["no_rel"],
        return_html=True,
        show_relations=True,
    )
    displayHTML(relations_html)

In [0]:
# Uniqueness constraint `ners` on the `name` property of NER nodes,
# created only if it does not exist yet.
const_ners = 'CREATE CONSTRAINT ners IF NOT EXISTS ON (n:NER) ASSERT n.name IS UNIQUE'
conn.query(const_ners)

In [0]:
# Wipes the graph: matches every node and deletes it with its relationships.
delete_all_nodes = 'MATCH (n) DETACH DELETE n;'
conn.query(delete_all_nodes)

In [0]:
# Load only real relations: rel_df still contains the "no_rel" rows (they
# were filtered for display only), and those would otherwise be written to
# the graph as LINKS edges.
add_ners_rels(rel_df[rel_df["relation"] != "no_rel"])

![graph.png](/files/FINLEG/graph_relation.png)

# Assertion Status Pipeline for Identifying COMPETITORS

In [0]:
# NER model whose chunks are restricted to ORG and PRODUCT by the converter.
ner = (
    finance.NerModel.pretrained("finner_orgs_prods_alias", "en", "finance/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['ORG', 'PRODUCT'])
)

# Assertion model applied to every kept chunk.
assertion = (
    finance.AssertionDLModel.pretrained("finassertion_competitors", "en", "finance/models")
    .setInputCols(["sentence", "ner_chunk", "embeddings"])
    .setOutputCol("assertion")
)

competitor_stages = [generic_base_pipeline(), ner, ner_converter, assertion]
nlpPipeline = Pipeline(stages=competitor_stages)

# Fit on a one-row DataFrame holding an empty string, then wrap the fitted
# model in a LightPipeline for the annotation calls below.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)
light_model = LightPipeline(model)

### Get Result

In [0]:
sample_text = ["""In the rapidly evolving “Ideation” market, certain elements of our application compete with Microsoft, Google, InFocus, Bluescape, Mersive, Barco, Nureva and Prysm. But, Oracle  and IBM are out of our league."""]

light_result = light_model.fullAnnotate(sample_text)[0]

# Pair each NER chunk with the assertion at the same position, then split the
# pairs into the three columns of the result table.
paired = list(zip(light_result['ner_chunk'], light_result['assertion']))
chunks = [chunk_ann.result for chunk_ann, _ in paired]
entities = [chunk_ann.metadata['entity'] for chunk_ann, _ in paired]
status = [assertion_ann.result for _, assertion_ann in paired]

df = pd.DataFrame({'chunks':chunks, 'entities':entities, 'assertion':status})

In [0]:
df

Unnamed: 0,chunks,entities,assertion
0,Microsoft,ORG,COMPETITOR
1,Google,ORG,COMPETITOR
2,InFocus,ORG,COMPETITOR
3,Bluescape,ORG,COMPETITOR
4,Mersive,ORG,COMPETITOR
5,Barco,ORG,COMPETITOR
6,Nureva,ORG,COMPETITOR
7,Prysm,ORG,COMPETITOR
8,Oracle,ORG,NO_COMPETITOR
9,IBM,ORG,NO_COMPETITOR


### Visualize Assertion Result

In [0]:
# Annotate the sample text again, then render chunks with their assertion
# labels using one fixed colour per label.
light_result = light_model.fullAnnotate(sample_text)[0]

vis = viz.AssertionVisualizer()
vis.set_label_colors({'COMPETITOR':'#008080', 'NO_COMPETITOR':'#800080'})

assertion_html = vis.display(light_result, 'ner_chunk', 'assertion', return_html=True)
displayHTML(assertion_html)


In [0]:
def add_ners_assertion(rows, batch_size=10000):
    """Write assertion rows to Neo4j as NER nodes joined by LINKS edges.

    For every row, a node is merged for ``row.chunks`` and for
    ``row.entities`` and a ``LINKS`` relationship carrying ``row.assertion``
    is merged from the former to the latter. The work is delegated to
    ``update_data``, whose result is returned.

    NOTE(review): on creation the chunk node's ``type`` is set to the chunk
    text itself (``row.chunks``) — confirm this is intended.
    """
    cypher = '''
    //chunk1 NERs
    UNWIND $rows as row
    MERGE(n1:NER{name:row.chunks}) ON CREATE SET n1.type=row.chunks
    
    //chunk2 NERs
    MERGE(n2:NER{name:row.entities}) ON CREATE SET n2.type=row.entities

    //connect NERs
    WITH row, n1, n2
    MERGE (n1)-[:LINKS{relation:row.assertion}]->(n2)

    WITH n1
    MATCH (n1)
    RETURN count(*) as total  
    '''
    return update_data(cypher, rows, batch_size)

In [0]:
# Wipes the graph: matches every node and deletes it with its relationships.
delete_all_nodes = 'MATCH (n) DETACH DELETE n;'
conn.query(delete_all_nodes)

In [0]:
# Uniqueness constraint `ners` on the `name` property of NER nodes,
# created only if it does not exist yet.
const_ners = 'CREATE CONSTRAINT ners IF NOT EXISTS ON (n:NER) ASSERT n.name IS UNIQUE'
conn.query(const_ners)

In [0]:
# Write the chunk / entity / assertion table `df` to Neo4j.
add_ners_assertion(df)

![graph.png](/files/FINLEG/graph_assertion1.png)

# Assertion Status Pipeline for Temporality/Certainty

In [0]:
# --- NER models, each followed by a converter that produces chunks ---------
# Note: no chain below ends with a trailing "\"; a dangling continuation
# would silently glue the next physical line onto the statement.
ner_model_role = finance.NerModel.pretrained("finner_org_per_role_date", "en", "finance/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_role")

ner_converter_role = nlp.NerConverter()\
    .setInputCols(["sentence","token","ner_role"])\
    .setOutputCol("ner_chunk_role")

ner_model_alias = finance.NerModel.pretrained("finner_orgs_prods_alias","en","finance/models")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner_alias")

ner_converter_alias = nlp.NerConverter()\
    .setInputCols(["sentence","token","ner_alias"])\
    .setOutputCol("ner_chunk_alias")

# Merge both chunk columns into the single "ner_chunk" column.
chunk_merger = finance.ChunkMergeApproach()\
    .setInputCols("ner_chunk_alias", "ner_chunk_role")\
    .setOutputCol('ner_chunk')\
    .setMergeOverlapping(True)

# Assertion model applied to every merged chunk.
assertion = finance.AssertionDLModel.pretrained("finassertion_time", "en", "finance/models")\
    .setInputCols(["sentence", "ner_chunk", "embeddings"]) \
    .setOutputCol("assertion")\
    .setMaxSentLen(1200)

pipeline = Pipeline(stages=[
    generic_base_pipeline(),
    ner_model_role,
    ner_converter_role,
    ner_model_alias,
    ner_converter_alias,
    chunk_merger,
    assertion
])

# Fit on a one-row DataFrame holding an empty string, then wrap the fitted
# model in a LightPipeline for the annotation calls below.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = pipeline.fit(empty_data)

light_model = LightPipeline(model)

### Get Result

In [0]:
sample_text = ["""Sinopharm Group Co., Ltd. is a Chinese pharmaceutical company. The parent company of Sinopharm Group was Sinopharm Industrial Investment, a 51–49 joint venture of state-owned enterprise China National Pharmaceutical Group and civilian-run enterprise Fosun Pharmaceutical. Mr. Liu joined the Group since July 1992. He worked at Shanghai Pharmaceutical Station, China National Pharmaceutical Group Shanghai Co., Ltd., Shanghai Guoda Drug Chain Store Co., Ltd. and Sinopharm Holding Shenyang Co., Ltd.. Mr. Liu held senior management positions in the Company since January 2009, and is currently an executive Director, president and deputy secretary of Party Committee of the Company. Mr. Liu currently serves as the director of Sinopharm Investment and Sinopharm (CNMC LTD), the director of Sinopharm Accord, and also takes senior management positions in a number of subsidiaries. Sinopharm Group's subsidiary Sinopharm CNMC and Sinopharm Accord served as the A share counterpart of the company. However, the A share of Sinopharm Group itself was unlisted. COVID-19 Vaccine development the Sinopharm BIBP COVID-19 vaccine, also known as BBIBP-CorV, is one of two inactivated virus COVID-19 vaccines developed by Sinopharm. Peer-reviewed results published in JAMA of Phase III trials in United Arab Emirates and Bahrain showed that the BIBP-vaccine is 78.1% effective against symptomatic cases and 100% against severe cases (21 cases in vaccinated group vs. 95 cases in placebo group). In December 2020, the UAE previously announced interim results showing 86% efficacy. BIBP-CorV could be transported and stored at normal refrigerated temperatures. The BIBP-vaccine is being used in vaccination campaigns by certain countries in Asia, Africa, South America, and Europe. Sinopharm expects to produce one billion doses of the BIBP-vaccine in 2021. On 7 May 2021, the World Health Organization approved the vaccine for use in COVAX."""]

light_result = light_model.fullAnnotate(sample_text)[0]

# Pair each NER chunk with the assertion at the same position, then split the
# pairs into the three columns of the result table.
paired = list(zip(light_result['ner_chunk'], light_result['assertion']))
chunks = [chunk_ann.result for chunk_ann, _ in paired]
entities = [chunk_ann.metadata['entity'] for chunk_ann, _ in paired]
status = [assertion_ann.result for _, assertion_ann in paired]

df = pd.DataFrame({'chunks':chunks, 'entities':entities, 'assertion':status})

In [0]:
df

Unnamed: 0,chunks,entities,assertion
0,"Sinopharm Group Co., Ltd.",ORG,PRESENT
1,Sinopharm Group,ORG,PAST
2,Sinopharm Industrial Investment,ORG,PAST
3,China National Pharmaceutical Group,ORG,PAST
4,Fosun Pharmaceutical,ORG,PAST
5,Mr. Liu,PERSON,PAST
6,July 1992,DATE,PAST
7,Shanghai Pharmaceutical Station,ORG,PAST
8,China National Pharmaceutical Group Shanghai C...,ORG,PAST
9,"Shanghai Guoda Drug Chain Store Co., Ltd.",ORG,PAST


### Visualize Assertion Result

In [0]:
# Annotate the sample text again, then render chunks with their assertion
# labels.
light_result = light_model.fullAnnotate(sample_text)[0]

vis = viz.AssertionVisualizer()

assertion_html = vis.display(light_result, 'ner_chunk', 'assertion', return_html=True)
displayHTML(assertion_html)


In [0]:
def add_ners_assertion(rows, batch_size=10000):
    """Write assertion rows to Neo4j as NER nodes joined by LINKS edges.

    For every row, a node is merged for ``row.chunks`` and for
    ``row.entities`` and a ``LINKS`` relationship carrying ``row.assertion``
    is merged from the former to the latter. The work is delegated to
    ``update_data``, whose result is returned.

    NOTE(review): on creation the chunk node's ``type`` is set to the chunk
    text itself (``row.chunks``) — confirm this is intended.
    """
    cypher = '''
    //chunk1 NERs
    UNWIND $rows as row
    MERGE(n1:NER{name:row.chunks}) ON CREATE SET n1.type=row.chunks
    
    //chunk2 NERs
    MERGE(n2:NER{name:row.entities}) ON CREATE SET n2.type=row.entities

    //connect NERs
    WITH row, n1, n2
    MERGE (n1)-[:LINKS{relation:row.assertion}]->(n2)

    WITH n1
    MATCH (n1)
    RETURN count(*) as total  
    '''
    return update_data(cypher, rows, batch_size)

In [0]:
# Wipes the graph: matches every node and deletes it with its relationships.
delete_all_nodes = 'MATCH (n) DETACH DELETE n;'
conn.query(delete_all_nodes)

In [0]:
# Uniqueness constraint `ners` on the `name` property of NER nodes,
# created only if it does not exist yet.
const_ners = 'CREATE CONSTRAINT ners IF NOT EXISTS ON (n:NER) ASSERT n.name IS UNIQUE'
conn.query(const_ners)

In [0]:
# Write the chunk / entity / assertion table `df` to Neo4j.
add_ners_assertion(df)

![graph_assertion](/files/FINLEG/graph_assertion2.png)