![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Financial Graphs with Spark NLP and Neo4j

In [0]:
from johnsnowlabs import * 

In [0]:
# # Restart your runtime after running this cell.
# !pip install -q neo4j
# !pip install -q tqdm
# !pip install -q --user graphistry[all]

In [0]:
import os
import sys
import time
import json
import neo4j
import functools 
import graphistry
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import spatial
from neo4j import GraphDatabase
from neo4j import GraphDatabase, Driver

# Report the versions of the graph libraries in use.
for label, module in (('neo4j', neo4j), ('graphistry', graphistry)):
    print(label, module.__version__)

### Establishing a connection with Neo4j Sandbox and pygraphistry

In [0]:
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The driver is created eagerly in ``__init__``. Failures (both when creating
    the driver and when running a query) are reported with ``print`` instead of
    being raised, so callers must check for a ``None`` result from ``query``.

    The object can be used as a context manager so the driver is closed even
    when the body raises::

        with Neo4jConnection(uri, user, pwd) as conn:
            conn.query("RETURN 1")
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection settings and try to open the driver.

        Args:
            uri: Address passed to ``GraphDatabase.driver``.
            user: User name, first element of the ``auth`` tuple.
            pwd: Password, second element of the ``auth`` tuple.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None

        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: the driver stays None and `query` will refuse to run.
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return this connection for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppresses exceptions."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return all records as a list.

        Args:
            query: Query text handed to ``session.run``.
            parameters: Optional query parameters handed to ``session.run``.
            db: Optional database name; when None the driver's default session is used.

        Returns:
            A list of the records produced by the query, or None if opening the
            session or running the query raised (the error is printed).

        Raises:
            AssertionError: If the driver could not be created in ``__init__``.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None

        try:
            if db is not None:
                session = self.__driver.session(database=db)
            else:
                session = self.__driver.session()
            # Materialise the result while the session is still open.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response


In [0]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. Before running this cell set:
#   NEO4J_URI            e.g. 'bolt://<sandbox-host>:7687'
#   NEO4J_USER           optional, defaults to 'neo4j'
#   NEO4J_PASSWORD
#   GRAPHISTRY_USERNAME  create an account in graphistry to obtain these two
#   GRAPHISTRY_PASSWORD
import neo4j
from neo4j import GraphDatabase, Driver

uri = os.environ['NEO4J_URI']
user = os.environ.get('NEO4J_USER', 'neo4j')
pwd = os.environ['NEO4J_PASSWORD']

conn = Neo4jConnection(uri=uri, user=user, pwd=pwd)

# The same endpoint and credentials, in the mapping passed to graphistry as `bolt`.
NEO4J = {
    'uri': uri,
    'auth': (user, pwd)
}
graphistry.register(
    api=3,
    bolt=NEO4J,
    username=os.environ['GRAPHISTRY_USERNAME'],
    password=os.environ['GRAPHISTRY_PASSWORD'],
)


# Available graphs in ChunkMappers

In [0]:
# Company names to look up with the chunk mapper.
sec_results = [
    "Fig Publishing, Inc.",
    "AWA Group LP",
    "DatChat, Inc.",
    "iConsumer Corp.",
]
    

In [0]:
# Stage 1: read the raw "text" column and emit "document".
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: read "document" and emit "chunk" (input is not treated as an array).
chunkAssembler = (
    nlp.Doc2Chunk()
    .setInputCols("document")
    .setOutputCol("chunk")
    .setIsArray(False)
)

# Stage 3: pretrained mapper that reads "chunk" and emits "mappings".
# to change
CM = (
    finance.ChunkMapperModel()
    .pretrained("finmapper_edgar_companyname", "en", "finance/models")
    .setInputCols(["chunk"])
    .setOutputCol("mappings")
)

cm_pipeline = Pipeline(stages=[documentAssembler, chunkAssembler, CM])

In [0]:
# Fit the mapper pipeline on a one-row frame holding an empty string; the
# fitted pipeline is what later cells use to transform each company name.
text = [""]
test_data = spark.createDataFrame([text]).toDF("text")
fit_cm_pipeline = cm_pipeline.fit(test_data)

In [0]:
from collections import defaultdict
# import pandas as pd

# Collect one {relation: value} record per company instead of parallel column
# lists, so a relation that is missing for one company becomes NaN rather than
# making the columns ragged (which pd.DataFrame rejects).
records = []

for company in sec_results:
    company_df = spark.createDataFrame([[company]]).toDF("text")
    collected = fit_cm_pipeline.transform(company_df).collect()
    if not collected:
        continue

    record = {}
    for annotation in collected[0]['mappings']:
        # metadata['relation'] names the field; `result` is its mapped value.
        # setdefault keeps the first value if a relation is reported twice.
        record.setdefault(annotation.metadata['relation'], str(annotation.result))

    # No annotations means there is nothing to tabulate for this company.
    if record:
        records.append(record)

df = pd.DataFrame(records)

In [0]:
df

Unnamed: 0,name,sic,sic_code,irs_number,fiscal_year_end,state_location,state_incorporation,business_street,business_city,business_state,business_zip,business_phone,former_name,former_name_date,date,company_id
0,"Fig Publishing, Inc.",SERVICES-PREPACKAGED SOFTWARE [7372],7372,475336565,931,CA,DE,599 THIRD STREET,SAN FRANCISCO,CA,94107,(415) 689-5789,,,2017-01-23,1658966
1,AWA Group LP,INVESTMENT ADVICE [6282],6282,371785232,630,NC,DE,116 SOUTH FRANKLIN STREET,ROCKY MOUNT,NC,27804,952-446-6678,,,2017-01-23,1645148
2,"DatChat, Inc.",TELEGRAPH & OTHER MESSAGE COMMUNICATIONS [4822],4822,472502264,1231,NJ,NV,65 CHURCH STREET,NEW BRUNSWICK,NJ,8901,7323544768,"Dat Chat, Inc",20150722.0,2017-01-12,1648960
3,iConsumer Corp.,SERVICES-PERSONAL SERVICES [7200],7200,274286597,1231,FL,DE,19821 NW 2ND AVE SUITE 351,MIAMI GARDENS,FL,33169,8003726095,,,2017-01-18,1652350


### Generate csv for Neo4j

In [0]:
# Export the mapper table as CSV, without the index column, for loading into Neo4j.
df.to_csv('/dbfs/graphs.csv',index=False)

To use the CSV in Neo4j, there are several options described [here](https://neo4j.com/developer/kb/import-csv-locations/). This notebook uses the Google Sheets route: upload the CSV to Google Sheets, publish the sheet to the web, and use the generated link in the query below.

![6.png](/files/FINLEG/6.png)

### Visualization using Neo4j and graphistry

In [0]:
# Cypher import query. It loads the published CSV, keeps only the row whose
# name is 'DatChat, Inc.', and for every other column of that row:
#   - merges a `locations` node named after the cell value ('Unknown' when null),
#   - merges a `tokens` node named after the company,
#   - links company -> value with a relationship whose type is the column name
#     (created through apoc.create.relationship).
query = '''
    // create an array of maps, using the keys() function
    LOAD CSV WITH HEADERS FROM 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSSFwFkyFQgMwb_WyW8LYC0mN-7tFgt3jsiO3uq6D1atKG05w8gxSxQUytqpl1BUTbOs3ErjdO1eeWY/pub?gid=419410490&single=true&output=csv' as row
    with row.name AS company,
        [key IN keys(row) WHERE key<>'name' | {relType:key,location:row[key]}] as relTypeLocs

    WHERE row.name = 'DatChat, Inc.'

    unwind relTypeLocs AS relTypeLoc



    merge (l:locations {name:COALESCE(relTypeLoc.location,'Unknown')})



    merge (t:tokens {name:company})



    WITH l,t,



        relTypeLoc.relType AS relType



    // use apoc to create the rels



    CALL apoc.create.relationship(t,relType,{}, l) YIELD rel



    RETURN *
    '''

In [0]:
# Run the import query against Neo4j. `conn.query` returns the records as a
# list, or None if the query failed (the error is printed, not raised).
result = conn.query(query)

result

In [0]:
# Run the same Cypher through graphistry and inspect the edge columns it produced.
g = graphistry.cypher(query)

edge_columns = g._edges.columns
print(edge_columns)

g.plot()

![10.png](/files/FINLEG/10.png)

# Graph without Relation Extraction

In [0]:
# Read the raw "text" column and emit "document".
document_assembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Sentence splitting, with a blank line configured as an extra custom bound.
sentence_detector = (
    nlp.SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
    .setCustomBounds(["\n\n"])
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(True)
    .setMaxSentenceLength(512)
)

# to change
ner_model = (
    finance.NerModel.pretrained("finner_10k_summary", "en", "finance/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Group the token-level tags in "ner" into entity chunks.
ner_converter = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
    ]
)

# Fit on a single empty row to obtain the fitted pipeline.
empty_df = spark.createDataFrame([[""]]).toDF("text")
model = pipeline.fit(empty_df)

In [0]:
# Sample input: cover-page text of an annual report (PagerDuty, Inc.), held as
# a one-row, one-column ("text") frame.
data = spark.createDataFrame([["""ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES AND EXCHANGE ACT OF 1934
For the annual period ended January 31, 2021
or
TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from________to_______
Commission File Number: 001-38856
PAGERDUTY, INC.
(Exact name of registrant as specified in its charter)
Delaware
27-2793871
(State or other jurisdiction of
incorporation or organization)
(I.R.S. Employer
Identification Number)
600 Townsend St., Suite 200, San Francisco, CA 94103
(844) 800-3889
(Address, including zip code, and telephone number, including area code, of registrant’s principal executive offices)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class
Trading symbol(s)
Name of each exchange on which registered
Common Stock, $0.000005 par value,
PD
New York Stock Exchange"""]]).toDF("text")

# Annotate the sample with the fitted pipeline held in `model`.
result = model.transform(data)

In [0]:
# Zip each chunk's text with its metadata and explode to one row per chunk.
chunk_pairs = F.explode(
    F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)
).alias("cols")

(
    result
    .select(chunk_pairs)
    .select(
        F.expr("cols['0']").alias("ticker"),
        F.expr("cols['1']['entity']").alias("label"),
    )
    .show(50, truncate=False)
)

In [0]:
# from sparknlp_display import NerVisualizer

# Render the entity chunks of the first result row as HTML.
visualiser = viz.NerVisualizer()

first_row = result.collect()[0]
vis = visualiser.display(
    result=first_row,
    label_col='ner_chunk',
    document_col='document',
    return_html=True,
)

displayHTML(vis)

### Visualizing using graphistry

### Generate csv for Neo4j

In [0]:
# Zip each chunk's text with its metadata and explode to one row per chunk.
chunk_pairs = F.explode(
    F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)
).alias("cols")

# Keep the chunk text ("ticker") and its entity label, then pull to pandas.
df = (
    result
    .select(chunk_pairs)
    .select(
        F.expr("cols['0']").alias("ticker"),
        F.expr("cols['1']['entity']").alias("label"),
    )
    .toPandas()
)

In [0]:
# Export the entity table as CSV, without the index column. This is the same
# path the mapper table was written to earlier, so that file is overwritten.
df.to_csv('/dbfs/graphs.csv',index=False)

To use the CSV in Neo4j, there are several options described [here](https://neo4j.com/developer/kb/import-csv-locations/). This notebook uses the Google Sheets route: upload the CSV to Google Sheets, publish the sheet to the web, and use the generated link.

![11.png](/files/FINLEG/7.png)

### Visualizing the Nodes and Edges

In [0]:
# Edges run from each entity label to the chunk text; the node table is bound on 'label'.
label_ticker_graph = (
    graphistry
    .edges(df, 'label', 'ticker')
    .nodes(df, 'label')
)
label_ticker_graph.plot()

![12.png](/files/FINLEG/11.png)

# Graphs with NER and Relation Extraction

In [0]:
# Read the raw "text" column and emit "document".
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Optional sentence splitting (disabled; the stages below read "document" directly):
# sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl","xx")\
#         .setInputCols(["document"])\
#         .setOutputCol("sentence")

tokenizer = (
    nlp.Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols("document", "token")
    .setOutputCol("embeddings")
    .setMaxSentenceLength(512)
)

ner_model = (
    finance.NerModel.pretrained("finner_orgs_prods_alias", "en", "finance/models")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

# Group the token-level tags in "ner" into entity chunks.
ner_converter = (
    nlp.NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# ONLY NEEDED IF YOU WANT TO FILTER RELATION PAIRS OR SYNTACTIC DISTANCE
# pos_tagger = PerceptronModel()\
#     .pretrained("pos_clinical", "en", "clinical/models") \
#     .setInputCols(["document", "tokens"])\
#     .setOutputCol("pos_tags")
#
# dependency_parser = DependencyParserModel() \
#     .pretrained("dependency_conllu", "en") \
#     .setInputCols(["document", "pos_tags", "tokens"]) \
#     .setOutputCol("dependencies")
#
# # Set a filter on pairs of named entities which will be treated as relation candidates
# re_filter = RENerChunksFilter()\
#     .setInputCols(["ner_chunks", "dependencies"])\
#     .setOutputCol("re_ner_chunks")\
# #    .setMaxSyntacticDistance(7)\
# #    .setRelationPairs(['PARTY-ALIAS', 'DOC-PARTY', 'DOC-EFFDATE'])

# Relation extraction over the entity chunks; the prediction threshold is set to 0.5.
reDL = (
    finance.RelationExtractionDLModel.pretrained("finre_acquisitions_subsidiaries", "en", "finance/models")
    .setPredictionThreshold(0.5)
    .setInputCols(["ner_chunk", "document"])
    .setOutputCol("relations")
)

In [0]:
# Assemble the NER + relation-extraction pipeline.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
    reDL
])
# Optional stages, currently not part of the pipeline:
#   sentenceDetector, pos_tagger, dependency_parser, re_filter
# (kept as a comment: a bare string here would be echoed as the cell's output)

In [0]:
# Sample sentence describing an acquisition, used as input for relation extraction.
text='''
On January 25, 2019, the Company acquired all outstanding stock of SignifAI, Inc. ("SignifAI"), an event intelligence company specializing in artificial intelligence and machine learning.
'''

# One-row frame with a single "text" column; the pipeline is fitted on it.
data = spark.createDataFrame([[text]]).toDF("text")
model = pipeline.fit(data)


In [0]:
import pandas as pd

def get_relations_df(results, col='relations'):
  """Tabulate the relation annotations of the first annotated document.

  Args:
    results: Sequence of per-document annotation mappings; only ``results[0]``
      is read. An empty sequence yields an empty frame.
    col: Key under which the relation annotations are stored.

  Returns:
    A pandas DataFrame with one row per relation. ``relation`` comes from the
    annotation's ``result``; every other column is copied unchanged from the
    annotation's ``metadata`` entry of the same name.
  """
  # Every column after 'relation' is also the metadata key it is read from.
  columns = ['relation', 'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
             'entity2', 'entity2_begin', 'entity2_end', 'chunk2', 'confidence']
  metadata_keys = columns[1:]

  # Nothing was annotated: return an empty frame with the expected columns
  # instead of failing on results[0].
  if not results:
      return pd.DataFrame(columns=columns)

  rel_pairs = [
      (rel.result, *(rel.metadata[key] for key in metadata_keys))
      for rel in results[0][col]
  ]

  return pd.DataFrame(rel_pairs, columns=columns)

In [0]:
# Annotate the sample text with a LightPipeline built from the fitted model.
lmodel = LightPipeline(model)
results = lmodel.fullAnnotate(text)

# Tabulate the relations and drop the 'no_rel' rows.
all_relations = get_relations_df(results)
is_real_relation = all_relations['relation'] != 'no_rel'
rel_df = all_relations[is_real_relation]

print(rel_df.to_string(index=False))
print()

### Visualization using Neo4j and graphistry

In [0]:
def update_data(query, rows, batch_size = 10000, connection=None):
    """Send ``rows`` to Neo4j in batches and accumulate the reported totals.

    Each batch is passed to the query as the ``rows`` parameter (a list of
    row dicts). The first record returned for each batch must expose a
    ``total`` field, which is summed across batches.

    Args:
        query: Query text to run once per batch.
        rows: pandas DataFrame with the rows to upload.
        batch_size: Maximum number of rows per batch; must be positive.
        connection: Object with a ``query(query, parameters=...)`` method.
            Defaults to the module-level ``conn``.

    Returns:
        A dict with keys ``total``, ``batches`` and ``time`` (elapsed seconds).
        For empty ``rows`` the counts are zero.

    Raises:
        ValueError: If ``batch_size`` is not positive (it would never advance).
        RuntimeError: If a batch returns no records, e.g. because the
            connection reported a failed query by returning None.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if connection is None:
        connection = conn

    total = 0
    batch = 0
    start = time.time()
    result = {"total": 0, "batches": 0, "time": 0.0}

    for offset in range(0, len(rows), batch_size):
        # Positional slice, so a filtered (non-contiguous) index is handled too.
        records = rows.iloc[offset:offset + batch_size].to_dict('records')
        res = connection.query(query, parameters={'rows': records})
        if not res:
            raise RuntimeError(
                "Batch %d returned no result; the query may have failed." % batch
            )
        total += res[0]['total']
        batch += 1
        result = {"total":total, "batches":batch, "time":time.time()-start}
        print(result)

    return result

In [0]:
def add_ners_rels(rows, batch_size=10000):
    """Add NER nodes and the relationships between them to Neo4j in batches.

    For every row the query merges an ``NER`` node named ``row.chunk1`` (its
    ``type`` is set from ``row.entity1`` on creation), does the same for
    ``row.chunk2`` / ``row.entity2``, and merges a ``LINKS`` relationship
    between them that carries ``row.relation``.

    Returns whatever ``update_data`` returns for the upload.
    """
    cypher = '''
    //chunk1 NERs
    UNWIND $rows as row
    MERGE(n1:NER{name:row.chunk1}) ON CREATE SET n1.type=row.entity1

    //chunk2 NERs
    MERGE(n2:NER{name:row.chunk2}) ON CREATE SET n2.type=row.entity2

    //connect NERs
    WITH row, n1, n2
    MERGE (n1)-[:LINKS{relation:row.relation}]->(n2)
    RETURN count(*) as total  
    '''

    upload_summary = update_data(cypher, rows, batch_size)
    return upload_summary

In [0]:
# Upload the extracted relations to Neo4j using the default batch size.
add_ners_rels(rel_df)

![12.png](/files/FINLEG/12.png)

In [0]:
# Export the relations table as CSV, without the index column.
rel_df.to_csv('/dbfs/relations.csv',index=False)

In [0]:
# import pandas as pd
# Read the relations CSV back from disk and list its columns.
matches_df = pd.read_csv('/dbfs/relations.csv')
matches_df.columns


In [0]:
# Edges run from chunk1 to chunk2; the node table is bound on 'relation'.
# NOTE(review): the extra positional arguments 'entity1' and 'entity2' passed
# to .nodes() may be ignored, and 'relation' does not match the edge endpoint
# columns — confirm against the pygraphistry documentation.
relation_graph = (
    graphistry
    .edges(matches_df, 'chunk1', 'chunk2')
    .nodes(matches_df, 'relation', 'entity1', 'entity2')
)
relation_graph.plot()