In [18]:
import regex as re
import spacy 
from spacy.tokens import Span

In [19]:
text_ = "how you doing Ma Aleem and Mr.Khan ? 'iam good' what about you Mr.Johnson i heard you flew to U.S.A for a month"

In [20]:
nlp = spacy.blank("en")
doc = nlp(text_)
print(doc.ents)

()


In [21]:
pattern = "Mr\.[A-z]+"

original_ents = list(doc.ents)
mwt_ents = []

for match in re.finditer(pattern,doc.text):
    start , end = match.span()
    print(f"start {start} and end {end} for match")
    span = doc.char_span(start,end)
    if span is not None:
        print(f"span for {span}")
        mwt_ents.append((span.start,span.end,span.text))
for ent in mwt_ents:
    print(f"ent in mwt_ents {ent}")
    start,end,name = ent
    per_ent = Span(doc,start,end,label="PERSON")
    original_ents.append(per_ent)
doc.ents = original_ents
print('\n')
for ent in doc.ents:
    print(ent.text,ent.label_)

start 27 and end 34 for match
span for Mr.Khan
start 63 and end 73 for match
span for Mr.Johnson
ent in mwt_ents (6, 8, 'Mr.Khan')
ent in mwt_ents (16, 18, 'Mr.Johnson')


Mr.Khan PERSON
Mr.Johnson PERSON


In [22]:
nlp.analyze_pipes()

{'summary': {}, 'problems': {}, 'attrs': {}}

In [27]:
from spacy.util import filter_spans
# filter_span:if there are overlapig tokens like one token from 8-10 and other from 9 -12 then the biggest token is considered
from spacy.language import Language
@Language.component("mr_ner")
def mr_ner(doc):
    pattern = "Mr\.[A-Z]\w+"
    original_ents = list(doc.ents)
    mwt_ents = []

    for match in re.finditer(pattern,doc.text):
        start , end = match.span()
        span = doc.char_span(start,end)
        if span is not None:
            mwt_ents.append((span.start,span.end,span.text))
    for ent in mwt_ents:
        start,end,name = ent
        per_ent = Span(doc,start,end,label="PERSON")
        original_ents.append(per_ent)
    filtered = filter_spans(original_ents)
    doc.ents = filtered
    return doc

In [28]:
nlp2 = spacy.blank("en")
nlp2.add_pipe("mr_ner")

<function __main__.mr_ner(doc)>

In [29]:
doc2 = nlp2(text_)

In [30]:
for ent in doc2.ents:
    print(ent.text,ent.label_)

Mr.Khan PERSON
Mr.Johnson PERSON


In [32]:
nlp2.analyze_pipes()

{'summary': {'mr_ner': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'mr_ner': []},
 'attrs': {}}

# Applied spacy financial NER

In [39]:
import spacy
import pandas as pd

In [40]:
df = pd.read_csv('stocks.tsv',sep='\t')

In [41]:
df.head()

Unnamed: 0,Symbol,CompanyName,Industry,MarketCap
0,A,Agilent Technologies,Life Sciences Tools & Services,53.65B
1,AA,Alcoa,Metals & Mining,9.25B
2,AAC,Ares Acquisition,Shell Companies,1.22B
3,AACG,ATA Creativity Global,Diversified Consumer Services,90.35M
4,AADI,Aadi Bioscience,Pharmaceuticals,104.85M


In [42]:
symbols = df.Symbol.tolist()
companies = df.CompanyName.to_list()
print(symbols[:10])

['A', 'AA', 'AAC', 'AACG', 'AADI', 'AAIC', 'AAL', 'AAMC', 'AAME', 'AAN']


In [43]:
df2 = pd.read_csv("indexes.tsv",sep="\t")
df2.head(5)

Unnamed: 0,IndexName,IndexSymbol
0,Dow Jones Industrial Average,DJIA
1,Dow Jones Transportation Average,DJT
2,Dow Jones Utility Average Index,DJU
3,NASDAQ 100 Index (NASDAQ Calculation),NDX
4,NASDAQ Composite Index,COMP


In [44]:
indexes= df2.IndexName.to_list()
index_symbol = df2.IndexSymbol.to_list()
print(indexes[:10])

['Dow Jones Industrial Average', 'Dow Jones Transportation Average', 'Dow Jones Utility Average Index', 'NASDAQ 100 Index (NASDAQ Calculation)', 'NASDAQ Composite Index', 'NYSE Composite Index', 'S&P 500 Index', 'S&P 400 Mid Cap Index', 'S&P 100 Index', 'NASDAQ Computer Index']


In [45]:
df3 = pd.read_csv("stock_exchanges.tsv",sep="\t")
df3.head()

Unnamed: 0,BloombergExchangeCode,BloombergCompositeCode,Country,Description,ISOMIC,Google Prefix,EODcode,NumStocks
0,AF,AR,Argentina,Bolsa de Comercio de Buenos Aires,XBUE,,BA,12
1,AO,AU,Australia,National Stock Exchange of Australia,XNEC,,,1
2,AT,AU,Australia,Asx - All Markets,XASX,ASX,AU,875
3,AV,,Austria,Wiener Boerse Ag,XWBO,VIE,VI,38
4,BI,,Bahrain,Bahrain Bourse,XBAH,,,4


In [46]:
exchanges = df3.Description.to_list() + df3.Description.to_list() + df3["Google Prefix"].tolist()
print(exchanges[:5])

['Bolsa de Comercio de Buenos Aires', 'National Stock Exchange of Australia', 'Asx - All Markets', 'Wiener Boerse Ag', 'Bahrain Bourse']


In [50]:
stops = ["two"]
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
patterns = []

for symbol in symbols:
    patterns.append({"label":"STOCK","pattern":symbol})
    for l in letters:
        patterns.append({"label":"STOCK","pattern":symbol+f".{l}"})
for company in companies:
    if company not in stops:
        patterns.append({"label":"COMPANY","pattern":company})

for index in indexes:
    patterns.append({"label":"INDEX","pattern":index})
    words = index.split()
    patterns.append({"label":"INDEX","pattern":" ".join(words[:2])})
for symbol in index_symbol:
    patterns.append({"label":"SYMBOL","pattern":symbol})
    
for exc in exchanges:
    patterns.append({"label":"EXCHANGE","pattern":exc})    

patterns.append({"label":"PERCENTAGE" , 
                 "pattern":[{"LIKE_NUM":True,"OP":"+"},
                            {"ORTH":".","OP":"*"},
                            {"LIKE_NUM":True,"OP":"*"},
                            {"ORTH":"%"}]})


ruler.add_patterns(patterns)

In [53]:
patterns[-3:]

[{'label': 'EXCHANGE', 'pattern': nan},
 {'label': 'EXCHANGE', 'pattern': nan},
 {'label': 'PERCENTAGE',
  'pattern': [{'LIKE_NUM': True, 'OP': '+'},
   {'ORTH': '.', 'OP': '*'},
   {'LIKE_NUM': True, 'OP': '*'},
   {'ORTH': '%'}]}]

In [334]:
text_ = """Sept 10 (Reuters) - Wall Street's main indexes were subdued on Friday as signs of higher inflation and a drop in Apple shares following an unfavorable court ruling offset expectations of an easing in U.S.-China tensions.
Data earlier in the day showed U.S. producer prices rose solidly in August, leading to the biggest annual gain in nearly 11 years and indicating that high inflation was likely to persist as the pandemic pressures supply chains.
"Today's data on wholesale prices should be eye-opening for the Federal Reserve, as inflation pressures still don't appear to be easing and will likely continue to be felt by the consumer in the coming months," said Charlie Ripley, senior investment strategist for Allianz Investment Management.
Apple Inc (AAPL.O) fell 2.7% following a U.S. court ruling in "Fortnite" creator Epic Games' antitrust lawsuit that stroke down some of the iPhone maker's restrictions on how developers can collect payments in apps.
Apple shares were set for their worst single-day fall since May this year, weighing on the Nasdaq (.IXIC) and the S&P 500 technology sub-index (.SPLRCT), which fell 0.1%.
Sentiment also took a hit from Cleveland Federal Reserve Bank President Loretta Mester's comments that she would still like the central bank to begin tapering asset purchases this year despite the weak August jobs report. 
Investors have paid keen attention to the labor market and data hinting towards higher inflation recently for hints on a timeline for the Federal Reserve to begin tapering its massive bond-buying program.
The S&P 500 has risen around 19% so far this year on support from dovish central bank policies and re-opening optimism, but concerns over rising coronavirus infections and accelerating inflation have lately stalled its advance.
The three main U.S. indexes got some support on Friday from news of a phone call between U.S. President Joe Biden and Chinese leader Xi Jinping that was taken as a positive sign which could bring a thaw in ties between the world's two most important trading partners.
At 1:01 p.m. ET, the Dow Jones Industrial Average (.DJI) was up 12.24 points, or 0.04%, at 34,891.62, the S&P 500 (.SPX) was up 2.83 points, or 0.06%, at 4,496.11, and the Nasdaq Composite (.IXIC) was up 12.85 points, or 0.08%, at 15,261.11.
Six of the eleven S&P 500 sub-indexes gained, with energy (.SPNY), materials (.SPLRCM) and consumer discretionary stocks (.SPLRCD) rising the most.
U.S.-listed Chinese e-commerce companies Alibaba and JD.com , music streaming company Tencent Music (TME.N) and electric car maker Nio Inc (NIO.N) all gained between 0.7% and 1.4%
Grocer Kroger Co (KR.N) dropped 7.1% after it said global supply chain disruptions, freight costs, discounts and wastage would hit its profit margins.
Advancing issues outnumbered decliners by a 1.12-to-1 ratio on the NYSE and by a 1.02-to-1 ratio on the Nasdaq.
The S&P index recorded 14 new 52-week highs and three new lows, while the Nasdaq recorded 49 new highs and 38 new lows."""

In [335]:
doc = nlp(text_)
for ent in doc.ents:
    print(ent.text,ent.label_)

Apple COMPANY
Apple COMPANY
AAPL.O STOCK
2.7% PERCENTAGE
Apple COMPANY
Nasdaq COMPANY
S&P 500 INDEX
0.1% PERCENTAGE
S&P 500 INDEX
19% PERCENTAGE
ET STOCK
Dow Jones Industrial Average INDEX
0.04% PERCENTAGE
S&P 500 INDEX
0.06% PERCENTAGE
Nasdaq COMPANY
0.08% PERCENTAGE
S&P 500 INDEX
JD.com COMPANY
TME.N STOCK
NIO.N STOCK
0.7% PERCENTAGE
1.4% PERCENTAGE
Kroger COMPANY
KR.N STOCK
7.1% PERCENTAGE
NYSE EXCHANGE
Nasdaq COMPANY
Nasdaq COMPANY


In [340]:
spacy.displacy.render(doc,style="ent")