# Named Entity Recognition

- Using Flair
- https://medium.com/@sapphireduffy/is-flair-a-suitable-alternative-to-spacy-6f55192bfb01


# 1)- Importing key modules

In [1]:
from __future__ import unicode_literals

In [2]:
import pandas as pd

In [3]:
from flair.models import SequenceTagger
model = SequenceTagger.load('ner-ontonotes-fast') #.load('ner')
from flair.data import Sentence

2020-03-15 11:36:32,505 loading file /Users/hassansherwani/.flair/models/en-ner-ontonotes-fast-v0.4.pt


# 2)- Loading Sample text 

In [4]:
text="The Gabba remained unbreached as Australia wrapped up a win within four days, kickstarting their summer on a bright note.Pakistan, who made Australia work hard for that win on Sunday (November 24), were bowled out eventually for 355, conceding a win by an innings and 5 runs in Brisbane.This was Pakistan's fifth consecutive loss away from home, while Australia maintained their unbeaten record at the Gabba, not having lost a Test here since 1988."

In [5]:
text

"The Gabba remained unbreached as Australia wrapped up a win within four days, kickstarting their summer on a bright note.Pakistan, who made Australia work hard for that win on Sunday (November 24), were bowled out eventually for 355, conceding a win by an innings and 5 runs in Brisbane.This was Pakistan's fifth consecutive loss away from home, while Australia maintained their unbeaten record at the Gabba, not having lost a Test here since 1988."

In [6]:
type(text)

str

# 3)- Convert to flair data type

In [7]:
s = Sentence(text)
type(s)

flair.data.Sentence

# 4)- Make Prediction

In [8]:
model.predict(s)

[Sentence: "The Gabba remained unbreached as Australia wrapped up a win within four days, kickstarting their summer on a bright note.Pakistan, who made Australia work hard for that win on Sunday (November 24), were bowled out eventually for 355, conceding a win by an innings and 5 runs in Brisbane.This was Pakistan's fifth consecutive loss away from home, while Australia maintained their unbeaten record at the Gabba, not having lost a Test here since 1988." - 74 Tokens]

# 5)- Entity Extraction

In [9]:
s.to_dict(tag_type='ner')

{'text': "The Gabba remained unbreached as Australia wrapped up a win within four days, kickstarting their summer on a bright note.Pakistan, who made Australia work hard for that win on Sunday (November 24), were bowled out eventually for 355, conceding a win by an innings and 5 runs in Brisbane.This was Pakistan's fifth consecutive loss away from home, while Australia maintained their unbeaten record at the Gabba, not having lost a Test here since 1988.",
 'labels': [],
 'entities': [{'text': 'Gabba',
   'start_pos': 4,
   'end_pos': 9,
   'type': 'PERSON',
   'confidence': 0.9650894999504089},
  {'text': 'Australia',
   'start_pos': 33,
   'end_pos': 42,
   'type': 'GPE',
   'confidence': 0.9948590993881226},
  {'text': 'four',
   'start_pos': 67,
   'end_pos': 71,
   'type': 'CARDINAL',
   'confidence': 0.7777217626571655},
  {'text': 'summer',
   'start_pos': 97,
   'end_pos': 103,
   'type': 'DATE',
   'confidence': 0.9377575516700745},
  {'text': 'Australia',
   'start_pos': 140,

# 6)-Showing as a table

In [10]:
data=s.to_dict(tag_type='ner')

In [11]:
given_text=data["text"]

In [12]:
entity=data["entities"]

In [13]:
for type in entity:
    print(type)

{'text': 'Gabba', 'start_pos': 4, 'end_pos': 9, 'type': 'PERSON', 'confidence': 0.9650894999504089}
{'text': 'Australia', 'start_pos': 33, 'end_pos': 42, 'type': 'GPE', 'confidence': 0.9948590993881226}
{'text': 'four', 'start_pos': 67, 'end_pos': 71, 'type': 'CARDINAL', 'confidence': 0.7777217626571655}
{'text': 'summer', 'start_pos': 97, 'end_pos': 103, 'type': 'DATE', 'confidence': 0.9377575516700745}
{'text': 'Australia', 'start_pos': 140, 'end_pos': 149, 'type': 'GPE', 'confidence': 0.9992710947990417}
{'text': 'Sunday (November 24),', 'start_pos': 176, 'end_pos': 197, 'type': 'DATE', 'confidence': 0.7761636177698771}
{'text': '355,', 'start_pos': 229, 'end_pos': 233, 'type': 'CARDINAL', 'confidence': 0.45639532804489136}
{'text': '5', 'start_pos': 268, 'end_pos': 269, 'type': 'CARDINAL', 'confidence': 0.9856798648834229}
{'text': "Pakistan's", 'start_pos': 296, 'end_pos': 306, 'type': 'WORK_OF_ART', 'confidence': 0.5797803401947021}
{'text': 'fifth', 'start_pos': 307, 'end_pos': 

In [14]:
pd.DataFrame(entity, columns=["text","type","confidence"])

Unnamed: 0,text,type,confidence
0,Gabba,PERSON,0.965089
1,Australia,GPE,0.994859
2,four,CARDINAL,0.777722
3,summer,DATE,0.937758
4,Australia,GPE,0.999271
5,"Sunday (November 24),",DATE,0.776164
6,355,CARDINAL,0.456395
7,5,CARDINAL,0.98568
8,Pakistan's,WORK_OF_ART,0.57978
9,fifth,ORDINAL,0.999485


# 7)- Submitting results as json image

In [15]:
df_ent=pd.DataFrame(entity, columns=["text","type","confidence"])

In [16]:
df_ent.to_json('entity_extraction.json')