# Named Entity Recognition; Working Codes

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import matplotlib
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\fross\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [4]:
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('NN')
nltk.help.upenn_tagset('VB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


In [5]:
from nltk.corpus import brown
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger
from nltk.tag import RegexpTagger

In [6]:
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags)

FreqDist({'NN': 13162, 'IN': 10616, 'AT': 8893, 'NP': 6866, ',': 5133, 'NNS': 5066, '.': 4452, 'JJ': 4392, 'CC': 2664, 'VBD': 2524, ...})

In [7]:
brown_tagged_sents = brown.tagged_sents(categories='news')
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]  # 90%
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]  # 10%
test_sent = brown.sents(categories='news')[0]
test_data[0]

[('But', 'CC'),
 ('in', 'IN'),
 ('all', 'ABN'),
 ('its', 'PP$'),
 ('175', 'CD'),
 ('years', 'NNS'),
 (',', ','),
 ('not', '*'),
 ('a', 'AT'),
 ('single', 'AP'),
 ('Negro', 'NP'),
 ('student', 'NN'),
 ('has', 'HVZ'),
 ('entered', 'VBN'),
 ('its', 'PP$'),
 ('classrooms', 'NNS'),
 ('.', '.')]

### DefaultTagger

In [8]:
default_tagger = nltk.DefaultTagger('NN')
display(default_tagger.tag(test_sent), default_tagger.evaluate(test_data))

[('The', 'NN'),
 ('Fulton', 'NN'),
 ('County', 'NN'),
 ('Grand', 'NN'),
 ('Jury', 'NN'),
 ('said', 'NN'),
 ('Friday', 'NN'),
 ('an', 'NN'),
 ('investigation', 'NN'),
 ('of', 'NN'),
 ("Atlanta's", 'NN'),
 ('recent', 'NN'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'NN'),
 ('``', 'NN'),
 ('no', 'NN'),
 ('evidence', 'NN'),
 ("''", 'NN'),
 ('that', 'NN'),
 ('any', 'NN'),
 ('irregularities', 'NN'),
 ('took', 'NN'),
 ('place', 'NN'),
 ('.', 'NN')]

0.1262832652247583

### UnigramTagger

In [9]:
unigram_tagger = UnigramTagger(train_data)
display(unigram_tagger.tag(test_sent), unigram_tagger.evaluate(test_data))

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

0.8121200039868434

### BigramTagger

In [10]:
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
display(bigram_tagger.tag(test_sent), bigram_tagger.evaluate(test_data))

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

0.8210904016744742

### TrigramTagger

In [11]:
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
display(trigram_tagger.tag(test_sent), trigram_tagger.evaluate(test_data))

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

0.8185986245390212

### RegexpTagger

In [12]:
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                      # nouns (default) 
    (r'.*ment$', 'NN'),                # i.e. wonderment 
    (r'.*ful$', 'JJ')                  # i.e. wonderful 
]
regexp_tagger = RegexpTagger(patterns)
display(regexp_tagger.tag(test_sent), regexp_tagger.evaluate(test_data))

[('The', 'NN'),
 ('Fulton', 'NN'),
 ('County', 'NN'),
 ('Grand', 'NN'),
 ('Jury', 'NN'),
 ('said', 'NN'),
 ('Friday', 'NN'),
 ('an', 'NN'),
 ('investigation', 'NN'),
 ('of', 'NN'),
 ("Atlanta's", 'NN$'),
 ('recent', 'NN'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', 'NN'),
 ('no', 'NN'),
 ('evidence', 'NN'),
 ("''", 'NN'),
 ('that', 'NN'),
 ('any', 'NN'),
 ('irregularities', 'VBZ'),
 ('took', 'NN'),
 ('place', 'NN'),
 ('.', 'NN')]

0.20253164556962025

### Combination of taggers

In [13]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


backoff = DefaultTagger('NN') 
tag = backoff_tagger(train_data,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
tag.evaluate(test_data) 

0.843317053722715

In [14]:
tag.tag(['John'])

[('John', 'NP')]

In [15]:
from nltk.tag import SequentialBackoffTagger
from nltk.corpus import names

In [16]:
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\fross\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [17]:
class NamesTagger(SequentialBackoffTagger):
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        self.name_set = set([n.lower() for n in names.words()])
            
    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        if word.lower() in self.name_set:
             return 'NNP'
        else:
             return None
            
nt = NamesTagger()
print(nt.tag(['Katya'])) 
print(nt.tag(['Adam'])) 
print(nt.tag(['Window']))  

[('Katya', 'NNP')]
[('Adam', 'NNP')]
[('Window', None)]


In [18]:
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [19]:
# nltk.download('universal_tagset')

In [20]:
nltk.corpus.brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [21]:
brown_news_tagged = nltk.corpus.brown.tagged_words(categories='adventure', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[('NOUN', 13354),
 ('VERB', 12274),
 ('.', 10929),
 ('DET', 8155),
 ('ADP', 7069),
 ('PRON', 5205),
 ('ADV', 3879),
 ('ADJ', 3364),
 ('PRT', 2436),
 ('CONJ', 2173),
 ('NUM', 466),
 ('X', 38)]

In [22]:
import requests
from bs4 import BeautifulSoup
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# document = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')

document = url_to_string('https://apnews.com/article/new-jersey-newark-city-113055436513a7ac0f1d875cd5913e01?utm_source=homepage&utm_medium=TopNews&utm_campaign=position_04')


nltk.pos_tag(nltk.word_tokenize(document))

[('FBI', 'NNP'),
 ('warns', 'NNS'),
 ('of', 'IN'),
 ("'broad", 'NN'),
 ("'", 'POS'),
 ('threat', 'NN'),
 ('to', 'TO'),
 ('synagogues', 'VB'),
 ('in', 'IN'),
 ('New', 'NNP'),
 ('Jersey', 'NNP'),
 ('|', 'NNP'),
 ('AP', 'NNP'),
 ('NewsAP', 'NNP'),
 ('NEWS', 'NNP'),
 ('ListenSectionsU.S', 'NNP'),
 ('.', '.'),
 ('NewsWorld', 'NNP'),
 ('NewsPoliticsSportsEntertainmentBusinessTechnologyHealthScienceOdditiesLifestylePhotographyVideosListenSectionsAP',
  'NNP'),
 ('Top', 'NNP'),
 ('NewsU.S', 'NNP'),
 ('.', '.'),
 ('NewsWorld', 'NNP'),
 ('NewsAfricaAsia', 'NNP'),
 ('PacificAustraliaEuropeLatin', 'NNP'),
 ('AmericaMiddle', 'NNP'),
 ('EastPoliticsPresident', 'NNP'),
 ('BidenMidterm', 'NNP'),
 ('electionsCongressSupreme', 'NN'),
 ('CourtSportsWorld', 'NNP'),
 ('SeriesWorld', 'NNP'),
 ('Cup', 'NNP'),
 ('2022NFLCollege', 'CD'),
 ('footballNBANHLEntertainmentFilm', 'NN'),
 ('ReviewsMoviesMusicTelevisionFashionBusinessU.S', 'NNP'),
 ('.', '.'),
 ('economyFinancial', 'JJ'),
 ('marketsVideosTechnologyHea

In [23]:
{(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

{('AP', 'ORGANIZATION'),
 ('Associated', 'ORGANIZATION'),
 ('Associated Press', 'ORGANIZATION'),
 ('CheckLifestyleReligionPress', 'ORGANIZATION'),
 ('Christian', 'GPE'),
 ('ConditionsPrivacyAll', 'ORGANIZATION'),
 ('CourtSportsWorld', 'ORGANIZATION'),
 ('David Porter', 'PERSON'),
 ('FBI', 'ORGANIZATION'),
 ('GeorgiaPrivate', 'ORGANIZATION'),
 ('GoogleKosovo', 'ORGANIZATION'),
 ('Gottheimer', 'PERSON'),
 ('HOBOKEN', 'ORGANIZATION'),
 ('Hoboken', 'GPE'),
 ('Islamist', 'ORGANIZATION'),
 ('Jersey', 'PERSON'),
 ('Jersey City', 'GPE'),
 ('Jersey City', 'PERSON'),
 ('Jersey Democrat', 'PERSON'),
 ('JerseyBy', 'ORGANIZATION'),
 ('Jewish', 'GPE'),
 ('Kanye West', 'PERSON'),
 ('Kyrie', 'PERSON'),
 ('MICHAEL', 'ORGANIZATION'),
 ('Mayor Steven Fulop', 'PERSON'),
 ('Molotov', 'GPE'),
 ('N.J.', 'GPE'),
 ('NBA', 'ORGANIZATION'),
 ('NEWSTop', 'ORGANIZATION'),
 ('New', 'GPE'),
 ('New Jersey', 'GPE'),
 ('New York City', 'GPE'),
 ('Newark', 'GPE'),
 ('NewsAP', 'ORGANIZATION'),
 ('NewsMidterm', 'ORGANIZAT

In [24]:
nltk.pos_tag(nltk.word_tokenize(document))

[('FBI', 'NNP'),
 ('warns', 'NNS'),
 ('of', 'IN'),
 ("'broad", 'NN'),
 ("'", 'POS'),
 ('threat', 'NN'),
 ('to', 'TO'),
 ('synagogues', 'VB'),
 ('in', 'IN'),
 ('New', 'NNP'),
 ('Jersey', 'NNP'),
 ('|', 'NNP'),
 ('AP', 'NNP'),
 ('NewsAP', 'NNP'),
 ('NEWS', 'NNP'),
 ('ListenSectionsU.S', 'NNP'),
 ('.', '.'),
 ('NewsWorld', 'NNP'),
 ('NewsPoliticsSportsEntertainmentBusinessTechnologyHealthScienceOdditiesLifestylePhotographyVideosListenSectionsAP',
  'NNP'),
 ('Top', 'NNP'),
 ('NewsU.S', 'NNP'),
 ('.', '.'),
 ('NewsWorld', 'NNP'),
 ('NewsAfricaAsia', 'NNP'),
 ('PacificAustraliaEuropeLatin', 'NNP'),
 ('AmericaMiddle', 'NNP'),
 ('EastPoliticsPresident', 'NNP'),
 ('BidenMidterm', 'NNP'),
 ('electionsCongressSupreme', 'NN'),
 ('CourtSportsWorld', 'NNP'),
 ('SeriesWorld', 'NNP'),
 ('Cup', 'NNP'),
 ('2022NFLCollege', 'CD'),
 ('footballNBANHLEntertainmentFilm', 'NN'),
 ('ReviewsMoviesMusicTelevisionFashionBusinessU.S', 'NNP'),
 ('.', '.'),
 ('economyFinancial', 'JJ'),
 ('marketsVideosTechnologyHea

In [25]:
document = '1 December 2021, the FARDC fought back at Goma and recaptured a position from M23 militia. The MONUSCO supplied their advances. Start was given to the operation!'

In [26]:
document = document.lower()
document

'1 december 2021, the fardc fought back at goma and recaptured a position from m23 militia. the monusco supplied their advances. start was given to the operation!'

In [27]:
{(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

set()

# spacy

In [28]:
# !pip install spacy
# !python -m spacy validate

In [29]:
import spacy
from spacy import displacy 
import en_core_web_sm

In [30]:
document = '1 December 2021, the FARDC fought back near Goma and recaptured a position from M23 militia. The MONUSCO supplied their advances. Start was given to the operation! As said the representative of the red cross, "kaboom!". unhcr confirmed the news. Monika allocated the assets near komanda village. floribert said that he was in the car. Sputnik news agency was on the spot. Kinshasa welcomed the delegation. In Kinshasa the delegation was welcomed by the minister. CCTV covered the incident while un people protested. Mai-Mai Yakutumba attacked Nyamilima. Mai-Mai Raia Mutomboki surrendered on Friday. UPC collected 10000 francs to participate in the elections next year. New Technoligies is going to import coltan from Katanga. The President Tchisekedi rejected their claims. Major Solana confirmed the crash of a helicopter near Kabalo. 300000 IDPs were registered in Kongolo. Twenty Twa were killed in the massacre. The conflict between Batende and Banunu tribes disrupted the elections in Mai-Ndombe province. Fifty people drowned in Ubangi river. Fifty people drowned in the Congo River last weekends. Gazprom refused to import fertolizers. CAID statistic was very important for the article of Kanakake Kagame. air france stopped all flights next week. ADF combatants are going to take part in the DDRRR process. FDLR militiamen escaped an FARDC ambush. Putin is dead. Obama wants coca-cola. France24 covered the story. Loran Nkunda escaped to Rwanda. Nyiragongo erupted in Virunga park. General Baka resigned. MM Yakutumba refused to negotiate. The PNC said it was not their business. ANR spokesperson Trafaret Kirzun announced new operation. Gallery Presidentielle Ancienne hosts the mission. Saint Petersburg was visited by Madonna. Manila under attack! Mai-Mai Malaika captured Kabambare.'

In [31]:
# document = url_to_string('https://apnews.com/article/new-jersey-newark-city-113055436513a7ac0f1d875cd5913e01?utm_source=homepage&utm_medium=TopNews&utm_campaign=position_04')

In [32]:
nlp = en_core_web_sm.load()
ny_bb = document
article = nlp(ny_bb)
displacy.render(article, jupyter=True, style='ent')

In [33]:
article

1 December 2021, the FARDC fought back near Goma and recaptured a position from M23 militia. The MONUSCO supplied their advances. Start was given to the operation! As said the representative of the red cross, "kaboom!". unhcr confirmed the news. Monika allocated the assets near komanda village. floribert said that he was in the car. Sputnik news agency was on the spot. Kinshasa welcomed the delegation. In Kinshasa the delegation was welcomed by the minister. CCTV covered the incident while un people protested. Mai-Mai Yakutumba attacked Nyamilima. Mai-Mai Raia Mutomboki surrendered on Friday. UPC collected 10000 francs to participate in the elections next year. New Technoligies is going to import coltan from Katanga. The President Tchisekedi rejected their claims. Major Solana confirmed the crash of a helicopter near Kabalo. 300000 IDPs were registered in Kongolo. Twenty Twa were killed in the massacre. The conflict between Batende and Banunu tribes disrupted the elections in Mai-Ndomb

In [34]:
type(article)

spacy.tokens.doc.Doc

In [35]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'senter',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [36]:
article.ents

(1 December 2021,
 FARDC,
 Goma,
 M23 militia,
 the red cross,
 unhcr,
 komanda village,
 floribert,
 Kinshasa,
 Kinshasa,
 CCTV,
 un,
 Mai-Mai Yakutumba,
 Nyamilima,
 Mai-Mai Raia Mutomboki,
 Friday,
 UPC,
 10000 francs,
 next year,
 New Technoligies,
 Katanga,
 Tchisekedi,
 Solana,
 Kabalo,
 300000,
 Kongolo,
 Twenty,
 Batende,
 Banunu,
 Mai-Ndombe,
 Fifty,
 Ubangi,
 Fifty,
 the Congo River,
 last weekends,
 CAID,
 Kanakake Kagame,
 air france,
 next week,
 DDRRR,
 FDLR,
 FARDC,
 Putin,
 Obama,
 coca-cola,
 Loran Nkunda,
 Rwanda,
 Nyiragongo,
 Virunga,
 Baka,
 MM Yakutumba,
 PNC,
 ANR,
 Trafaret Kirzun,
 Presidentielle Ancienne,
 Saint Petersburg,
 Madonna,
 Manila,
 Mai-Mai Malaika,
 Kabambare)

In [37]:
[(_.text, _.label_) for _ in article.ents]

[('1 December 2021', 'DATE'),
 ('FARDC', 'ORG'),
 ('Goma', 'PERSON'),
 ('M23 militia', 'ORG'),
 ('the red cross', 'ORG'),
 ('unhcr', 'ORG'),
 ('komanda village', 'GPE'),
 ('floribert', 'PERSON'),
 ('Kinshasa', 'ORG'),
 ('Kinshasa', 'GPE'),
 ('CCTV', 'ORG'),
 ('un', 'ORG'),
 ('Mai-Mai Yakutumba', 'ORG'),
 ('Nyamilima', 'GPE'),
 ('Mai-Mai Raia Mutomboki', 'PERSON'),
 ('Friday', 'DATE'),
 ('UPC', 'ORG'),
 ('10000 francs', 'MONEY'),
 ('next year', 'DATE'),
 ('New Technoligies', 'ORG'),
 ('Katanga', 'GPE'),
 ('Tchisekedi', 'PERSON'),
 ('Solana', 'PERSON'),
 ('Kabalo', 'GPE'),
 ('300000', 'CARDINAL'),
 ('Kongolo', 'GPE'),
 ('Twenty', 'CARDINAL'),
 ('Batende', 'ORG'),
 ('Banunu', 'ORG'),
 ('Mai-Ndombe', 'LOC'),
 ('Fifty', 'CARDINAL'),
 ('Ubangi', 'ORG'),
 ('Fifty', 'CARDINAL'),
 ('the Congo River', 'LOC'),
 ('last weekends', 'DATE'),
 ('CAID', 'ORG'),
 ('Kanakake Kagame', 'PERSON'),
 ('air france', 'ORG'),
 ('next week', 'DATE'),
 ('DDRRR', 'ORG'),
 ('FDLR', 'ORG'),
 ('FARDC', 'ORG'),
 ('Puti

In [38]:
# GPE - Geopolitical entity, i.e. countries, cities, states.

### spacy NER in French

In [39]:
# !python -m spacy download fr_core_news_sm

In [40]:
import fr_core_news_sm

In [41]:
document = "Une personne a été tuée par balles et cinq autres enlevées dans la nuit de lundi à mardi 26 avril lors d’une embuscade armée près du village d’Etchibe situé à 16 km au sud de la ville de Baraka (Sud-Kivu). En effet, un groupe de sept combattants présumés de Maï-Maï Yakutumba avait fait une intrusion dans trois maisons locales à Misisi-Centre vers 23h (21h TU). Dépêchée sur le lieu, la patrouille mixte FARDC-PNC a réussi à capturer l'un des auteurs avec son arme. Tandis que d'autres criminels ont réussi à s'échapper sans être remarqués. D'autres sources indiquent que le même groupe de criminels a enlevé cinq civils à bord d'un véhicule privé. Informé de cette situation, le commandant du 2202e régiment des FARDC, le colonel David Ipanga, dit avoir dépêché une patrouille de combat dans la zone touchée. Cependant, aucun autre détail n'a été fourni. Pour l’instant, des sources sécuritaires à Misisi signalent que le coupable est détenu par les FARDC pour un nouvel interrogatoire et une procédure judiciaire."

In [42]:
nlp = fr_core_news_sm.load()
ny_bb = document
article = nlp(ny_bb)
displacy.render(article, jupyter=True, style='ent')

## There are better models https://spacy.io/models/fr with a better performance!!!

https://www.youtube.com/watch?v=2XUhKpH0p4M

In [43]:
import spacy
import en_core_web_sm

In [44]:
document = "Tesla Inc is going to aquire twitter for $45 billion"

nlp = en_core_web_sm.load()
document_processed = nlp(document)

In [45]:
type(document_processed)

spacy.tokens.doc.Doc

In [46]:
document_processed.ents

(Tesla Inc, $45 billion)

In [47]:
document_processed.ents[0]

Tesla Inc

In [48]:
document_processed.ents[0].text

'Tesla Inc'

In [49]:
document_processed.ents[0].label_

'ORG'

In [50]:
spacy.explain(document_processed.ents[0].label_)

'Companies, agencies, institutions, etc.'

In [51]:
for entity in document_processed.ents:
    print(entity.text, '|', entity.label_, '|', spacy.explain(entity.label_))

Tesla Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [52]:
from spacy import displacy 

displacy.render(document_processed, jupyter=True, style='ent')

twitter was not recognized, let's capitalize it

In [53]:
document = "Tesla Inc is going to aquire Twitter for $45 billion"

nlp = en_core_web_sm.load()
document_processed = nlp(document)

In [54]:
displacy.render(document_processed, jupyter=True, style='ent')

In [55]:
document = "moscow was founded in 988 bc"

nlp = en_core_web_sm.load()
document_processed = nlp(document)

In [56]:
displacy.render(document_processed, jupyter=True, style='ent')

In [57]:
for entity in document_processed.ents:
    print(entity.text, '|', entity.label_, '|', spacy.explain(entity.label_))

moscow | GPE | Countries, cities, states
988 | CARDINAL | Numerals that do not fall under another type


In [58]:
type(document_processed)

spacy.tokens.doc.Doc

In [59]:
type(document_processed[0])

spacy.tokens.token.Token

In [60]:
document_processed[1:]

was founded in 988 bc

# transformers

https://huggingface.co/dslim/bert-base-NER?text=My+name+is+Clara+and+I+live+in+Berkeley%2C+California.

bert-base-NER is a fine-tuned BERT model that is ready to use for Named Entity Recognition and achieves state-of-the-art performance for the NER task. It has been trained to recognize four types of entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC).

In [61]:
# !pip install transformers

In [62]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [63]:
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [64]:
example = "Luis and Alice were killed in Blumenau"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.9992606, 'index': 1, 'word': 'Luis', 'start': 0, 'end': 4}, {'entity': 'B-PER', 'score': 0.9975923, 'index': 3, 'word': 'Alice', 'start': 9, 'end': 14}, {'entity': 'B-LOC', 'score': 0.9946719, 'index': 7, 'word': 'Blu', 'start': 30, 'end': 33}, {'entity': 'I-LOC', 'score': 0.99243355, 'index': 8, 'word': '##men', 'start': 33, 'end': 36}, {'entity': 'I-LOC', 'score': 0.75294447, 'index': 9, 'word': '##au', 'start': 36, 'end': 38}]


In [65]:
ner_results[2:]  # Blumenau

[{'entity': 'B-LOC',
  'score': 0.9946719,
  'index': 7,
  'word': 'Blu',
  'start': 30,
  'end': 33},
 {'entity': 'I-LOC',
  'score': 0.99243355,
  'index': 8,
  'word': '##men',
  'start': 33,
  'end': 36},
 {'entity': 'I-LOC',
  'score': 0.75294447,
  'index': 9,
  'word': '##au',
  'start': 36,
  'end': 38}]

In [66]:
example = "My name is Evgenii, and I live in Saint Petersburg"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.9828704, 'index': 4, 'word': 'E', 'start': 11, 'end': 12}, {'entity': 'B-PER', 'score': 0.68019027, 'index': 5, 'word': '##v', 'start': 12, 'end': 13}, {'entity': 'B-PER', 'score': 0.8987502, 'index': 6, 'word': '##gen', 'start': 13, 'end': 16}, {'entity': 'B-PER', 'score': 0.5416685, 'index': 7, 'word': '##ii', 'start': 16, 'end': 18}, {'entity': 'B-LOC', 'score': 0.9991812, 'index': 13, 'word': 'Saint', 'start': 34, 'end': 39}, {'entity': 'I-LOC', 'score': 0.9991375, 'index': 14, 'word': 'Petersburg', 'start': 40, 'end': 50}]


In [67]:
ner_results[:4]  # Evgenii

[{'entity': 'B-PER',
  'score': 0.9828704,
  'index': 4,
  'word': 'E',
  'start': 11,
  'end': 12},
 {'entity': 'B-PER',
  'score': 0.68019027,
  'index': 5,
  'word': '##v',
  'start': 12,
  'end': 13},
 {'entity': 'B-PER',
  'score': 0.8987502,
  'index': 6,
  'word': '##gen',
  'start': 13,
  'end': 16},
 {'entity': 'B-PER',
  'score': 0.5416685,
  'index': 7,
  'word': '##ii',
  'start': 16,
  'end': 18}]

In [68]:
example = "Nikola Tesla"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.99947244, 'index': 1, 'word': 'Nikola', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.99921703, 'index': 2, 'word': 'Te', 'start': 7, 'end': 9}, {'entity': 'I-PER', 'score': 0.9988186, 'index': 3, 'word': '##sla', 'start': 9, 'end': 12}]
