In [1]:
import nltk

In [23]:
doc = """Hi, how are you Martin? What's wrong?"""
nltk.sent_tokenize(doc)

['Hi, how are you Martin?', "What's wrong?"]

In [24]:
nltk.word_tokenize(doc)

['Hi', ',', 'how', 'are', 'you', 'Martin', '?', 'What', "'s", 'wrong', '?']

In [25]:
pos_tokens = nltk.pos_tag(doc)

In [26]:
nltk.ne_chunk(pos_tokens, binary=True),

(Tree('S', [('H', 'NNP'), ('i', 'NN'), (',', ','), (' ', 'NNP'), ('h', 'VBZ'), ('o', 'JJ'), ('w', 'NN'), (' ', 'VBD'), ('a', 'DT'), ('r', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('y', 'NNP'), ('o', 'MD'), ('u', 'VB'), (' ', 'NNP'), ('M', 'NNP'), ('a', 'DT'), ('r', 'NN'), ('t', 'NN'), ('i', 'NN'), ('n', 'RB'), ('?', '.'), (' ', 'JJ'), ('W', 'NNP'), ('h', 'NN'), ('a', 'DT'), ('t', 'NN'), ("'", 'POS'), ('s', 'NN'), (' ', 'NN'), ('w', 'NN'), ('r', 'NN'), ('o', 'NN'), ('n', 'NN'), ('g', 'NN'), ('?', '.')]),)

In [28]:
for sent in nltk.sent_tokenize(doc):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))

GPE Hi
PERSON Martin


In [15]:
from sklearn.datasets import fetch_20newsgroups

In [21]:
news = fetch_20newsgroups()
news.data[1]

"From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"

In [31]:
from tqdm import tqdm_notebook as tqdm

In [32]:
entities = []
for sent in tqdm(nltk.sent_tokenize('\n'.join(news.data[:100]))):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            entities.append(chunk.label() + ': ' + ' '.join(c[0] for c in chunk))

HBox(children=(IntProgress(value=0, max=1470), HTML(value='')))




In [35]:
entities[0]

'ORGANIZATION: University'

In [None]:
# download https://ronan.collobert.com/senna/download.html
from nltk.tag import SennaTagger
tagger = SennaTagger('/usr/share/senna-v3.0')
tagger.tag('What is the airspeed of an unladen swallow ?'.split())

In [36]:
doc = """
For years, Microsoft Corporation CEO Bill Gates railed against the economic philosophy
of open-source software with Orwellian fervor, denouncing its communal licensing as a
”cancer”that stifled technological innovation.
Today, Microsoft claims to ”love”the open-source concept, by which software code is
made public to encourage improvement and development by outside programmers. Gates
himself says Microsoft will gladly disclose its crown jewels – the coveted code behind the
Windows operating system – to select customers.
”We can be open source. We love the concept of shared source,”said Bill Veghte, a
company VP. ”That’s a super-important shift for us in terms of code access.“
Richard Stallman, founder of the Free Software Foundation, countered saying..
""".strip()

In [37]:
label = {
    'organizations': ['Microsoft Corporation' , 'Microsoft' , 'Microsoft' , 'Free Software Foundation'],
    'positions': ['CEO', 'VP', 'founder'],
    'persons': ['Bill Gates' , 'Gates', 'Bill Veghte' , 'Richard Stallman'],
}
pred = {
    'organizations': ['Microsoft Corporation' , 'Free Software Foundation'],
    'positions': ['CEO', 'VP', 'founder'],
    'persons': ['Bill Gates' , 'Orwellian', 'Windows' , 'Bill Veghte', 'Microsoft VP', 'Richard Stallman'],
}

In [38]:
for c in label.keys():
    p = set(pred[c])
    l = set(label[c])
    
    prec = len(p & l) / len(p)
    rec = len(p & l) / len(l)
    print(f'Precision [{c}]: {prec}')
    print(f'Recall [{c}]: {rec}')
    print(f'F1 [{c}]: {2 * prec * rec / (prec + rec)}')
    print('-------------')

Precision [organizations]: 1.0
Recall [organizations]: 0.6666666666666666
F1 [organizations]: 0.8
-------------
Precision [positions]: 1.0
Recall [positions]: 1.0
F1 [positions]: 1.0
-------------
Precision [persons]: 0.5
Recall [persons]: 0.75
F1 [persons]: 0.6
-------------


In [40]:
import numpy as np

precisions = []
tp = []
pred_p = []

for c in label.keys():
    p = set(pred[c])
    l = set(label[c])
    
    prec = len(p & l) / len(p)
    precisions.append(prec)
    tp.append(len(p & l))
    pred_p.append(len(p))
    
print('Macro-Average precision:', np.mean(precisions))
print('Micro-Average precision:', np.sum(tp) / np.sum(pred_p))

Macro-Average precision: 0.8333333333333334
Micro-Average precision: 0.7272727272727273
