In [1]:
import conll
import spacy
import nltk
import pandas as pd
from spacy.tokens import Doc
import os
from sklearn.metrics import classification_report

ModuleNotFoundError: ignored

In [2]:
nlp = spacy.load("en_core_web_sm")
doc = conll.read_corpus_conll("train.txt", " ")

refs = [[(text,iob) for text,pos,syn_chunk,iob in sent if text != '-DOCSTART-']for sent in doc]
refs = [x for x in refs if x]

In [3]:
sentences = []

for sent in refs:
    tmp = []
    for i in range(len(sent)):
        tmp.append(sent[i][0])
    sentences.append(tmp)

print(sentences[0])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [4]:
def tag_Mapping(tag):
  conll_tags = {'PER', 'ORG', 'LOC', 'MISC'}
  if tag == 'PERSON':
    tag = 'PER'
  elif tag == 'GPE' or tag == 'LOC':
      tag = 'LOC'
  elif tag == 'NORP' or tag == 'ORG' or tag == 'FAC':
    tag = 'ORG'
  elif tag not in conll_tags:
    tag = 'MISC'
  
  return tag

In [5]:
hyps = []
for sent in sentences:
    doc = Doc(nlp.vocab, words=sent)
    for name, proc in nlp.pipeline:
        doc = proc(doc)
    for tok in doc:
      tok.ent_type_ = tag_Mapping(tok.ent_type_)
    
    hyps.append(([(t.text, t.ent_iob_) if t.ent_iob_ == 'O' else  (t.text,t.ent_iob_+ '-' + t.ent_type_) for t in doc]))

In [6]:
print("HYPS: ",hyps[0])
print("REFS: ",refs[0])

HYPS:  [('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-ORG'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('British', 'B-ORG'), ('lamb', 'O'), ('.', 'O')]
REFS:  [('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('British', 'B-MISC'), ('lamb', 'O'), ('.', 'O')]


In [149]:
def token_level_performance(refs,hyps):
#TOKEN LEVEL PERFORMANCE
  y_true = []
  y_pred = []
  for i in range(len(refs)):
      for j in range(len(refs[i])):
          y_true.append(refs[i][j][-1])
          y_pred.append(hyps[i][j][-1])

  print(classification_report(y_true, y_pred))

In [150]:
token_level_performance(refs,hyps)

              precision    recall  f1-score   support

       B-LOC       0.81      0.71      0.75      7140
      B-MISC       0.01      0.06      0.02      3438
       B-ORG       0.32      0.34      0.33      6321
       B-PER       0.76      0.66      0.71      6600
       I-LOC       0.59      0.63      0.61      1157
      I-MISC       0.03      0.20      0.06      1155
       I-ORG       0.45      0.59      0.51      3704
       I-PER       0.72      0.83      0.77      4528
           O       0.96      0.86      0.91    169578

    accuracy                           0.81    203621
   macro avg       0.52      0.54      0.52    203621
weighted avg       0.89      0.81      0.85    203621



In [8]:
#Chunk level performance
results = conll.evaluate(refs,hyps)

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
LOC,0.796,0.698,0.744,7140
PER,0.724,0.634,0.676,6600
ORG,0.286,0.301,0.293,6321
MISC,0.01,0.04,0.016,3438
total,0.346,0.477,0.401,23499


In [9]:
def group_entities(doc):
    ents = []
    groups = []
    for nc in doc.noun_chunks:
        tmp = []
        for ent in nc.ents:
            if ent not in ents:
                tmp.append(ent.label_)
                ents.append(ent)
        if len(tmp) > 0:
            groups.append(sorted(tmp))          
    return groups

In [8]:
#!pip install -U spacy
#!python -m spacy download en_core_web_sm

In [10]:
nlp = spacy.load("en_core_web_sm")
groups = []
for sent in sentences:
    mergedSent = ' '.join(sent)
    doc = nlp(mergedSent)
    groups.append(group_entities(doc)) 

In [47]:
freq_dict = {}
for s_group in groups:
  for g in s_group:
    if len(g) != 0:
      key = "{}".format("---".join(sorted(g)))
      if key not in freq_dict:
        freq_dict[key] = 1
      else:
        freq_dict[key] += 1

In [48]:
d_view = [(v,k) for k,v in freq_dict.items()]
d_view.sort(reverse=True)
for v,k in d_view:
    print("%s: %d" % (k,v))

GPE: 5385
PERSON: 4247
ORG: 3428
DATE: 2297
CARDINAL: 1665
NORP: 1503
ORDINAL: 591
QUANTITY: 345
PERCENT: 224
TIME: 208
NORP---PERSON: 193
MONEY: 182
LOC: 181
ORG---PERSON: 134
GPE---PERSON: 134
CARDINAL---PERSON: 103
EVENT: 101
CARDINAL---NORP: 80
GPE---ORG: 79
FAC: 76
PRODUCT: 60
CARDINAL---ORG: 55
WORK_OF_ART: 44
GPE---GPE: 40
CARDINAL---GPE: 35
PERSON---PERSON: 34
ORDINAL---PERSON: 32
GPE---NORP: 32
DATE---GPE: 31
ORG---ORG: 29
NORP---ORG: 29
DATE---TIME: 28
NORP---NORP: 27
DATE---ORG: 27
CARDINAL---ORDINAL: 27
LANGUAGE: 23
CARDINAL---CARDINAL: 22
DATE---NORP: 21
GPE---ORDINAL: 20
DATE---PERSON: 20
ORG---PRODUCT: 19
NORP---ORDINAL: 16
CARDINAL---DATE: 16
ORDINAL---ORG: 14
LAW: 14
GPE---ORG---PERSON: 11
GPE---LOC: 11
FAC---GPE: 10
EVENT---GPE: 10
DATE---ORDINAL: 10
DATE---EVENT: 8
DATE---DATE: 8
ORDINAL---ORDINAL: 6
MONEY---ORG: 6
CARDINAL---PRODUCT: 6
PERSON---TIME: 4
NORP---PERSON---PERSON: 4
DATE---PERCENT: 4
DATE---NORP---PERSON: 4
DATE---LOC: 4
DATE---FAC: 4
CARDINAL---ORG---PR

In [145]:
def exstend_entity_span(doc):
  ent_iob = [t.ent_iob_ for t in doc]
  ent_types = [t.ent_type_ for t in doc]
  
  for token in doc:
      if token.dep_ == 'compound' and token.head.ent_type_ != "":     # check the dependencies
          ent_types[token.i] = token.head.ent_type_                   # change the entity

          # put the IOB tag
          if token.head.i < token.i:
              ent_iob[token.i] = 'I'
          elif token.head.ent_iob_ == 'B':
              ent_iob[token.head.i] = 'I'
              ent_iob[token.i] = 'B'
          else:
              ent_iob[token.i] = 'B'

  return [(t.text, ei) if ei == 'O' else  (t.text,ei+ '-' + tag_Mapping(et)) for t, ei, et in zip(doc,ent_iob,ent_types)]

In [146]:
results = []
for sent in sentences:
    mergedSent = ' '.join(sent)
    doc = nlp(mergedSent)
    results.append(exstend_entity_span(doc))

In [153]:
token_level_performance(refs,results)

              precision    recall  f1-score   support

       B-LOC       0.67      0.61      0.64      7140
      B-MISC       0.02      0.07      0.03      3438
       B-ORG       0.27      0.35      0.30      6321
       B-PER       0.52      0.53      0.52      6600
       I-LOC       0.56      0.54      0.55      1157
      I-MISC       0.02      0.14      0.04      1155
       I-ORG       0.40      0.38      0.39      3704
       I-PER       0.57      0.60      0.59      4528
           O       0.94      0.83      0.88    169578

    accuracy                           0.77    203621
   macro avg       0.44      0.45      0.44    203621
weighted avg       0.85      0.77      0.81    203621

