# NER

## NLTK

In [1]:
import re
import pandas as pd
import nltk
# nltk.download()

In [2]:
def parse_document(document):
    document = re.sub('\n', ' ', document)
    # isinstance() 函数来判断一个对象是否是一个已知的类型，类似 type()。
    # isinstance(object, classinfo)其中object -- 实例对象。classinfo -- 可以是直接或间接类名、基本类型或者由它们组成的元组。返回布尔值
    if isinstance(document, str):
        document = document
    else:
        raise ValueError('Document is not string!')
    document = document.strip() # 去掉首尾的空格
    sentences = nltk.sent_tokenize(document) # 按句子分割(返回一个句子列表)  nltk.word_tokenize(sentence) 分词
    sentences = [sentence.strip() for sentence in sentences] # 把句子列表中的每个句子去掉首尾空格
    return sentences # 句子列表(列表中的每个元素还是句子)

In [3]:
# sample document
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, 
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its 
membership now comprises 211 national associations. Member countries must each also be members of one of 
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America 
and the Caribbean, Oceania, and South America.
"""

In [4]:
# tokenize sentences
# 每个句子为一个str类型
sentences = parse_document(text)
sentences

['FIFA was founded in 1904 to oversee international competition among the national associations of Belgium,  Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland.',
 'Headquartered in Zürich, its  membership now comprises 211 national associations.',
 'Member countries must each also be members of one of  the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America  and the Caribbean, Oceania, and South America.']

In [5]:
# 分词，返回值为列表的列表，内层列表中的每个元素是每个句子的分词[[第一个句子的各个分词], [第二个句子的各个分词],'''''']
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 
tokenized_sentences[0][:5]

['FIFA', 'was', 'founded', 'in', '1904']

In [6]:
# 标记句子并使用nltk的NE块，返回值同分词返回值
# 列表的列表，最内层元素为一个由分词和词性构成的元组，然后一个句子的所有分词及词性的元组构成一个小列表，每个句子的小列表构成一个大列表
# pos_tag方法（part-of-speech tagging词性标注）
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
tagged_sentences

[[('FIFA', 'NNP'),
  ('was', 'VBD'),
  ('founded', 'VBN'),
  ('in', 'IN'),
  ('1904', 'CD'),
  ('to', 'TO'),
  ('oversee', 'VB'),
  ('international', 'JJ'),
  ('competition', 'NN'),
  ('among', 'IN'),
  ('the', 'DT'),
  ('national', 'JJ'),
  ('associations', 'NNS'),
  ('of', 'IN'),
  ('Belgium', 'NNP'),
  (',', ','),
  ('Denmark', 'NNP'),
  (',', ','),
  ('France', 'NNP'),
  (',', ','),
  ('Germany', 'NNP'),
  (',', ','),
  ('the', 'DT'),
  ('Netherlands', 'NNP'),
  (',', ','),
  ('Spain', 'NNP'),
  (',', ','),
  ('Sweden', 'NNP'),
  (',', ','),
  ('and', 'CC'),
  ('Switzerland', 'NNP'),
  ('.', '.')],
 [('Headquartered', 'VBN'),
  ('in', 'IN'),
  ('Zürich', 'NNP'),
  (',', ','),
  ('its', 'PRP$'),
  ('membership', 'NN'),
  ('now', 'RB'),
  ('comprises', 'VBZ'),
  ('211', 'CD'),
  ('national', 'JJ'),
  ('associations', 'NNS'),
  ('.', '.')],
 [('Member', 'NNP'),
  ('countries', 'NNS'),
  ('must', 'MD'),
  ('each', 'DT'),
  ('also', 'RB'),
  ('be', 'VB'),
  ('members', 'NNS'),
  ('of', 

In [7]:
ners = nltk.ne_chunk(tagged_sentences[0])
type(ners)
[ners]

nltk.tree.Tree

[Tree('S', [Tree('ORGANIZATION', [('FIFA', 'NNP')]), ('was', 'VBD'), ('founded', 'VBN'), ('in', 'IN'), ('1904', 'CD'), ('to', 'TO'), ('oversee', 'VB'), ('international', 'JJ'), ('competition', 'NN'), ('among', 'IN'), ('the', 'DT'), ('national', 'JJ'), ('associations', 'NNS'), ('of', 'IN'), Tree('GPE', [('Belgium', 'NNP')]), (',', ','), Tree('GPE', [('Denmark', 'NNP')]), (',', ','), Tree('GPE', [('France', 'NNP')]), (',', ','), Tree('GPE', [('Germany', 'NNP')]), (',', ','), ('the', 'DT'), Tree('GPE', [('Netherlands', 'NNP')]), (',', ','), Tree('GPE', [('Spain', 'NNP')]), (',', ','), Tree('GPE', [('Sweden', 'NNP')]), (',', ','), ('and', 'CC'), Tree('GPE', [('Switzerland', 'NNP')]), ('.', '.')])]

In [8]:
# tags是句子词性标注后的结果，同样是句子级
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences] 
# 三个句子对应三棵树
len(ne_chunked_sents)
ne_chunked_sents

3

[Tree('S', [Tree('ORGANIZATION', [('FIFA', 'NNP')]), ('was', 'VBD'), ('founded', 'VBN'), ('in', 'IN'), ('1904', 'CD'), ('to', 'TO'), ('oversee', 'VB'), ('international', 'JJ'), ('competition', 'NN'), ('among', 'IN'), ('the', 'DT'), ('national', 'JJ'), ('associations', 'NNS'), ('of', 'IN'), Tree('GPE', [('Belgium', 'NNP')]), (',', ','), Tree('GPE', [('Denmark', 'NNP')]), (',', ','), Tree('GPE', [('France', 'NNP')]), (',', ','), Tree('GPE', [('Germany', 'NNP')]), (',', ','), ('the', 'DT'), Tree('GPE', [('Netherlands', 'NNP')]), (',', ','), Tree('GPE', [('Spain', 'NNP')]), (',', ','), Tree('GPE', [('Sweden', 'NNP')]), (',', ','), ('and', 'CC'), Tree('GPE', [('Switzerland', 'NNP')]), ('.', '.')]),
 Tree('S', [('Headquartered', 'VBN'), ('in', 'IN'), Tree('GPE', [('Zürich', 'NNP')]), (',', ','), ('its', 'PRP$'), ('membership', 'NN'), ('now', 'RB'), ('comprises', 'VBZ'), ('211', 'CD'), ('national', 'JJ'), ('associations', 'NNS'), ('.', '.')]),
 Tree('S', [('Member', 'NNP'), ('countries', 'NNS

In [9]:
type(ne_chunked_sents)
type(ne_chunked_sents[0])
type(ne_chunked_sents[0][0])
hasattr(ne_chunked_sents[0][14], 'label')

list

nltk.tree.Tree

nltk.tree.Tree

True

In [10]:
x = ('GPE', [('B', 'C')])
hasattr(x, 'label')

False

In [11]:
# 提取所有NE
named_entities = []
for ne_tagged_sentence in ne_chunked_sents:    
    for tagged_tree in ne_tagged_sentence:
        # print(type(tagged_tree))
        # 只有('ORGANIZATION', [('FIFA', 'NNP')])这种类型才可以提取NE标签
        if hasattr(tagged_tree, 'label'): # hasattr() 函数用于判断对象是否包含对应的属性
            # hasattr(object, name) object--对象    name--字符串，属性名
            entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) 
            # ()内返回一个元组，比如第一次为('FIFA', 'NNP')，经过 ' '.join(tuple) 后变成str，通过leaves()得到NE的名字
            entity_type = tagged_tree.label() # 通过label()得到NE的类别 
            named_entities.append((entity_name, entity_type)) # 把(NE名字,NE类别)组成的元组作为元素添加到named_entities列表中
            # 删除列表中重复的元素，得到最终的NE列表
            named_entities = list(set(named_entities))
# ne_chunked_sents[0][0].leaves()[0] # NE的类别 
# ne_chunked_sents[0][0].label() # NE的名字
# type(entity_name) 
# named_entities # 最终的NE列表

In [12]:
# 将NE存储在DataFrame中(list转化成Dataframe)
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])
# display results
print(entity_frame)

        Entity Name   Entity Type
0   Central America  ORGANIZATION
1           Germany           GPE
2            Sweden           GPE
3         Caribbean      LOCATION
4       Netherlands           GPE
5             Spain           GPE
6           Oceania           GPE
7            France           GPE
8           Belgium           GPE
9              FIFA  ORGANIZATION
10           Zürich           GPE
11           Europe           GPE
12    South America           GPE
13             Asia           GPE
14            North           GPE
15      Switzerland           GPE
16          Denmark           GPE
17           Africa        PERSON


## Stanford NER

In [13]:
import re
from nltk.tag import StanfordNERTagger
import os
import pandas as pd
import nltk

In [14]:
def parse_document(document):
   document = re.sub('\n', ' ', document)
   if isinstance(document, str):
       document = document
   else:
       raise ValueError('Document is not string!')
   document = document.strip()
   sentences = nltk.sent_tokenize(document)
   sentences = [sentence.strip() for sentence in sentences]
   return sentences

In [15]:
# sample document
text = """
FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, 
Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its 
membership now comprises 211 national associations. Member countries must each also be members of one of 
the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America 
and the Caribbean, Oceania, and South America.
"""

In [16]:
sentences = parse_document(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

In [17]:
# set java path in environment variables
java_path = r'E:\jdk\bin\java.exe'
os.environ['JAVAHOME'] = java_path
# load stanford NER
sn = StanfordNERTagger('E://stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz',
                       path_to_jar='E://stanford-ner-2018-10-16/stanford-ner.jar')

In [18]:
# type(sn.tag(tokenized_sentences[0]))
# sn.tag(tokenized_sentences[0])

In [19]:
# tag sentences
# 区别于NLTK的是Stanford NER把每个词都进行标注，非NE标注O表示Other
ne_annotated_sentences = [sn.tag(sent) for sent in tokenized_sentences]
# 返回值为[[(('FIFA', 'ORGANIZATION'),('was', 'O'),...)],[]...]
# 最内层是一个元组(分词，NE类别),一个小列表由一句话所有词的元组构成，每句话构成一个小列表作为大列表的一个元素
ne_annotated_sentences[0][0][1]

'ORGANIZATION'

In [20]:
# extract named entities
named_entities = []
for sentence in ne_annotated_sentences:
    temp_entity_name = ''
    temp_named_entity = None
    for term, tag in sentence: # term相当于元组中的第一个元素即NE名称，tag相当于元组中的第二个元素即NE类型
        # print(term)
        # print(tag)
        # get terms with NE tags
        if tag != 'O': # 如果NE类型不是Other(选择正确NE类型)
            temp_entity_name = ' '.join([temp_entity_name, term]).strip() # 得到NE名字
            temp_named_entity = (temp_entity_name, tag) # 得到(NE名字,NE类别)元组
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None

In [21]:
# get unique named entities
named_entities = list(set(named_entities))
# store named entities in a data frame
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])
# display results
print(entity_frame)

                Entity Name   Entity Type
0                   Denmark      LOCATION
1                    France      LOCATION
2                    Europe      LOCATION
3                   Oceania      LOCATION
4                   Germany      LOCATION
5           the Netherlands      LOCATION
6                      Asia      LOCATION
7                 Caribbean      LOCATION
8                    Zürich      LOCATION
9                      FIFA  ORGANIZATION
10  North & Central America  ORGANIZATION
11            South America      LOCATION
12                   Sweden      LOCATION
13                     1904          DATE
14                    Spain      LOCATION
15              Switzerland      LOCATION
16                  Belgium      LOCATION
17                   Africa      LOCATION
