In [1]:
# https://github.com/mit-nlp/MITIE/blob/master/examples/python/ner.py

In [8]:
import os
import sys

# Make the MITIE Python bindings importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate copies of the same directory onto sys.path.
MITIE_LIB_DIR = '/home/jdm/Documents/MITIE/mitielib'
if MITIE_LIB_DIR not in sys.path:
    sys.path.append(MITIE_LIB_DIR)

In [9]:
from mitie import *
from collections import defaultdict

# English

In [10]:
# Load the English NER model from disk, then show which tags it can emit.
print("loading NER model...")
ner = named_entity_extractor('/home/jdm/Documents/MITIE/MITIE-models/english/ner_model.dat')
ner_tags = ner.get_possible_ner_tags()
print("\nTags output by this NER model: {}".format(ner_tags))

loading NER model...

Tags output by this NER model: ['PERSON', 'LOCATION', 'ORGANIZATION', 'MISC']


In [11]:
# Read the whole input file and split it into the token list used by every
# later cell in this section.
article_text = load_entire_file('huffpost.txt')
tokens = tokenize(article_text)
print("Tokenized input: {}".format(tokens))

Tokenized input: ['A', 'new', 'Des', 'Moines', 'Register/Bloomberg', 'Politics', 'Iowa', 'Poll', 'shows', 'Sen', '.', 'Bernie', 'Sanders', '(', 'I-Vt', ')', 'only', '7', 'points', 'behind', 'Hillary', 'Clinton', 'in', 'the', 'race', 'for', 'the', 'Iowa', 'caucus', ',', 'a', 'worrying', 'sign', 'for', 'the', 'Democratic', 'frontrunner', '.', 'Clinton', 'leads', 'with', '37', 'percent', 'of', 'likely', 'Democratic', 'caucusgoers', ',', 'with', 'Sanders', 'following', 'at', '30', 'percent', '.', 'As', 'Clinton', "'s", 'campaign', 'struggles', 'to', 'counter', 'negative', 'press', 'from', 'her', 'ongoing', 'email', 'controversy', ',', 'Sanders', 'has', 'energized', 'liberal', 'Democrats', 'with', 'impassioned', 'talk', 'of', 'political', 'revolution', '.', 'According', 'to', 'the', 'poll', ',', '96', 'percent', 'of', 'Sanders', 'supporters', 'said', 'they', 'support', 'him', 'for', 'his', 'ideas', ',', 'while', 'two', 'percent', 'said', 'their', 'support', 'lies', 'mostly', 'in', 'the', 'f

In [12]:
# Run the NER model over the tokens and report the raw result plus its size.
entities = ner.extract_entities(tokens)
entity_count = len(entities)
print("\nEntities found: {}".format(entities))
print("\nNumber of entities detected: {}".format(entity_count))


Entities found: [(xrange(2, 4), 'LOCATION', 0.5787063827826189), (xrange(11, 13), 'PERSON', 1.609791817768582), (xrange(14, 15), 'ORGANIZATION', 0.15308382940440127), (xrange(20, 22), 'PERSON', 1.1606512033927832), (xrange(27, 28), 'LOCATION', 1.0285458269016259), (xrange(35, 36), 'MISC', 1.1192519125942966), (xrange(38, 39), 'PERSON', 1.2148000906666838), (xrange(45, 46), 'MISC', 1.08173265323975), (xrange(49, 50), 'PERSON', 0.910786082179623), (xrange(56, 57), 'PERSON', 1.2816359618203856), (xrange(70, 71), 'PERSON', 1.1703734164855595), (xrange(74, 75), 'MISC', 0.6758088724975834), (xrange(90, 91), 'PERSON', 0.7118674448509008), (xrange(116, 117), 'PERSON', 1.3339312926160005), (xrange(124, 126), 'PERSON', 1.7778538116900533), (xrange(141, 142), 'PERSON', 1.210804973119619), (xrange(162, 163), 'PERSON', 1.2617324590991177), (xrange(172, 174), 'LOCATION', 0.6882076067551202), (xrange(175, 178), 'ORGANIZATION', 0.3769465400595685), (xrange(197, 200), 'PERSON', 1.2276059683956866), (x

In [13]:
# entities is a list of tuples, each containing an xrange that indicates which
# tokens are part of the entity, the entity tag, and an associated score.  The
# entities are also listed in the order they appear in the input text file.
# Here we just print the score, tag, and text for each entity to the screen.
# The larger the score the more confident MITIE is in its prediction.
#
# The token span is bound to `token_range` rather than `range`: assigning to
# `range` at notebook scope would shadow the builtin for every later cell.
for token_range, tag, score in entities:
    entity_text = " ".join(tokens[i] for i in token_range)
    print("   Score: {:0.3f}: {}: {}".format(score, tag, entity_text))

   Score: 0.579: LOCATION: Des Moines
   Score: 1.610: PERSON: Bernie Sanders
   Score: 0.153: ORGANIZATION: I-Vt
   Score: 1.161: PERSON: Hillary Clinton
   Score: 1.029: LOCATION: Iowa
   Score: 1.119: MISC: Democratic
   Score: 1.215: PERSON: Clinton
   Score: 1.082: MISC: Democratic
   Score: 0.911: PERSON: Sanders
   Score: 1.282: PERSON: Clinton
   Score: 1.170: PERSON: Sanders
   Score: 0.676: MISC: Democrats
   Score: 0.712: PERSON: Sanders
   Score: 1.334: PERSON: Clinton
   Score: 1.778: PERSON: Joe Biden
   Score: 1.211: PERSON: Clinton
   Score: 1.262: PERSON: Clinton
   Score: 0.688: LOCATION: Des Moines
   Score: 0.377: ORGANIZATION: Politics Iowa Poll
   Score: 1.228: PERSON: J. Ann Selzer
   Score: 0.901: ORGANIZATION: Selzer & Co
   Score: 1.223: PERSON: Clinton
   Score: 1.226: PERSON: Michele Bachmann
   Score: 0.955: MISC: Republican
   Score: 1.074: PERSON: Mitt Romney
   Score: 1.506: LOCATION: Iowa
   Score: 0.688: PERSON: Bachmann
   Score: 0.811: MISC: Democrat

In [14]:
# Next we try one of MITIE's binary relation detectors.  MITIE ships with
# several kinds of relation detector, along with tools for training your own.
# Here we just load a ready-made one: the "person born in place" detector.
PLACE_OF_BIRTH_MODEL = "/home/jdm/Documents/MITIE/MITIE-models/english/binary_relations/rel_classifier_people.person.place_of_birth.svm"
rel_detector = binary_relation_detector(PLACE_OF_BIRTH_MODEL)


In [15]:
# Start by pairing every entity with the entity that follows it.  These pairs
# are the candidates we will hand to the relation detector to ask whether any
# of them is an instance of the "person born in place" relation.
entity_spans = [entity[0] for entity in entities]
adjacent_pairs = list(zip(entity_spans, entity_spans[1:]))
# Add each pair again with its members swapped.  A "person born in place"
# mention can also be phrased as "place is birthplace of person", so both
# argument orders have to be considered.
neighboring_entities = adjacent_pairs + [(second, first) for (first, second) in adjacent_pairs]

In [16]:
# With the candidate list built, test every entity pair and report the ones
# the detector accepts.
for person, place in neighboring_entities:
    # MITIE detection is a two-step process.  Step one converts the entity
    # pair into a special intermediate representation.
    rel = ner.extract_binary_relation(tokens, person, place)
    # Step two asks the detector to classify that representation.  A score
    # above 0 means a relation was found, and a larger score means higher
    # confidence.  The split exists so the intermediate rel can be reused
    # across many different relation detectors without repeating the work
    # done in extract_binary_relation().
    score = rel_detector(rel)
    # Skip pairs the detector rejects; only matches get printed.
    if not (score > 0):
        continue
    person_text = " ".join(tokens[idx] for idx in person)
    birthplace_text = " ".join(tokens[idx] for idx in place)
    print("{} BORN_IN {}".format(person_text, birthplace_text))

In [17]:
# The code above shows the basic details of MITIE's relation detection API.
# However, it is important to note that real-world data is noisy and confusing.
# Not all detected relations will be correct.  Therefore, it's important to
# aggregate many relation detections together to get the best signal out of
# your data.  A good way to do this is to pick an entity you are interested in
# (e.g. Benjamin Franklin), find all the relations that mention that entity,
# and order them from most frequent to least frequent.  The code below does
# exactly that.
query = "Clinton"
hits = defaultdict(int)

for person, place in neighboring_entities:
    rel = ner.extract_binary_relation(tokens, person, place)
    score = rel_detector(rel)
    # Only pairs the detector accepts (score above 0) can contribute a hit.
    if not (score > 0):
        continue
    person_text = " ".join(tokens[idx] for idx in person)
    birthplace_text = " ".join(tokens[idx] for idx in place)
    # Count a birthplace only when the person text is exactly the query.
    if person_text == query:
        hits[birthplace_text] += 1

print("\nTop most common relations:")
# Most frequently claimed birthplace first.
ranked_hits = sorted(hits.items(), key=lambda item: item[1], reverse=True)
for place, count in ranked_hits:
    print("{} relations claiming {} was born in {}".format(count, query, place))


Top most common relations:


# Spanish

In [20]:
# Load the Spanish NER model from disk (this rebinds `ner`), then show which
# tags it can emit.
print("loading NER model...")
ner = named_entity_extractor('/home/jdm/Documents/MITIE/MITIE-models/spanish/ner_model.dat')
ner_tags = ner.get_possible_ner_tags()
print("\nTags output by this NER model: {}".format(ner_tags))

loading NER model...

Tags output by this NER model: ['LOCATION', 'ORGANIZATION', 'PERSON', 'MISC']


In [21]:
# Read the whole input file and split it into tokens (this rebinds `tokens`).
article_text = load_entire_file('noticel.txt')
tokens = tokenize(article_text)
print("Tokenized input: {}".format(tokens))

Tokenized input: ['El', 'Departamento', 'de', 'Salud', 'se', 'debe', 'limitar', 'a', 'regular', ',', 'no', 'monopolizar', ',', 'el', 'cultivo', 'y', 'distribuci\xc3\xb3n', 'de', 'marihuana', 'medicinal', 'mediante', 'la', 'emisi\xc3\xb3n', 'de', 'licencias', 'a', 'entidades', 'privadas', '.', 'Esa', 'fue', 'la', 'primera', 'recomendaci\xc3\xb3n', 'hecha', 'por', 'Chloe', 'Villano', 'durante', 'las', 'vistas', 'p\xc3\xbablicas', 'organizadas', 'por', 'el', 'Departamento', 'de', 'Salud', 'para', 'discutir', 'la', 'creaci\xc3\xb3n', 'de', 'un', 'c\xc3\xb3digo', 'que', 'regule', 'toda', 'la', 'industria', 'naciente', 'de', 'la', 'marihuana', 'medicinal', 'desde', 'su', 'cultivo', 'hasta', 'su', 'consumo', '.', 'Chloe', 'Villano', 'es', 'la', 'due\xc3\xb1a', 'y', 'fundadora', 'de', 'Clover', 'Leaf', 'Consulting', ',', 'una', 'empresa', 'que', 'a', 'trav\xc3\xa9s', 'de', 'los', 'a\xc3\xb1os', 'se', 'ha', 'convertido', 'en', 'una', 'de', 'las', 'm\xc3\xa1s', 'importantes', 'empresas', 'de', '

In [22]:
# Run the NER model over the tokens and report the raw result plus its size.
entities = ner.extract_entities(tokens)
entity_count = len(entities)
print("\nEntities found: {}".format(entities))
print("\nNumber of entities detected: {}".format(entity_count))


Entities found: [(xrange(1, 4), 'ORGANIZATION', 1.3366524210836725), (xrange(36, 38), 'PERSON', 0.9492931680584611), (xrange(41, 42), 'MISC', 0.504739755962439), (xrange(45, 48), 'ORGANIZATION', 1.0834818272251523), (xrange(72, 74), 'PERSON', 0.8660110833516398), (xrange(80, 83), 'PERSON', 0.31842279160268355), (xrange(110, 111), 'PERSON', 0.5562643447516701), (xrange(197, 198), 'PERSON', 0.4586737874118581), (xrange(205, 207), 'LOCATION', 0.8586481099601169), (xrange(210, 211), 'LOCATION', 0.26606527191122015), (xrange(216, 218), 'LOCATION', 0.4503685200972449), (xrange(219, 220), 'ORGANIZATION', 0.18259764846324697), (xrange(229, 231), 'ORGANIZATION', 0.9769760156225974), (xrange(247, 248), 'PERSON', 0.6024071840913054), (xrange(261, 262), 'LOCATION', 0.5749381328214171), (xrange(284, 285), 'LOCATION', 1.0152696067231652), (xrange(302, 303), 'ORGANIZATION', 1.5331246010075268), (xrange(355, 358), 'ORGANIZATION', 1.111589922362682), (xrange(363, 366), 'ORGANIZATION', 1.26402269723714

In [23]:
# entities is a list of tuples, each containing an xrange that indicates which
# tokens are part of the entity, the entity tag, and an associated score.  The
# entities are also listed in the order they appear in the input text file.
# Here we just print the score, tag, and text for each entity to the screen.
# The larger the score the more confident MITIE is in its prediction.
#
# The token span is bound to `token_range` rather than `range`: assigning to
# `range` at notebook scope would shadow the builtin for every later cell.
for token_range, tag, score in entities:
    entity_text = " ".join(tokens[i] for i in token_range)
    print("   Score: {:0.3f}: {}: {}".format(score, tag, entity_text))

   Score: 1.337: ORGANIZATION: Departamento de Salud
   Score: 0.949: PERSON: Chloe Villano
   Score: 0.505: MISC: públicas
   Score: 1.083: ORGANIZATION: Departamento de Salud
   Score: 0.866: PERSON: Chloe Villano
   Score: 0.318: PERSON: Clover Leaf Consulting
   Score: 0.556: PERSON: Villano
   Score: 0.459: PERSON: Villano
   Score: 0.859: LOCATION: Puerto Rico
   Score: 0.266: LOCATION: NotiCel
   Score: 0.450: LOCATION: Puerto Rico
   Score: 0.183: ORGANIZATION: haya
   Score: 0.977: ORGANIZATION: Estados Unidos
   Score: 0.602: PERSON: Villano
   Score: 0.575: LOCATION: Caribe
   Score: 1.015: LOCATION: Colorado
   Score: 1.533: ORGANIZATION: Gobierno
   Score: 1.112: ORGANIZATION: Departamento de Salud
   Score: 1.264: ORGANIZATION: Departamento de Salud
   Score: 0.689: PERSON: Sherman
   Score: 1.087: MISC: Ley de Sustancias Controladas
   Score: 0.333: LOCATION: . Además
   Score: 1.527: ORGANIZATION: Gobierno
   Score: 0.612: LOCATION: Puerto Rico
   Score: 0.444: PERSON: 