In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import sklearn
from sklearn import metrics
import nltk 
from collections import Counter
import matplotlib.pyplot as plt
import itertools
import ast
import jsonlines

nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

# ### BOOTLEG ###
# import import_ipynb
# import LoadEntityProfiles

[nltk_data] Downloading package stopwords to
[nltk_data]     /lfs/1/simran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Bootleg utility functions:
# BY ALIAS: 
def get_candidates(alias):
    try:
        # To get qid candidates of an alias
        cands = LoadEntityProfiles.esp.get_qid_cands(alias)
        print(f"Cands {cands}")
        print([es.get_title(qid) for qid in cands])
    except:
        pass

In [4]:
# Manual User Inputs
model_base = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/saved_results/09082020-bootleg_BASE_dev_rev_ent.csv'
model_alias = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/saved_results/09082020-bootleg_BASE_dev_rev_ent.csv'
model_train = '/dfs/scratch1/simran/bootleg_downstream/error_outputs/train_ent.csv'

In [5]:
LABEL_TO_ID = {'no_relation': 0, 'per:title': 1, 'org:top_members/employees': 2, 'per:employee_of': 3, 
               'org:alternate_names': 4, 'org:country_of_headquarters': 5, 'per:countries_of_residence': 6, 
               'org:city_of_headquarters': 7, 'per:cities_of_residence': 8, 'per:age': 9, 
               'per:stateorprovinces_of_residence': 10, 'per:origin': 11, 'org:subsidiaries': 12, 
               'org:parents': 13, 'per:spouse': 14, 'org:stateorprovince_of_headquarters': 15, 'per:children': 16, 
               'per:other_family': 17, 'per:alternate_names': 18, 'org:members': 19, 'per:siblings': 20, 
               'per:schools_attended': 21, 'per:parents': 22, 'per:date_of_death': 23, 'org:member_of': 24, 
               'org:founded_by': 25, 'org:website': 26, 'per:cause_of_death': 27, 
               'org:political/religious_affiliation': 28, 'org:founded': 29, 'per:city_of_death': 30, 
               'org:shareholders': 31, 'org:number_of_employees/members': 32, 'per:date_of_birth': 33, 
               'per:city_of_birth': 34, 'per:charges': 35, 'per:stateorprovince_of_death': 36, 'per:religion': 37, 
               'per:stateorprovince_of_birth': 38, 'per:country_of_birth': 39, 'org:dissolved': 40, 
               'per:country_of_death': 41}

LABEL_LST = list(LABEL_TO_ID.keys())
STANFORD_NER_TYPES = ['DATE', 'LOCATION', 'MONEY', 'ORGANIZATION', 'PERCENT', 'PERSON', 'TIME']

In [6]:
# Load the model data 
df_results_base = pd.read_csv(model_base)
df_errors_base = df_results_base[df_results_base['relation'] != df_results_base['prediction']]
print("FULL_model size: ", df_results_base.shape)
print("ERRS_model size: ", df_errors_base.shape)

df_results_var = pd.read_csv(model_alias)
df_errors_var = df_results_var[df_results_var['relation'] != df_results_var['prediction']]
print("FULL_model_var size: ", df_results_var.shape)
print("ERRS_model_var size: ", df_errors_var.shape)

FULL_model size:  (22631, 26)
ERRS_model size:  (2596, 26)
FULL_model_var size:  (22631, 26)
ERRS_model_var size:  (2596, 26)


In [35]:
df_results_var.columns.values

array(['obj', 'obj_mentions', 'subj_qids', 'prop_mentions',
       'real_mentions', 'subj_type', 'obj_type', 'id', 'subj_pos',
       'subj_leng', 'obj_qids', 'subj_mentions', 'qids', 'obj_leng',
       'obj_ner', 'stanford_ner', 'subj_ner', 'prediction', 'subj',
       'example', 'mentions', 'separation_dist', 'prop_ner', 'obj_pos',
       'relation', 'num_ner'], dtype=object)

# get the set difference in examples missed

In [7]:
# get the set difference of errors
missed_by_only_base_df = df_errors_base[~df_errors_base['id'].isin(df_errors_var['id'])]
missed_by_only_vari_df = df_errors_var[~df_errors_var['id'].isin(df_errors_base['id'])]
print("The #examples missed_by_only_base_df are: ", missed_by_only_base_df.shape[0])
print("The #examples missed_by_only_vari_df are: ", missed_by_only_vari_df.shape[0])

The #examples missed_by_only_base_df are:  0
The #examples missed_by_only_vari_df are:  0


# Utility functions

In [8]:
nomention = ["['UNK']", "['UNK', 'UNK']", "['UNK', 'UNK', 'UNK']", "['UNK', 'UNK', 'UNK', 'UNK']", 
             "['UNK', 'UNK', 'UNK', 'UNK', 'UNK']"]
def is_nomention(mention):
    return any(null for null in nomention if null in mention)

In [9]:
# text cleanup
def normalize_glove(tokens):
    mapping = {'-LRB-': '(',
                '-RRB-': ')',
                '-LSB-': '[',
                '-RSB-': ']',
                '-LCB-': '{',
                '-RCB-': '}'}
    for i in range(len(tokens)):
        if tokens[i] in mapping:
            #print(tokens)
            tokens[i] = mapping[tokens[i]]
    return tokens

# ACRONYMS COVERAGE

In [10]:
## For a single df of examples, which types of words are comonly missed?

### Missed Acronyms

In [11]:
# Approach 1: assume that orgs tagged by NER as orgs
dict_missed_acronyms = {}
missed_acronyms_lst = []
for index, row in df_results_base.iterrows():
    if is_nomention(row['subj_mentions']):
        subj = row['subj']
        subj = ast.literal_eval(subj)
        if len(subj) == 1 and 'ORGANIZATION' in row['subj_ner']: # assumes acronyms are one "word" and for orgs
            dict_missed_acronyms[subj[0]] = row['subj_mentions']
            missed_acronyms_lst.append(' '.join(subj))
        
    if is_nomention(row['obj_mentions']):
        obj = row['obj']
        obj = ast.literal_eval(obj)
        if len(obj) == 1 and 'ORGANIZATION' in row['obj_ner']:
            dict_missed_acronyms[obj[0]] = row['obj_mentions']
            missed_acronyms_lst.append(' '.join(obj))
        
missed_acronyms = Counter(missed_acronyms_lst).most_common(60)
print(missed_acronyms)

[('millipore', 194), ('wno', 20), ('inbs', 13), ('finmeccanica', 7), ('siic', 5), ('army', 5), ('state', 4), ('navy', 3), ('asms', 2), ('monforts', 2), ('ecota', 2), ('eastern', 2), ('times', 2), ('fignon', 2), ('roskosmos', 2), ('steelworkers', 2), ('pfox', 1), ('airlines', 1), ('green', 1), ('mofcom', 1), ('tassc', 1), ('justice', 1), ('euro15', 1), ('medicine', 1), ('ltd', 1), ('france-info', 1), ('foundation', 1), ('energy', 1), ('union-tribune', 1), ('kaczynskis', 1), ('federation', 1), ('yonhap', 1), ('council', 1), ('gpk', 1), ('journal', 1), ('attenti', 1), ('ucpn-m', 1), ('domingo-cafritz', 1), ('ncdc', 1), ('newcity.com', 1), ('aibi', 1), ('izvestiya', 1), ('eemt', 1), ('hhwi', 1), ('bfnt', 1), ('university', 1), ('chinaco', 1), ('drcm', 1), ('who', 1)]


In [12]:
# Approach 2: assume that orgs are part of the subj and obj of org relations
org_relations = ['org:subsidiaries', 'org:parents', 'org:shareholders', 'org:dissolved', 
 'org:political/religious_affiliation', 'per:religion', 'no_relation']
dict_missed_acronyms = {}
missed_acronyms_lst = []
for index, row in df_results_var.iterrows():
    if is_nomention(row['subj_mentions']):
        subj = row['subj']
        subj = ast.literal_eval(subj)
        if len(subj) == 1 and row['relation'] in org_relations: # assumes acronyms are one "word" and for orgs
            dict_missed_acronyms[subj[0]] = row['subj_mentions']
            missed_acronyms_lst.append(subj[0])
        
    if is_nomention(row['obj_mentions']):
        obj = row['obj']
        obj = ast.literal_eval(obj)
        if len(obj) == 1 and row['relation'] in org_relations:
            dict_missed_acronyms[obj[0]] = row['obj_mentions']
            missed_acronyms_lst.append(obj[0])
        
#print(dict_missed_acronyms)
missed_acronyms = Counter(missed_acronyms_lst).most_common(60)
print(missed_acronyms)

[('he', 2597), ('his', 2500), ('her', 685), ('she', 571), ('him', 457), ('one', 216), ('two', 179), ('millipore', 178), ('tuesday', 110), ('wednesday', 97), ('monday', 95), ('fignon', 88), ('now', 78), ('three', 69), ('u.s.', 59), ('president', 56), ('2003', 55), ('sunday', 52), ('wno', 48), ('five', 47), ('four', 46), ('saturday', 44), ('2005', 41), ('once', 38), ('2006', 35), ('2007', 33), ('2004', 31), ('recently', 31), ('2008', 30), ('cancer', 29), ('current', 28), ('spokesman', 25), ('today', 25), ('january', 24), ('december', 23), ('attack', 23), ('leader', 23), ('us', 23), ('2009', 23), ('future', 23), ('october', 21), ('inbs', 21), ('to', 21), ('100', 20), ('2001', 20), ('years', 20), ('30', 19), ('1983', 19), ('drugs', 19), ('may', 18), ('11', 18), ('1997', 16), ('director', 16), ('november', 16), ('currently', 16), ('north', 16), ('10', 16), ('million', 15), ('america', 15), ('six', 15)]


In [70]:
import json
base_data = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/bootleg_09132020/manual_alias_v2/'

train_file = "{}/train_ent.json".format(base_data)
with open(train_file) as train:
    df_train = json.load(train)
    df_train = pd.DataFrame.from_dict(df_train, orient='columns')
    print(df_train.shape)
    
dev_file = "{}/dev_rev_ent.json".format(base_data)
with open(dev_file) as dev:
    df_dev = json.load(dev)
    df_dev = pd.DataFrame.from_dict(df_dev, orient='columns')
    print(df_dev.shape)
    
test_file = "{}/test_rev_ent.json".format(base_data)
with open(test_file) as test:
    df_test = json.load(test)
    df_test = pd.DataFrame.from_dict(df_test, orient='columns')
    print(df_test.shape)

(68124, 17)
(22631, 17)
(15509, 17)


In [81]:
print(df_train.columns.values)

['id' 'docid' 'relation' 'token' 'subj_start' 'subj_end' 'obj_start'
 'obj_end' 'subj_type' 'obj_type' 'stanford_pos' 'stanford_ner'
 'stanford_head' 'stanford_deprel' 'entity_emb_id' 'ent_id'
 'static_ent_emb_id']


In [118]:
alternate_names_df = df_dev[df_dev['relation'].str.contains('org:alternate_names')]
alternate_names_df.to_json('dev_alternatenames_all.json',orient='d')

ValueError: Invalid value 'row' for option 'orient'

In [124]:
alternate_names1 = {}

for index, row in df_test.iterrows():
    if row['relation'] in 'org:alternate_names':
        add = 0
        tokens = row['token']
        
        ss, se = row['subj_start'], row['subj_end']
        subj = tokens[ss:se+1]
        subj_qid = row['ent_id'][ss:se+1]
        subj_ner = row['stanford_ner'][ss:se+1]
        
        
        os, oe = row['obj_start'], row['obj_end']
        obj = tokens[os:oe+1]
        obj_qid = row['ent_id'][os: oe+1]
        obj_ner = row['stanford_ner'][os: oe+1]
        
        
        if len(subj) == 1:
            if subj_qid[0] != obj_qid[0]:
                add = 1
        elif len(obj) == 1:
            if subj_qid[0] != obj_qid[0]:
                add = 1
        
        if add == 1:
            alternate_names1[row['id']] = {'subj':subj, 'subj_qid':subj_qid, 'obj':obj, 'obj_qid':obj_qid, 'example':tokens}
            print(subj, subj_qid, obj, obj_qid)
            print(row['token'])
            print()
                    
print(len(alternate_names1.items()))


['ADF'] ['Q4746193'] ['Oracle', 'Application', 'Developer', 'Framework'] ['Q19900', 'UNK', 'UNK', 'UNK']
['4GL', 'Development', 'Tools', '-', 'PeopleTools', ',', 'Oracle', 'Application', 'Developer', 'Framework', '-LRB-', 'ADF', '-RRB-', ',', 'Jdeveloper', ',', 'Eclipse', 'or', 'similar', '.']

['Organization', 'of', 'Asia', '-', 'Pacific', 'News', 'Agencies'] ['UNK', 'UNK', 'Q1070940', 'Q1070940', 'Q1070940', 'Q192283', 'Q192283'] ['OANA'] ['Q19609765']
['Trend', ',', 'an', 'Azerbaijani', 'wire', 'service', ',', 'on', 'Thursday', 'became', 'a', 'full', 'member', 'of', 'the', 'Organization', 'of', 'Asia', '-', 'Pacific', 'News', 'Agencies', '-LRB-', 'OANA', '-RRB-', '.']

['ALICO'] ['Q20709176'] ['American', 'Life', 'Insurance', 'Co.'] ['Q1044835', 'Q1044835', 'Q1068361', 'UNK']
['Known', 'as', 'ALICO', ',', 'or', 'the', 'American', 'Life', 'Insurance', 'Co.', ',', 'the', 'unit', 'does', 'its', 'conventional', 'insurance', 'business', 'overseas', 'in', 'more', 'than', '40', 'countries'

['ALICO'] ['Q20709176'] ['American', 'Life', 'Insurance', 'Co'] ['Q1044835', 'Q1044835', 'Q6042385', 'UNK']
['The', 'boards', 'of', 'both', 'companies', 'met', 'Sunday', 'and', 'approved', 'the', 'sale', 'of', 'the', 'AIG', 'unit', ',', 'the', 'American', 'Life', 'Insurance', 'Co', ',', 'known', 'as', 'ALICO', ',', 'the', 'people', 'briefed', 'on', 'the', 'matter', 'said', '.']

['ALICO'] ['Q20709176'] ['American', 'Life', 'Insurance'] ['Q1044835', 'Q1044835', 'Q6042385']
['At', 'the', 'time', ',', 'AIG', 'said', 'it', 'would', 'place', 'American', 'Life', 'Insurance', ',', 'known', 'as', 'ALICO', ',', 'and', 'American', 'International', 'Assurance', ',', 'known', 'as', 'AIA', 'Group', ',', 'into', 'SPVs', '.']

['ALICO'] ['Q4652374'] ['American', 'Life', 'Insurance', 'company'] ['Q607131', 'Q607131', 'Q607131', 'Q607131']
['US', 'life', 'insurance', 'giant', 'MetLife', 'said', 'on', 'Monday', 'it', 'will', 'acquire', 'American', 'International', 'Group', 'unit', 'American', 'Life', 'I

['ALICO'] ['Q20709176'] ['American', 'Life', 'Insurance', 'Company'] ['Q607131', 'Q607131', 'Q607131', 'Q607131']
['In', 'the', 'sale', 'of', 'unit', 'American', 'Life', 'Insurance', 'Company', '-LRB-', 'ALICO', '-RRB-', 'to', 'MetLife', ',', 'Inc', ',', 'AIG', 'raised', 'about', '162', 'billion', 'dollars', ',', 'including', 'about', '72', 'billion', 'dollars', 'in', 'cash', '.']

['Progressive', 'Democrats', 'of', 'America'] ['Q4379875', 'Q4379875', 'Q4379875', 'Q4379875'] ['PDA'] ['Q7136792']
['PDA', ',', 'The', 'Progressive', 'Democrats', 'of', 'America', ',', 'works', 'creatively', 'inside', 'and', 'outside', 'the', 'Democratic', 'Party', '.']

['Alico'] ['Q20709176'] ['American', 'Life', 'Insurance', 'Co.'] ['Q1044835', 'Q1044835', 'Q1068361', 'UNK']
['The', 'troubled', 'insurance', 'giant', ',', 'which', 'has', 'received', 'multiple', 'federal', 'bailouts', 'since', 'September', ',', 'said', 'that', 'it', 'would', 'give', 'the', 'New', 'York', 'Fed', 'preferred', 'stakes', 'in',

['ALICO'] ['Q20709176'] ['American', 'Life', 'Insurance', 'Company'] ['Q607131', 'Q607131', 'Q607131', 'Q607131']
['The', 'acquisition', 'of', 'American', 'Life', 'Insurance', 'Company', '-LRB-', 'ALICO', '-RRB-', 'will', 'cost', 'MetLife', 'some', '6.8', 'billion', 'dollars', 'in', 'cash', 'and', 'another', '8.7', 'billion', 'dollars', 'in', 'equity', 'securities', ',', 'MetLife', 'said', 'in', 'a', 'statement', '.']

['Central', 'American', 'Parliament'] ['Q190023', 'Q190023', 'Q190023'] ['PARLACEN'] ['UNK']
['Panama', 'has', 'submitted', 'a', 'letter', 'of', 'intention', 'to', 'quit', 'the', 'Central', 'American', 'Parliament', '-LRB-', 'PARLACEN', '-RRB-', ',', 'the', 'foreign', 'ministry', 'said', 'Wednesday', '.']

['Organization', 'of', 'Asia-Pacific', 'News', 'Agencies'] ['Q506194', 'Q506194', 'Q506194', 'Q506194', 'Q506194'] ['OANA'] ['Q42531328']
['by', 'Hai', 'Yang', ',', 'Nie', 'Yunpeng', 'Russia', 'and', 'China', 'have', 'been', 'actively', 'developing', 'cooperation', 'wi

['ALICO'] ['Q4652374'] ['American', 'Life', 'Insurance', 'Company'] ['Q607131', 'Q607131', 'Q607131', 'Q607131']
['ALICO', '-LRB-', 'American', 'Life', 'Insurance', 'Company', '-RRB-', '.']

['NRA'] ['Q863259'] ['National', 'Restaurant', 'Association'] ['Q6978094', 'Q6978094', 'Q6978094']
['Suspicions', 'had', 'already', 'fallen', 'on', 'Sheila', "O'Grady", ',', 'who', 'is', 'close', 'with', 'David', 'Axelrod', 'and', 'went', 'straight', 'from', 'being', 'former', 'Chicago', 'mayor', 'Richard', 'M.', 'Daley', "'s", 'chief', 'of', 'staff', 'to', 'president', 'of', 'the', 'Illinois', 'Restaurant', 'Association', '-LRB-', 'IRA', '-RRB-', ',', 'as', 'being', 'the', 'person', 'who', 'dug', 'up', 'Herman', 'Cain', "'s", 'personnel', 'records', 'from', 'the', 'National', 'Restaurant', 'Association', '-LRB-', 'NRA', '-RRB-', '.']

['China', 'Charity', 'Federation'] ['Q148', 'Q708676', 'UNK'] ['CCF'] ['Q15909843']
['The', 'China', 'Charity', 'Federation', '-LRB-', 'CCF', '-RRB-', 'launched', 'a

In [106]:
import json
with open('train_alternatenames.json', 'w') as f:
    for item in alternate_names1.items():
        json.dump(item, f)
        f.write('\n')
    

In [68]:
count = 0
for k, v in alternate_names1.items():
    if k not in alternate_names2:
        #print(k, v)
        #print()
        count += 1
        
print(count)

46


### Susccessful Acronyms by bootleg

In [30]:
dict_successful_acronyms = {}
for index, row in df_results.iterrows():
    subj = ast.literal_eval(row['subj'])
    subj_ment = ast.literal_eval(row['subj_mentions'])
    subj_ment = subj_ment[0].split(' ')
    if len(subj) == 1 and len(subj_ment) > 1: # assumes acronyms are one "word"
        if not is_nomention(row['subj_mentions']): 
            if 'ORGANIZATION' in row['subj_ner']: # restricts to organization acronyms
                dict_successful_acronyms[subj[0]] = row['subj_mentions']
        
    obj = ast.literal_eval(row['obj'])
    obj_ment = ast.literal_eval(row['obj_mentions'])
    obj_ment = obj_ment[0].split(' ')
    if len(obj) == 1 and len(obj_ment) > 1:
        if not is_nomention(row['obj_mentions']):
            if 'ORGANIZATION' in row['obj_ner']:
                dict_successful_acronyms[obj[0]] = row['obj_mentions']
    
print(dict_successful_acronyms)

{'adf': "['Amiga Disk File']", 'nra': "['National Rifle Association']", 'alico': "['Alico (company)']", 'psia': "['Professional Ski Instructors of America &amp; American Association of Snowboard Instructors']", 'countrywide': "['Bank of America Home Loans']", 'gop': "['Republican Party (United States)']", 'anadarko': "['Anadarko, Oklahoma']", 'unwto': "['World Tourism Organization']", 'u.n.': "['International sanctions']", 'coalition': "['Troops Out Now Coalition']", 'xinhua': "['Xinhua News Agency']", 'ndrc': "['National Development and Reform Commission']", 'pata': "['Pacific Asia Travel Association']", 'afp': "['Agence France-Presse']", 'sap': "['SAP SE']", 'fbi': "['Federal Bureau of Investigation']", 'rsl': "['Returned and Services League of Australia']", 'sycamore': "['Sycamore, Illinois']", 'islam': "['Islam and blasphemy']", 'erp': "['SAP ERP']", 'oana': "['Oana Avasilichioaei']", 'nyu': "['New York University']", 'ncai': "['Aitutaki Airport']", 'tpi': "['Screw thread']", 'usai

In [14]:
# post process a bit
delkeys = []
for k, v in dict_successful_acronyms.items():
    value = ast.literal_eval(v)
    value = value[0].lower()
    #print(k, value)
    if k in value:
        delkeys.append(k)

for key in delkeys:
    del dict_successful_acronyms[key]

dict_successful_acronyms

{'iarc': "['International Agency for Research on Cancer']",
 'pbf': "['Pakistan Boxing Federation']",
 'aaas': "['American Academy of Arts and Sciences']",
 'gm': "['General Motors']",
 'baa': "['Heathrow Airport Holdings']",
 'ecc': "['European Conference of Postal and Telecommunications Administrations']",
 'dcns': "['Naval Group']",
 'ihg': "['InterContinental Hotels Group']",
 'sasac': "['State-owned Assets Supervision and Administration Commission']",
 'sipri': "['Stockholm International Peace Research Institute']",
 'usw': "['United Steelworkers']",
 'bpc': '["Black People\'s Convention"]',
 'ap': "['Associated Press']",
 'cla': "['Conjugated linoleic acid']",
 'csp': "['Content Security Policy']",
 'sciri': "['Islamic Supreme Council of Iraq']",
 'soe': "['Special Operations Executive']",
 'wmo': "['World Meteorological Organization']",
 'iwmi': "['International Water Management Institute']",
 'plo': "['Palestine Liberation Organization']",
 'cbrc': "['China Banking Regulatory C

In [15]:
# post process a bit
delkeys = []
for k, v in dict_successful_acronyms.items():
    value = ast.literal_eval(v)
    value = value[0].lower()
    #print(k, value)
    if k in value:
        delkeys.append(k)

for key in delkeys:
    del dict_successful_acronyms[key]

dict_successful_acronyms
print(len(dict_successful_acronyms.keys()))

146


# Looking at pronoun mentions in subj/obj

In [39]:
# Approach 1: assume that orgs tagged by NER as orgs
dict_missed_acronyms = {}
missed_acronyms_lst = []
for index, row in df_errors_var.iterrows():
    if is_nomention(row['subj_mentions']):
        subj = row['subj']
        missed_acronyms_lst.append(subj)
        
    if is_nomention(row['obj_mentions']):
        obj = row['obj']
        missed_acronyms_lst.append(obj)
        
missed_acronyms = Counter(missed_acronyms_lst).most_common(60)
print(missed_acronyms)

[("['his']", 216), ("['he']", 205), ("['her']", 53), ("['she']", 38), ("['him']", 34), ("['election', 'complaints', 'commission']", 16), ("['2004']", 13), ("['murder']", 12), ("['president']", 11), ("['leader']", 11), ("['retailer']", 11), ("['fignon']", 9), ("['drugs']", 8), ("['wednesday']", 7), ("['actress']", 7), ("['spokesman']", 6), ("['southgobi', 'energy', 'resources']", 5), ("['reporter']", 5), ("['armaris']", 5), ("['minister']", 5), ("['star']", 4), ("['millipore']", 4), ("['jong-un']", 4), ("['cancer']", 4), ("['siic']", 4), ("['saturday']", 4), ("['oudeh']", 4), ("['sunday']", 4), ("['u.s.']", 4), ("['deputy']", 4), ("['america']", 4), ("['assault']", 4), ("['karrera', 'sarobe']", 4), ("['lawyer']", 4), ("['sportswriter']", 4), ("['writer']", 3), ("['85-day']", 3), ("['1996']", 3), ("['complications']", 3), ("['escape']", 3), ("['director']", 3), ("['monday']", 3), ("['us']", 3), ("['february']", 3), ("['illness']", 2), ("['2007']", 2), ("['1938']", 2), ("['37']", 2), ("['

# Besides just looking at subj/obj, what is missed as a mention OVERALL in the data?

In [160]:
def get_ner_groups(tokens, ner_tags, qids):
    
    groups_tok = []
    groups_qid = []
    groups_ner = []
    
    prev_is_named = 0
    prev_tag = "O"
    group_tok = []
    group_qid = []
    group_ner = []
    for i in range(len(ner_tags)):
        if is_named_entity(ner_tags[i]) and prev_tag == ner_tags[i]: # continue the group
            prev_is_named = 1
            group_tok.append(tokens[i])
            group_qid.append(qids[i])
            group_ner.append(ner_tags[i])
            
        elif is_named_entity(ner_tags[i]) and prev_is_named: # switch ner types
            groups_tok.append(group_tok)
            groups_qid.append(group_qid)
            groups_ner.append(group_ner)
            
            prev_is_named = 1
            prev_tag = ner_tags[i]
            group_tok = []
            group_qid = []
            group_ner = []
            
            prev_is_named = 1
            group_tok.append(tokens[i])
            group_qid.append(qids[i])
            group_ner.append(ner_tags[i])
            
        elif is_named_entity(ner_tags[i]): # new ner types
            prev_is_named = 1
            prev_tag = ner_tags[i]
            group_tok.append(tokens[i])
            group_qid.append(qids[i])
            group_ner.append(ner_tags[i])
            
        elif prev_is_named == 1: # end ner group
            groups_tok.append(group_tok)
            groups_qid.append(group_qid)
            groups_ner.append(group_ner)
            
            prev_is_named = 0
            prev_tag = "O"
            group_tok = []
            group_qid = []
            group_ner = []
            
        else: # nothing pass
            prev_is_named = 0
            prev_tag = "O"
            
    nomentions = []
    for i in range(len(groups_qid)):
        group_qid = groups_qid[i]
        group_string = ' '.join(groups_tok[i])
        group_ner = groups_ner[i]
        
        ner = group_ner[0]
        qid = group_qid[0]
        
        if qid == "UNK":
            nomentions.append(group_string)
            dict_of_ners_overall[ner].append(group_string)
            if group_string in dict_of_missed_mentions:
                dict_of_missed_mentions[group_string].append(' '.join(tokens))
            else:
                dict_of_missed_mentions[group_string] = [' '.join(tokens)]
                
            if "America" in group_string:
                print(tokens)
                print(ner_tags)
                print(qids)
                print("\n\n")
        
        
#     print(tokens)
#     print(ner_tags)
#     print(qids)
#     print(nomentions)
    return nomentions

In [161]:
def is_named_entity(ner):
    if any(elt for elt in STANFORD_NER_TYPES if elt == ner):
        return 1
    else:
        return 0

In [162]:
train_file = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/bootleg_09132020/basic_full_sentences/static_remap_embs/train_ent.json'
dev_file = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/bootleg_09132020/basic_full_sentences/static_remap_embs/dev_rev_ent.json'
test_file = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/bootleg_09132020/basic_full_sentences/static_remap_embs/test_rev_ent.json'
import json

In [163]:
missed_overall = []
dict_of_ners_overall = {}
dict_of_missed_mentions = {}

for elt in STANFORD_NER_TYPES:
    dict_of_ners_overall[elt] = []

with open(train_file) as infile:
    data_train = json.load(infile)
    print(len(data_train))

    for row in data_train:
        tokens = row['token']
        qids = row['ent_id']
        ner_tags = row['stanford_ner']
        nomentions = get_ner_groups(tokens, ner_tags, qids)
        for string in nomentions:
            missed_overall.append(string)
                
with open(dev_file) as infile:
    data_dev = json.load(infile)
    print(len(data_dev))

    for row in data_dev:
        tokens = row['token']
        qids = row['ent_id']
        ner_tags = row['stanford_ner']
        nomentions = get_ner_groups(tokens, ner_tags, qids)
        for string in nomentions:
            missed_overall.append(string)             
                
                
with open(test_file) as infile:
    data_test = json.load(infile)
    print(len(data_test))

    for row in data_test:
        tokens = row['token']
        qids = row['ent_id']
        ner_tags = row['stanford_ner']
        nomentions = get_ner_groups(tokens, ner_tags, qids)
        for string in nomentions:
            missed_overall.append(string)              
                
missed_overall = Counter(missed_overall).most_common(500)

#print(missed_overall)



68124
['Others', ',', 'such', 'as', 'a', '``', 'letter', 'from', 'the', 'future', "''", 'from', 'the', 'Christian', 'group', 'Focus', 'on', 'the', 'Family', ',', 'warn', 'of', 'an', 'America', 'in', '2012', 'where', '``', 'pornographic', 'magazines', 'are', 'openly', 'displayed', '.', "''"]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATE', 'DATE', 'O', 'O', 'O', 'MISC', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q106039', 'UNK', 'Q1435085', 'Q1435085', 'Q1435085', 'Q1435085', 'UNK', 'Q7422881', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q291', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['To', 'help', 'develop', 'proper', 'screening', 'procedures', ',', 'the', 'TSA', 'formed', 'a', 'disability', 'coalition', 'of', 'at', 'least', '70', 'organizations', 'nationally', ',', 'in

['A', 'group', 'of', 'Honduran', 'journalists', 'has', 'supported', 'the', 'Inter-American', 'Press', 'Association', "'s", '-LRB-', 'IAPA', '-RRB-', 'condemnation', 'of', 'attacks', 'against', 'independent', 'media', 'in', 'Honduras', '.']
['O', 'O', 'O', 'MISC', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O']
['UNK', 'UNK', 'UNK', 'Q783', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q3402475', 'Q3402475', 'UNK', 'UNK', 'Q1626261', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q327591', 'UNK', 'UNK', 'Q783', 'UNK']



['For', 'example', ',', 'for', 'years', 'there', "'s", 'been', 'the', 'venerable', 'American', 'Bankers', 'Association', '-LRB-', 'ABA', '-RRB-', 'AND', 'the', 'more', 'homespun', 'America', "'s", 'Community', 'Bankers', '.']
['O', 'O', 'O', 'O', 'DURATION', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O

['You', 'either', 'go', 'from', 'Lima', 'to', 'Santiago', ',', 'Auckland', ',', 'Melbourne', '-LRB-', 'or', 'Sydney', '-RRB-', 'on', 'Lan', 'Chile', 'flights', ',', 'OR', 'via', 'B.A.', 'on', 'Aerolineas', 'Argentina', 'to', 'Sydney', ',', 'OR', 'via', 'America', 'which', 'takes', 'forever', '.']
['O', 'O', 'O', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'O', 'LOCATION', 'O', 'O', 'LOCATION', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'LOCATION', 'O', 'LOCATION', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'Q2868', 'UNK', 'Q2887', 'UNK', 'Q37100', 'UNK', 'Q3141', 'UNK', 'UNK', 'Q3130', 'UNK', 'UNK', 'Q649178', 'Q649178', 'UNK', 'UNK', 'UNK', 'UNK', 'Q8766', 'UNK', 'Q83535', 'Q83535', 'UNK', 'Q3130', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['``', 'When', 'Irene', 'Morgan', 'boarded', 'a', 'bus', 'for', 'Baltimore', 'in', 'the', 'summer', 'of', '1944', ',', "''", 'the', 'citation', 'read', ',', '`

['He', 'is', 'a', 'brutal', ',', 'murderous', 'dictator', ',', 'leading', 'an', 'oppressive', 'regime', '...', 'He', 'presents', 'a', 'particularly', 'grievous', 'threat', 'because', 'he', 'is', 'so', 'consistently', 'prone', 'to', 'miscalculation', '...', 'And', 'now', 'he', 'is', 'miscalculating', 'America', "'s", 'response', 'to', 'his', 'continued', 'deceit', 'and', 'his', 'consistent', 'grasp', 'for', 'weapons', 'of', 'mass', 'destruction', '...', 'So', 'the', 'threat', 'of', 'Saddam', 'Hussein', 'with', 'weapons', 'of', 'mass', 'destruction', 'is', 'real', '...', "''", '-', 'Sen.', 'John', 'F.', 'Kerry', '-LRB-', 'D', ',', 'MA', '-RRB-', ',', 'Jan.', '23', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATE', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O'

['Standings', ':', 'La', 'Equidad', '35', ';', 'Chico', '32', ';', 'Santa', 'Fe', '31', ';', 'Medellin', '29', ';', 'Envigado', '28', ';', 'Cali', '27', ';', 'America', '26', ';', 'Quindio', ',', 'Cucuta', ',', 'Bucaramanga', '25', ';', 'Millonarios', ',', 'Junior', '24', ';', 'Huila', '23', ';', 'Nacional', '22', ';', 'Once', 'Caldas', ',', 'Pasto', ',', 'Pereira', '19', ';', 'Tolima', '15', '.']
['O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'NUMBER', 'O', 'PERSON', 'NUMBER', 'O', 'ORGANIZATION', 'ORGANIZATION', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'MISC', 'NUMBER', 'O', 'LOCATION', 'NUMBER', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'ORGANIZATION', 'O', 'PERSON', 'NUMBER', 'O', 'LOCATION', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'DATE', 'O', 'O', 'O', 'O', 'PERSON', 'NUMBER', 'O', 'LOCATION', 'NUMBER', 'O']
['UNK', 'UNK', 'Q332668', 'Q332668', 'UNK', 'UNK', 'Q432152', 'UNK', 'UNK', 'Q1424072', 'Q1424072', 'UNK', '

['And', 'it', 'is', 'a', 'story', 'that', 'America', "'s", 'ambassador', 'to', 'Islamabad', 'Joseph', 'Farland', 'not', 'only', 'buys', 'but', 'tries', 'to', 'sell', 'his', 'president', 'in', 'turn', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'LOCATION', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q15726790', 'UNK', 'Q1362', 'Q6286743', 'Q6286743', 'UNK', 'UNK', 'Q503791', 'UNK', 'Q7841450', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['Standings', ':', 'Cucuta', '27', ';', 'Cali', ',', 'Nacional', ',', 'Chico', '23', ';', 'Huila', '22', ';', 'Bucaramanga', '20', ';', 'Santa', 'Fe', ',', 'Medellin', '19', ';', 'Junior', ',', 'Millonarios', '18', ';', 'Pasto', '16', ';', 'Tolima', ',', 'Caldas', '14', ';', 'Cartagena', ',', 'America', '13', ';', 'Quindio', '12', ';', 'Pereira', '9', ';', 'La', 'Equidad', '8', '.']
['O', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'LOCATION',

['The', 'Inter-American', 'Press', 'Association', 'statement', 'came', 'after', 'armed', 'pro-government', 'protesters', 'stormed', 'the', 'Caracas', 'headquarters', 'of', 'Globovision', ',', 'an', 'independent', 'cable', 'television', 'network', 'that', 'has', 'come', 'under', 'intense', 'legal', 'pressure', 'and', 'threats', 'of', 'closure', 'from', 'the', 'Chavez', 'government', '.']
['O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O']
['UNK', 'UNK', 'Q3402475', 'Q3402475', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q1533', 'UNK', 'UNK', 'Q5571087', 'UNK', 'UNK', 'Q79913', 'Q5276046', 'Q5276046', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q5135520', 'UNK', 'UNK', 'Q8440', 'UNK', 'UNK']



['These', 'knowledgeable', 'Americans', 'seem', 'to', 'b

['Inter-American', 'Press', 'Association', 'warned', 'against', '``', 'disturbing', 'signs', 'on', 'the', 'press', 'freedom', 'front', "''", 'in', 'the', 'United', 'States', ',', 'but', 'the', 'group', 'applauded', 'the', 'U.S.', 'Congress', 'for', 'taking', 'up', 'a', 'bill', 'that', 'would', 'give', 'federal', 'protection', 'to', 'reporters', 'asked', 'by', 'prosecutors', 'to', 'identify', 'confidential', 'sources', '.']
['ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'Q3402475', 'Q3402475', 'UNK', 'UNK', 'UNK', 'UNK', 'Q34228', 'UNK', 'UNK', 'Q22688', 'Q22688', 'UNK', 'UNK', 'UNK', 'UNK', 'Q30', 'Q30', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q11268', 'Q11268', 'UNK', 'UNK', 'UNK', 'UNK', 'Q686822', 'UNK', 'UNK', 'UNK', 'UNK', 'U

['In', 'other', 'first-leg', 'matches', ',', 'Sao', 'Paulo', 'edged', 'fellow', 'Brazilians', 'Gremio', '1-0', ',', 'Mexico', "'s", 'America', 'routed', 'Chile', "'s", 'Colo', 'Colo', '3-0', ',', 'and', 'Uruguay', "'s", 'Defensor', 'Sporting', 'drubbed', 'Brazil', "'s", 'Flamengo', '3-0', '.']
['O', 'O', 'O', 'O', 'O', 'LOCATION', 'LOCATION', 'O', 'O', 'MISC', 'ORGANIZATION', 'NUMBER', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'ORGANIZATION', 'ORGANIZATION', 'NUMBER', 'O', 'O', 'LOCATION', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'LOCATION', 'O', 'ORGANIZATION', 'NUMBER', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q38568', 'Q38568', 'UNK', 'UNK', 'Q83459', 'Q221695', 'UNK', 'UNK', 'Q164089', 'UNK', 'UNK', 'UNK', 'Q172025', 'UNK', 'Q207373', 'Q207373', 'UNK', 'UNK', 'UNK', 'Q134916', 'UNK', 'Q844238', 'Q844238', 'UNK', 'Q155', 'UNK', 'Q17479', 'UNK', 'UNK']



['Standings', ':', 'Pasto', ',', 'Nacional', ',', 'La', 'Equidad', ',', '10', ';', 'Caldas', ',', 'Tolima', '9', '

['Millonarios', '-LRB-', 'Colombia', '-RRB-', '2', ',', 'America', '-LRB-', 'Mexico', '-RRB-', '3']
['O', 'O', 'LOCATION', 'O', 'NUMBER', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'NUMBER']
['Q391984', 'UNK', 'Q739', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q96', 'UNK', 'UNK']



['Library', 'Journal', 'names', 'Chelsea', 'District', 'Library', 'the', 'Best', 'Small', 'Library', 'in', 'America', '-LRB-', 'Press', 'Release', 'on', 'Chelsea', 'District', 'Library', "'s", 'website', ',', 'dated', '1/24/08', '-RRB-', '-', 'Added', '1/28/08', 'by', 'Leslie', '2008-01-16T15', ':39:00', 'The', 'Chelsea', 'District', 'Library', 'has', 'been', 'awarded', 'the', 'Library', 'Journal', 'Best', 'Small', 'Library', 'in', 'America', 'award', '!']
['ORGANIZATION', 'ORGANIZATION', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'DATE', 'O'

['The', 'Inter-American', 'Press', 'Association', 'statement', 'came', 'after', 'armed', 'pro-government', 'protesters', 'stormed', 'the', 'Caracas', 'headquarters', 'of', 'Globovision', ',', 'an', 'independent', 'cable', 'television', 'network', 'that', 'has', 'come', 'under', 'intense', 'legal', 'pressure', 'and', 'threats', 'of', 'closure', 'from', 'the', 'Chavez', 'government', '.']
['O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O']
['UNK', 'UNK', 'Q3402475', 'Q3402475', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q1533', 'UNK', 'UNK', 'Q5571087', 'UNK', 'UNK', 'Q79913', 'Q5276046', 'Q5276046', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q5135520', 'UNK', 'UNK', 'Q8440', 'UNK', 'UNK']



['Bought', 'last', 'year', 'by', 'Utah-based', 'EnergySo

['Bought', 'last', 'year', 'by', 'Utah-based', 'EnergySolutions', ',', 'it', 'is', 'now', 'one', 'of', 'three', 'landfills', 'in', 'America', 'for', 'low-level', 'radioactive', 'waste', '.']
['O', 'DATE', 'DATE', 'O', 'MISC', 'ORGANIZATION', 'O', 'O', 'O', 'DATE', 'NUMBER', 'O', 'NUMBER', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q5376884', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q6484519', 'UNK', 'UNK', 'UNK', 'Q6692765', 'Q6692765', 'Q6692765', 'UNK']



['See', 'chapter', 'five', 'of', 'Ferrara', "'s", '``', 'America', "'s", 'Ticking', 'Bankruptcy', 'Bomb', '.', "''"]
['O', 'O', 'NUMBER', 'O', 'PERSON', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'Q13362', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q152074', 'UNK', 'UNK', 'UNK']



['Jane', 'Bolin', ',', 'who', 'was', 'the', 'first', 'black', 'woman', 'to', 'graduate', 'from', 'Yale', 'Law', 'School', 'and', 'became', 'America', "'s", 'first', 'black',

['EnergySolutions', ',', 'LLC', 'received', 'a', '$', '4.3', 'million', '-LRB-', 'euro3', 'million', '-RRB-', 'contract', ';', 'GE-Hitachi', 'Nuclear', 'Americas', ',', 'LLC', ',', 'received', 'a', '$', '4.8', 'million', '-LRB-', 'euro3', '.4', 'million', '-RRB-', 'contract', ';', 'and', 'General', 'Atomics', 'was', 'awarded', 'a', '$', '1.6', 'million', '-LRB-', 'euro1', '.1', 'million', '-RRB-', 'contract', '.']
['ORGANIZATION', 'O', 'ORGANIZATION', 'O', 'O', 'MONEY', 'MONEY', 'MONEY', 'O', 'NUMBER', 'NUMBER', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'MONEY', 'MONEY', 'MONEY', 'O', 'O', 'NUMBER', 'NUMBER', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'MONEY', 'MONEY', 'MONEY', 'O', 'O', 'NUMBER', 'NUMBER', 'O', 'O', 'O']
['Q5376884', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q932442', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q828', 'UNK', 'UNK', 'UNK', 'UNK', 'Q10817', 'Q10817'


['The', 'Inter-American', 'Press', 'Association', ',', 'or', 'IAPA', ',', 'one', 'of', 'the', 'region', "'s", 'largest', 'free', 'speech', 'groups', ',', 'kicks', 'off', 'its', 'semiannual', 'meeting', 'in', 'Caracas', 'on', 'Friday', ',', 'while', 'Chavez', 'backers', 'on', 'Thursday', 'began', 'a', '``', 'Latin', 'American', 'Meeting', 'on', 'Media', 'Terrorism', ',', "''", 'to', 'examine', 'what', 'they', 'call', 'slanted', 'coverage', 'of', 'his', 'government', '.']
['O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'ORGANIZATION', 'O', 'NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'SET', 'O', 'O', 'LOCATION', 'O', 'DATE', 'O', 'O', 'PERSON', 'O', 'O', 'DATE', 'O', 'O', 'O', 'MISC', 'MISC', 'MISC', 'MISC', 'MISC', 'MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'Q3402475', 'Q3402475', 'UNK', 'UNK', 'Q1626261', 'UNK', 'UNK', 'UNK', 'UNK', 'Q24460', 'UNK', 'UNK', 'Q22692', 'Q22692', 'UNK', 'UNK', 'UNK', 'UN

['Former', 'governors', 'Robert', 'McNair', '-LRB-', 'South', 'Carolina', '-RRB-', 'and', 'William', "O'Neill", 'and', 'Thomas', 'Meskill', '-LRB-', 'Connecticut', '-RRB-', 'were', 'remembered', 'by', 'their', 'constituents', ',', 'as', 'were', 'Jane', 'Bolin', ',', 'America', "'s", 'first', 'black', 'female', 'judge', ',', 'and', 'diplomats', 'Joseph', 'Farland', ',', 'Richard', 'Nolte', ',', 'and', 'Philip', 'Kaiser', '.']
['O', 'O', 'PERSON', 'PERSON', 'O', 'LOCATION', 'LOCATION', 'O', 'O', 'PERSON', 'PERSON', 'O', 'PERSON', 'PERSON', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'LOCATION', 'O', 'ORDINAL', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'PERSON', 'PERSON', 'O', 'O', 'PERSON', 'PERSON', 'O']
['UNK', 'Q878323', 'Q2157042', 'Q2157042', 'UNK', 'Q1456', 'Q1456', 'UNK', 'UNK', 'Q880800', 'Q880800', 'UNK', 'Q431804', 'Q431804', 'UNK', 'Q779', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q189760', 'UNK', 'UNK', 'UNK', 'Q4575186', 'Q45

['``', 'I', 'know', 'at', 'the', 'end', 'of', 'the', 'day', ',', 'putting', 'this', 'in', 'God', "'s", 'hands', ',', 'the', 'right', 'thing', 'for', 'America', 'will', 'be', 'done', 'at', 'the', 'end', 'of', 'the', 'day', 'on', 'Nov.', '4', ',', "''", 'the', 'GOP', 'vice', 'presidential', 'nominee', 'told', 'James', 'Dobson', ',', 'head', 'of', 'Focus', 'on', 'the', 'Family', ',', 'in', 'an', 'interview', 'posted', 'Wednesday', 'on', 'the', 'group', "'s", 'website', '.']
['O', 'O', 'O', 'O', 'DATE', 'DATE', 'DATE', 'DATE', 'DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'DATE', 'DATE', 'DATE', 'DATE', 'DATE', 'O', 'DATE', 'DATE', 'O', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'DATE', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q2192312', 

['Among', 'other', 'auctioned', 'items', 'was', 'a', 'photo', 'of', 'Sills', 'as', 'Queen', 'Elizabeth', 'that', 'graced', 'a', '1971', 'cover', 'of', 'Time', 'magazine', ',', 'which', 'called', 'her', '``', 'America', "'s", 'queen', 'of', 'opera', '.', "''"]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'DATE', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q235517', 'UNK', 'Q9682', 'Q9682', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q43297', 'Q43297', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q10633', 'UNK', 'UNK', 'UNK', 'UNK']



['FirstGroup', 'CEO', 'Moir', 'Lockhead', 'said', 'the', 'deal', '``', 'will', 'considerably', 'enhance', 'the', 'group', "'s", 'existing', 'activities', 'in', 'North', 'America', ',', "''", 'namely', 'a', '13', 'percent', 'stake', 'in', 'the', 'North', 'American', 'school', 'bus', 'market',

['UNK', 'UNK', 'Q616380', 'UNK', 'Q499616', 'UNK', 'UNK', 'Q51103', 'UNK', 'Q243766', 'UNK', 'Q332532', 'UNK', 'UNK', 'UNK', 'UNK', 'Q432152', 'UNK', 'Q48278', 'UNK', 'UNK', 'Q657461', 'UNK', 'Q51111', 'UNK', 'UNK', 'Q391984', 'UNK', 'Q1424072', 'Q1424072', 'UNK', 'Q332668', 'Q332668', 'UNK', 'UNK', 'UNK', 'Q320015', 'UNK', 'Q2933726', 'UNK', 'UNK', 'Q13995', 'UNK', 'Q234920', 'UNK', 'UNK']



['``', 'I', 'know', 'at', 'the', 'end', 'of', 'the', 'day', ',', 'putting', 'this', 'in', 'God', "'s", 'hands', ',', 'the', 'right', 'thing', 'for', 'America', 'will', 'be', 'done', 'at', 'the', 'end', 'of', 'the', 'day', 'on', 'Nov.', '4', ',', "''", 'the', 'GOP', 'vice', 'presidential', 'nominee', 'told', 'James', 'Dobson', ',', 'head', 'of', 'Focus', 'on', 'the', 'Family', ',', 'in', 'an', 'interview', 'posted', 'Wednesday', 'on', 'the', 'group', "'s", 'website', '.']
['O', 'O', 'O', 'O', 'DATE', 'DATE', 'DATE', 'DATE', 'DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCAT

['History', ':', '``', 'What', 'Hath', 'God', 'Wrought', ':', 'the', 'Transformation', 'of', 'America', ',', '1815-1848', ',', "''", 'by', 'Daniel', 'Walker', 'Howe', '-LRB-', 'Oxford', 'University', 'Press', '-RRB-']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'DURATION', 'O', 'O', 'O', 'PERSON', 'PERSON', 'PERSON', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q190', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q5219020', 'Q5219020', 'Q5219020', 'UNK', 'Q217595', 'Q217595', 'Q217595', 'UNK']



['Topping', 'off', 'a', 'weekend', 'of', 'hiking', ',', 'fishing', 'and', 'white-water', 'rafting', 'in', 'US', 'national', 'parks', ',', 'the', 'Obamas', 'strolled', 'along', 'the', 'top', 'of', 'the', 'deep', 'red', 'rock', 'gorge', 'in', 'Arizona', 'which', 'is', 'one', 'of', 'America', "'s", 'most', 'famous', 'natural', 'landmarks', '.']
['O', 'O', 'O', '

['Library', 'industry', 'publication', 'recognizes', 'McKune', '-LRB-', 'Ann', 'Arbor', 'New', ',', '1/27/08', '-RRB-', '-', 'Added', '1/28/08', 'by', 'Leslie', 'Chelsea', 'Library', 'wins', 'national', 'honor', '-LRB-', 'Ann', 'Arbor', 'News', ',', '1/24/08', '-RRB-', '-', 'Added', '1/24/08', 'by', 'Leslie', 'LIBRARY', 'JOURNAL', 'ARTICLE', 'Best', 'Small', 'Library', 'in', 'America', '2008', ':', 'Chelsea', 'District', 'Library', '-', 'A', 'Michigan', 'Model', '-LRB-', 'Library', 'Journal', ',', '2/1/08', '-RRB-', '-', 'Added', '1/28/08', 'by', 'Leslie', 'This', 'is', 'a', 'huge', 'honor', ',', 'and', 'a', 'very', 'prestigious', 'accomplishment', ',', 'with', 'a', '$', '15,000', 'prize', '.']
['O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'DATE', 'O', 'O', 'O', 'DATE', 'O', 'PERSON', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'DATE', 'O', 'O', 'O', 'DATE', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'DATE', 'O', 'ORGANIZATION', 'O

['Though', 'deeply', 'pessimistic', 'about', 'the', 'dangers', 'of', 'nuclear', 'confrontation', 'and', 'the', 'unbridgeable', 'gap', 'between', 'rich', 'nations', 'and', 'poor', ',', 'Rorty', 'retained', 'something', 'of', 'Dewey', "'s", 'hopefulness', 'about', 'America', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'LOCATION', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q922019', 'UNK', 'Q200535', 'UNK', 'UNK', 'UNK', 'UNK', 'Q215258', 'UNK', 'UNK', 'UNK', 'Q131805', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['``', 'He', "'s", 'still', 'in', 'danger', 'of', 'being', 'deported', ',', 'and', 'so', 'we', 'want', 'to', 'get', 'him', 'some', 'kind', 'of', 'legal', 'status', ',', "''", 'said', 'Rep.', 'Paul', 'E.', 'Gillmor', ',', 'R-Ohio', ',', 'of', 'his', 'bill', 'on', 'behalf', 'of', 'the', 'German', 'teen', ',', 'Manuel', 'Bartsch', ',', 

['``', 'When', 'Irene', 'Morgan', 'boarded', 'a', 'bus', 'for', 'Baltimore', 'in', 'the', 'summer', 'of', '1944', ',', "''", 'the', 'citation', 'read', ',', '``', 'she', 'took', 'the', 'first', 'step', 'on', 'a', 'journey', 'that', 'would', 'change', 'America', 'forever', '.', "''"]
['O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'DATE', 'DATE', 'DATE', 'DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORDINAL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O']
['UNK', 'UNK', 'Q273158', 'Q273158', 'UNK', 'UNK', 'UNK', 'UNK', 'Q5092', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q471395', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q61509', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['It', 'was', 'not', 'until', 'late', 'in', 'her', 'career', 'that', 'she', 'achieved', 'the', 'pinnacle', ',', 'appearing', 'at', 'New', 'York', "'s", 'Met', ',', 'America', "'s", 'premier', 'opera', 'house', '.']
['O',

['Among', 'his', 'moves', 'at', 'City', 'Opera', ',', 'Mortier', 'commissioned', 'two', 'of', 'America', "'s", 'pre-eminent', 'composers', ',', 'Philip', 'Glass', 'and', 'Charles', 'Wuorinen', ',', 'to', 'write', 'operas', 'for', 'it', '.']
['O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'PERSON', 'O', 'NUMBER', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q464662', 'Q14475832', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q189729', 'Q189729', 'UNK', 'Q961861', 'Q961861', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['Standings', ':', 'Nacional', '10', ';', 'Cucuta', ',', 'Medellin', '9', ';', 'Cali', '8', ';', 'Chico', '7', ';', 'Bucaramanga', '6', ';', 'Junior', ',', 'Tolima', '5', ';', 'Caldas', ',', 'America', ',', 'Pasto', '4', ';', 'Cartagena', ',', 'La', 'Equidad', ',', 'Quindio', '3', ';', 'Pereira', ',', 'Santa', 'Fe', '2', ';', 'Huil

['Former', 'governors', 'Robert', 'McNair', '-LRB-', 'South', 'Carolina', '-RRB-', 'and', 'William', "O'Neill", 'and', 'Thomas', 'Meskill', '-LRB-', 'Connecticut', '-RRB-', 'were', 'remembered', 'by', 'their', 'constituents', ',', 'as', 'were', 'Jane', 'Bolin', ',', 'America', "'s", 'first', 'black', 'female', 'judge', ',', 'and', 'diplomats', 'Joseph', 'Farland', ',', 'Richard', 'Nolte', ',', 'and', 'Philip', 'Kaiser', '.']
['O', 'O', 'PERSON', 'PERSON', 'O', 'LOCATION', 'LOCATION', 'O', 'O', 'PERSON', 'PERSON', 'O', 'PERSON', 'PERSON', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'LOCATION', 'O', 'ORDINAL', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'PERSON', 'PERSON', 'O', 'O', 'PERSON', 'PERSON', 'O']
['UNK', 'Q878323', 'Q2157042', 'Q2157042', 'UNK', 'Q1456', 'Q1456', 'UNK', 'UNK', 'Q880800', 'Q880800', 'UNK', 'Q431804', 'Q431804', 'UNK', 'Q779', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q189760', 'UNK', 'UNK', 'UNK', 'Q4575186', 'Q45

['Standings', ':', 'Cali', '15', ';', 'Cucuta', '14', ';', 'Nacional', ',', 'Bucaramanga', '12', ';', 'Medellin', ',', 'Santa', 'Fe', ',', 'Tolima', '11', ';', 'Chico', '10', ';', 'Junior', '9', ';', 'Caldas', '8', ',', 'Pasto', ',', 'Quindio', ',', 'Cartagena', ',', 'Huila', ',', 'America', '7', ';', 'Millonarios', '6', ';', 'La', 'Equidad', '4', ';', 'Pereira', '3', '.']
['O', 'O', 'O', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'ORGANIZATION', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'LOCATION', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'LOCATION', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'O', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'NUMBER', 'O', 'ORGANIZATION', 'NUMBER', 'O', 'ORGANIZATION', 'ORGANIZATION', 'NUMBER', 'O', 'PERSON', 'NUMBER', 'O']
['UNK', 'UNK', 'Q51103', 'UNK', 'UNK', 'Q616380', 'UNK', 'UNK', 'Q499616', 'UNK', 'Q243766', 'UNK', 'UNK', 'Q48278', 'UNK', 'Q1424072', 'Q1424072', '

22631
['Central', 'to', 'the', 'scheme', 'was', 'Raj', 'Rajaratnam', ',', 'the', 'founder', 'of', 'the', 'Galleon', 'Group', 'and', 'one', 'of', 'America', "'s", 'richest', 'men', ',', 'prosecutors', 'said', '.']
['O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'NUMBER', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q2068029', 'Q2068029', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q3094788', 'Q3094788', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['Georgia', 'Peanut', 'Commission', ',', 'http://wwwgapeanutscom', 'Federal', 'regulators', 'are', 'considering', 'a', 'snack', 'attack', 'on', 'America', "'s", 'airlines', 'that', 'would', 'restrict', 'or', 'even', 'completely', 'ban', 'serving', 'peanuts', 'on', 'commercial', 'flights', '.']
['ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O

['By', '2004', ',', 'he', 'was', 'a', 'senior', 'Al-Qaida', 'operative', 'and', 'became', 'Bin', 'Laden', "'s", 'top', 'propagandist', ',', 'appearing', 'in', 'internet', 'videos', 'calling', 'for', 'the', 'destruction', 'of', 'America', '.']
['O', 'DATE', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q34490', 'UNK', 'UNK', 'UNK', 'Q1317', 'Q1317', 'UNK', 'UNK', 'Q7281', 'UNK', 'UNK', 'UNK', 'Q75', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['These', 'so-called', '``', 'uprates', "''", 'have', 'increased', 'America', "'s", 'nuclear', 'capacity', 'by', 'almost', '5,000', 'MW', 'since', '1977', ',', 'the', 'equivalent', 'of', 'about', 'five', 'new', 'nuclear', 'reactors', ',', 'according', 'to', 'the', 'Nuclear', 'Energy', 'Institute', ',', 'an', 'industry', 'group', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC

['Legendary', 'choreographer', 'Merce', 'Cunningham', ',', 'one', 'of', 'the', 'towering', 'figures', 'of', 'modern', 'dance', ',', 'reached', 'the', 'age', 'of', '90', 'this', 'year', 'still', 'at', 'the', 'pinnacle', 'of', 'America', "'s", 'dance', 'scene', 'and', 'firmly', 'at', 'the', 'helm', 'of', 'his', 'company', '.']
['O', 'O', 'PERSON', 'PERSON', 'O', 'NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'NUMBER', 'DATE', 'DATE', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'Q180856', 'Q318364', 'Q318364', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q21016265', 'Q21016265', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q2309900', 'UNK', 'UNK', 'UNK', 'UNK']



['The', 'Brookings', 'Institution', "'s", 'Metropolitan', 'Policy', 'Program', ',', 'the', 'Regional', 'Plan', 'Association', "'s", 'Ameri

['Global', 'leaders', 'including', 'China', ',', 'Germany', 'and', 'Spain', '``', 'will', 'help', 'deliver', 'the', 'blueprint', ',', 'best', 'practices', 'and', 'master', 'plan', 'for', 'High', 'Speed', 'Rail', 'in', 'America', ',', "''", 'said', 'Thomas', 'A', 'Hart', ',', 'the', 'US', 'High', 'Speed', 'Rail', 'Association', "'s", 'vice', 'president', 'for', 'government', 'affairs', '.']
['O', 'O', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'LOCATION', 'O', 'O', 'O', 'PERSON', 'PERSON', 'PERSON', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'Q148', 'UNK', 'Q183', 'UNK', 'Q29', 'UNK', 'UNK', 'UNK', 'UNK', 'Q1088473', 'Q1088473', 'UNK', 'UNK', 'UNK', 'UNK', 'Q1907895', 'Q1907895', 'UNK', 'Q706198', 'Q706198', 'Q706198', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q8743'


['Three', 'years', 'ago', ',', 'the', 'BPC', 'and', 'Securing', 'America', "'s", 'Future', 'Energy', '-LRB-', 'SAFE', '-RRB-', 'staged', 'another', 'simulation', ',', '``', 'Oil', 'ShockWave', ',', "''", 'which', 'examined', 'US', 'dependence', 'on', 'foreign', 'oil', 'as', 'a', 'national', 'security', 'threat', '.']
['DATE', 'DATE', 'DATE', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'LOCATION', 'O', 'DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q4855360', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q45045', 'UNK', 'UNK', 'Q7081353', 'Q7081353', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q380409', 'UNK', 'UNK', 'Q22656', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['The', 'USW', 'and', 'four', 'other', 'steel', 'industry', 'organizations', ',', 'whose', 'workers', 'and', 'member', 'companies', 'represent', 'all', 'of', 'America', 

15509
['The', 'Palestine', '<', 'http://www.investigativeproject.org/docs/misc/25.pdf', '>', 'Committee', 'included', 'HLF', ',', 'the', 'Islamic', 'Association', 'for', 'Palestine', ',', 'a', 'think', 'tank', 'called', 'the', 'United', 'Association', 'for', 'Studies', 'and', 'Research', 'and', ',', 'later', ',', 'the', 'Council', 'on', 'American', 'Islamic', 'Relations', '-LRB-', 'CAIR', '-RRB-', '.']
['O', 'MISC', 'O', 'NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'ORGANIZATION', 'O', 'O']
['UNK', 'Q219060', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q63614385', 'UNK', 'UNK', 'Q6082221', 'Q6082221', 'Q6082221', 'Q6082221', 'UNK', 'UNK', 'Q155271', 'Q155271', 'UNK', 'UNK', 'Q24534585', 'Q24534585', 'Q24

['L.', 'Todd', 'Kelly', ',', 'one', 'of', 'the', 'lawyers', 'representing', 'Jones', ',', 'said', 'he', 'spent', '11', 'years', 'in', 'the', 'Marines', 'willing', 'to', 'die', 'for', 'the', 'freedom', 'in', 'America', '.']
['PERSON', 'PERSON', 'PERSON', 'O', 'NUMBER', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'DURATION', 'DURATION', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O']
['Q1503', 'Q3530258', 'Q3530258', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q314333', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q11218', 'Q53838052', 'Q590701', 'Q590701', 'Q590701', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']



['The', 'Palestine', '<', 'http://www.investigativeproject.org/docs/misc/25.pdf', '>', 'Committee', 'included', 'HLF', ',', 'the', 'Islamic', 'Association', 'for', 'Palestine', ',', 'a', 'think', 'tank', 'called', 'the', 'United', 'Association', 'for', 'Studies', 'and', 'Research', 'and', ',', 'later', ',', 'the', 'Council', 'on', 'American

['Would', 'you', 'question', 'Dr.', 'Sunder', "'s", 'competence', 'if', 'he', 'had', 'written', 'a', 'paper', '-LRB-', 'which', 'he', 'had', 'claimed', 'to', 'be', '``', 'scientific', "''", '-RRB-', 'about', 'Jesus', 'roaming', 'around', 'in', 'pre-colonial', 'America', '?']
['O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'LOCATION', 'O']
['UNK', 'UNK', 'UNK', 'UNK', 'Q7505945', 'UNK', 'Q1346205', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q302', 'Q680016', 'UNK', 'UNK', 'Q7167', 'UNK', 'UNK']



['9/11', '--', 'ITALIAN', 'SAYS', '9-11', 'SOLVED', '--', 'Staff', 'of', 'American', 'Free', 'Press', ',', 'Fri', 'Dec', '14', '23:28', 'ITALIAN', 'SAYS', '9-11', 'SOLVED', 'It', "'s", 'common', 'knowledge', ',', 'he', 'reveals', ',', 'CIA', ',', 'Mossad', 'behind', 'terror', 'attacks', 'By', 'the', 

['Betty', 'was', 'a', 'member', 'of', 'many', 'organizations', 'including', 'the', 'American', 'Association', 'of', 'University', 'Women', ',', 'the', 'Alpha', 'Delta', 'Kappa', 'International', 'Women', 'Educators', 'Sorority', ',', 'Retired', 'Teachers', 'of', 'America', 'and', 'the', 'First', 'Presbyterian', 'Church', '.']
['PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'ORGANIZATION', 'O']
['Q213122', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q463416', 'Q463416', 'Q463416', 'Q463416', 'Q463416', 'UNK', 'UNK', 'Q5225683', 'Q5225683', 'Q14401', 'UNK', 'UNK', 'UNK', 'Q3721', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q178169', 'Q178169', 'Q178169', 'UNK']



['-LRB-', 'The', 'State', 'of',

['The', 'remarks', 'from', 'Angelo', 'Mozilo', 'of', 'Countrywide', ',', 'America', "'s", 'largest', 'mortgage', 'lender', ',', 'and', 'Jeffrey', 'Mezger', 'of', 'KB', 'Home', ',', 'one', 'of', 'the', 'biggest', 'homebuilders', ',', 'came', 'during', 'a', 'panel', 'discussion', 'hosted', 'by', 'the', 'Milken', 'Institute', 'on', 'the', 'economic', 'impact', 'of', 'the', 'subprime', 'mortgage', 'meltdown', '.']
['O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'ORGANIZATION', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'PERSON', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['UNK', 'UNK', 'UNK', 'Q536229', 'Q536229', 'UNK', 'Q1137260', 'UNK', 'UNK', 'UNK', 'UNK', 'Q1210094', 'Q1210094', 'UNK', 'UNK', 'Q313489', 'Q16732537', 'UNK', 'Q6326486', 'Q6326486', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'Q2100278

In [129]:
for k, v in dict_of_ners_overall.items():
    dict_of_ners_overall[k] = set(v)
    print(k, len(set(v)))

print(dict_of_ners_overall)

DATE 3317
LOCATION 471
MONEY 2402
ORGANIZATION 1501
PERCENT 862
PERSON 862
TIME 673
{'DATE': {'1:52.09', 'July 2002', 'her 15th birthday', '1917', 'recent years the day', 'a decade later', 'May 3', '1958', 'the first three months of the year', '20th - century', '04-29-10', '28 August 2007', 'this coming weekend', '1248', 'the end of June', 'the early hours of July 12', 'Three months ago', '30-28', '1942 and 1944', 'the min', 'the whole year of 2009', 'January 28 , 1986', '1985 to 1991', 'the 29th of June', 'August 17', 'March 1986', 'August 11 , 2007', 'April 09 , 2008', 'a couple of years ago', 'the full fiscal year', 'the whole year of 2007', 'last Thursday', 'July 10', 'January 14-18', 'January 15 , 2009', 'October 2005', 'the final quarter of 2008', '1963', 'Wednesday , Jan 3 , 2007', 'more than three decades later', 'March 23', 'last month', 'September 11 , 1926', 'a century ago', '17.11.08', 'the quarter ended June 30', 'March 1994', '11/9/07', 'the previous week', 'mid-December'

In [144]:
for key in missed_overall:
    print("-\n")
    print(key[0])
    limit = 5
    for elt in dict_of_missed_mentions[key[0]]:
        print(elt)
        limit -= 1
        if limit == 0:
            break

print(dict_of_ners_overall)

-

Wednesday
If the strike ends soon , ABS president Steve McPherson has said he would prefer to leave Wednesday nights intact as the entire line-up of freshman shows is doing quite well in which case they may have to find a new slot for `` LOST . ''
NASA on Wednesday launched the Ares I-X rocket in Kennedy Space Center in Florida .
The year 2008 is expected to be one of slow economic growth for almost all major economies and Taiwan is no exception with an economic growth rate estimated at 4.23 percent , the head of the Taiwan Research Institute -LRB- TRI -RRB- said Wednesday .
`` Critics make the presumption that people switch , but people have their preferences , which is why companies are constantly trying different innovations on products like Coke Zero and Pepsi Max , '' Kevin Keane , a spokesman for the American Beverage Association , said Wednesday .
ANAHEIM , Calif. -- Five things to look for as the Angels and Red Sox begin their American League Division Series at Angel Stadium

July 1
Philip Chen , 51 , will be appointed chairman of John Swire & Sons -LRB- China -RRB- Ltd. on July 1 , but he will remain at Cathay Pacific as a nonexecutive deputy chairman , Swire said in a statement Thursday .
Philip Chen , 51 , will be appointed chairman of John Swire & Sons -LRB- China -RRB- Ltd. on July 1 , but he will remain at Cathay Pacific as a nonexecutive deputy chairman , Swire said in a statement Thursday .
Czech President Vaclav Klaus appointed Czech National Bank -LRB- CNB -RRB- vice-governor Miroslav Singer on Friday as the central bank governor from July 1 .
Czech President Vaclav Klaus appointed Czech National Bank -LRB- CNB -RRB- vice-governor Miroslav Singer on Friday as the central bank governor from July 1 .
Philip Chen , 51 , will be appointed chairman of John Swire & Sons -LRB- China -RRB- Ltd on July 1 , but he will remain at Cathay Pacific as a nonexecutive deputy chairman , Swire said in a statement Thursday .
-

Recently
Recently the Supreme Court rul

# EXTRACT SUBJ/OBJ THAT BOOTLEG DOESN'T FIND MENTIONS FOR  (e.g., 'america', 'american')

In [None]:
no_mention = []
for index, row in df_errors.iterrows():
    if any(null for null in nomention if null in row['subj_mentions']): 
        subj = row['subj']
        subj = ast.literal_eval(subj)
        subj_str = ' '.join(subj)
        if row['subj_ner'] != "['O']":
            no_mention.append(subj_str)

    if any(null for null in nomention if null in row['obj_mentions']):
        obj = row['obj']
        obj = ast.literal_eval(obj)
        obj_str = ' '.join(obj)
        if row['obj_ner'] != "['O']":
            no_mention.append(obj_str)

# LOOK AT SUBJ/OBJ that FREQUENTLY RECEIVE NO BOOTLEG MENTION: 
counts = Counter(no_mention).most_common(200) 
no_mention_frequent = []
for tup in counts:
    no_mention_frequent.append(tup[0])
    
print("IN JUST THE ERROR EXAMPLES, THE SUBJ AND OBJ THAT FREQUENTLY RECEIVE NO BOOTLEG MENTION ARE: \n")
print(counts)

In [None]:
no_mention_dict = {}
for mention in set(no_mention_frequent):
    no_mention_dict[mention] = []
    
for index, row in df_results.iterrows():
    if any(null in row['subj_mentions'] for null in nomention):
        subj = row['subj']
        subj = ast.literal_eval(subj)
        subj_str = ' '.join(subj)
        if subj_str in no_mention_dict:
            no_mention_dict[subj_str].append(row['id'])
    if any(null in row['obj_mentions'] for null in nomention):
        obj = row['obj']
        obj = ast.literal_eval(obj)
        obj_str = ' '.join(obj)
        if obj_str in no_mention_dict:
            no_mention_dict[obj_str].append(row['id'])

print("IN THE FULL SET OF EXAMPLES, THE SUBJ AND OBJ THAT FREQUENTLY RECEIVE NO BOOTLEG MENTION ARE: \n")
print(no_mention_dict.keys())

### (@ Laurel) by changing the mention here, you can view examples where the mention is 'UNK' in Bootleg

In [None]:
MISSED_MENTION = 'america' # CHANGE THIS TO LOOK AT DIFFERENT MISSED SUBJ/OBJ @LAUREL
for index, row in df_errors_base.iterrows():
    idx = row['id']
    subj = row['subj']
    obj = row['obj']
    if idx in no_mention_dict[MISSED_MENTION] or idx in no_mention_dict[MISSED_MENTION]: 
        print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction', 'mentions']])
        print()