In [1]:
import numpy as np
from tqdm import tqdm
import os
import pickle
import re
from pathlib import Path
from collections import defaultdict
import ujson

Load alias map to filter.

In [13]:
from bootleg.symbols.entity_symbols import EntitySymbols

# Root of the data dump this notebook filters; the count files and the output
# files written later are all resolved relative to it.
root_dir = Path("/dfs/scratch0/lorr1/projects/bootleg-data/data/squad_0310")
entity_dump = EntitySymbols.load_from_cache(load_dir=root_dir / "entity_db/entity_mappings")

# Wikidata aliases, only consulted (via membership tests) by the "wikidata"
# filtering strategy. Read inside a `with` block so the file handle is closed
# as soon as the JSON has been parsed instead of being left to the GC.
with open("/dfs/scratch0/lorr1/projects/platelet-data/auxiliary/wikidata_aliases_1216.json") as wikidata_aliases_file:
    wikidata_aliases = ujson.load(wikidata_aliases_file)

In [25]:
# Strategy used further down to choose which aliases get dropped:
#   "filter"  -> use the Wikipedia count statistics (get_aliases_to_remove)
#   otherwise -> drop aliases that are missing from wikidata_aliases
#                (get_aliases_by_wikidata); the intended spelling is "wikidata"
DUMP_METHOD = "filter"

In [26]:
# Alias -> candidates mapping of the loaded entity dump; this is the map that gets filtered.
# Judging by the debug output later in the notebook, each value looks like a list of
# [QID, count] pairs -- TODO confirm against EntitySymbols.
curr_aliases = entity_dump.get_alias2qids()

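Load type mappings for adding back in countries. (NOTE: no cell in this notebook currently performs this load; `types_coarse`, which the debug cell below reads, has to be defined before that cell is run. TODO: restore the loading cell.)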

Load count files for all of wikipedia --- these were computed with `compute_statistics.py` (in utils/preprocessing) over the merged data file of test, dev, and train.

In [15]:
# Both tables are read inside `with` blocks so the file handles are closed
# right after parsing instead of being leaked.

# number of times alias phrase occurs in the text across ALL of wikipedia
with open(root_dir / 'stats/alias_text_counts.json') as alias_text_counts_file:
    alias_text_counts = ujson.load(alias_text_counts_file)

# number of times alias occurs as an alias across ALL of wikipedia
with open(root_dir / 'stats/alias_counts.json') as alias_counts_file:
    alias_counts = ujson.load(alias_counts_file)

Simple function to find aliases to remove based on the count files above.

In [27]:
def get_norm_value(alias, verbose=False):
    """
    Ratio of how often `alias` is labelled as an alias to how often the phrase
    occurs in text, read from the module-level `alias_counts` and
    `alias_text_counts` tables.

    Args:
        alias: alias string to look up.
        verbose: when True, print both raw counts (0 for a missing entry).

    Returns:
        alias_counts / alias_text_counts for the alias (a missing alias count
        is treated as 0), or -1 when the alias has no text count at all.
    """
    times_as_alias = alias_counts.get(alias, 0)
    if verbose:
        print('# times occurs as alias:', times_as_alias)
        print('# times occurs in text:', alias_text_counts.get(alias, 0))
    # No text count -> the ratio is undefined; signal that with -1.
    if alias not in alias_text_counts:
        return -1
    return times_as_alias / alias_text_counts[alias]

def get_aliases_by_wikidata(curr_aliases, wikidata_aliases):
    """
    Collect every current alias that is not a Wikidata "also known as" alias.

    Args:
        curr_aliases: iterable of the aliases currently in the alias map.
        wikidata_aliases: container of Wikidata aliases (membership-tested).

    Returns:
        Set of aliases from `curr_aliases` that are absent from `wikidata_aliases`.
    """
    # tqdm wraps the iteration only to show progress over the alias map.
    return {alias for alias in tqdm(curr_aliases) if alias not in wikidata_aliases}

def get_aliases_to_remove(curr_aliases, norm_threshold=0.017, min_seen=500, min_alias_count=10000):
    """
    Select aliases which are frequent words but infrequent aliases due to rarity
    or mislabel (e.g. band "themselves").

    Decisions are based on the module-level `alias_counts` (times labelled as an
    alias) and `alias_text_counts` (times the phrase occurs in text) tables.

    Args:
        curr_aliases: mapping from alias to its candidates; it is iterated for
            the aliases and indexed to count the candidates of an alias.
        norm_threshold: only aliases whose `get_norm_value` is below this value
            are considered by the ratio filter.
        min_seen: text-occurrence cutoff below/above which decisions are made.
        min_alias_count: aliases labelled at least this many times survive the
            ratio filter.

    Returns:
        Tuple `(aliases_to_remove, cnts, grps)`: the set of aliases to drop, a
        defaultdict of per-bucket counts and a defaultdict of per-bucket alias
        lists. Buckets: "not_in_wikipedia" and "removed_filter" (removed),
        "grt_min_alias_cnt" and "lt_min_seen" (kept, recorded for inspection).
    """
    aliases_to_remove = set()
    cnts = defaultdict(int)
    grps = defaultdict(list)

    def _record(bucket, name):
        # Keep the per-bucket tally and the per-bucket alias list in step.
        cnts[bucket] += 1
        grps[bucket].append(name)

    for alias in tqdm(curr_aliases):
        seen_in_text = alias in alias_text_counts

        # Case 1: the alias never occurs as an alias in Wikipedia.
        if alias not in alias_counts:
            # Keep it when the phrase shows up in text fewer than min_seen times
            # (too little evidence to decide) or when it has exactly one candidate
            # (a fairly unique alias).
            if (seen_in_text and alias_text_counts[alias] < min_seen) or len(curr_aliases[alias]) == 1:
                continue
            _record("not_in_wikipedia", alias)
            aliases_to_remove.add(alias)
            continue

        # Case 2: labelled as an alias but never counted in text (the original
        # note attributes this to max_alias_len truncation and weak labels): keep.
        if not seen_in_text:
            continue

        # Case 3: ratio filter -- only aliases that are rarely an alias relative
        # to how often the phrase appears in text are considered any further.
        if not (get_norm_value(alias) < norm_threshold):
            continue
        if not (alias_text_counts[alias] > min_seen):
            # Phrase is not common enough in text to justify removal.
            _record("lt_min_seen", alias)
        elif alias_counts[alias] < min_alias_count:
            aliases_to_remove.add(alias)
            _record("removed_filter", alias)
        else:
            # Labelled often enough in absolute terms: keep despite the low ratio.
            _record("grt_min_alias_cnt", alias)

    return aliases_to_remove, cnts, grps

In [28]:
# Run the alias-removal strategy selected by DUMP_METHOD.
if DUMP_METHOD == "filter":
    print("Using stats to filter")
    aliases_to_remove, cnts, grps = get_aliases_to_remove(curr_aliases)
    print(len(aliases_to_remove))
    print(ujson.dumps(cnts, indent=4))
elif DUMP_METHOD == "wikidata":
    print("Using Wikidata to filter")
    aliases_to_remove = get_aliases_by_wikidata(curr_aliases, wikidata_aliases)
    # This strategy produces no per-bucket statistics; define empty ones so the
    # debug cell that iterates over `grps` still runs.
    cnts, grps = {}, {}
else:
    # Previously any other value (e.g. a typo) silently ran the Wikidata path.
    raise ValueError(f"Unknown DUMP_METHOD {DUMP_METHOD!r}; expected 'filter' or 'wikidata'")

print(f"Will remove {len(aliases_to_remove)} out of {len(curr_aliases)}")

  1%|          | 22893/4343182 [00:00<00:18, 228926.40it/s]

Using stats to filter


100%|██████████| 4343182/4343182 [00:04<00:00, 934085.91it/s] 

124330
{
    "removed_filter": 4326,
    "lt_min_seen": 6286,
    "not_in_wikipedia": 120004
}
Will remove 124330 out of 4343182





Sanity checks on the filter step. 

In [20]:
# sample what aliases are getting removed
num_to_sample = 50
# Sort first (set iteration order is not stable across kernels) and use a seeded
# generator so a re-run prints the same sample. Draw without replacement so no
# alias is printed twice, and cap the draw size so small/empty sets do not raise.
sample_rng = np.random.default_rng(0)
removed_sorted = sorted(aliases_to_remove)
num_drawn = min(num_to_sample, len(removed_sorted))
sampled_aliases = sample_rng.choice(removed_sorted, size=num_drawn, replace=False) if num_drawn else []
for alias in sampled_aliases:
    print(alias)

цезиас мец
10 xronia mazi
rockfunk
ustajikistan relations
ak47su
body transistor
gare dolten
henry richardson cricketer born 1846
justice william o douglas
woollcott alexander
mtv swedish tv channel
sir michael atiyah
west los angeles ca
kfar sava
hunters island
french concession of shanghai
roadshow films
2015 yale bulldogs football
black sea region turkey
henry gilroy baseball
beckham putra nugraha
korbr
anne dormer lady hungerford
saint torpes of pisa
iso 639zir
ballets by marius petipa
marine cadets
pacific northwest bell telephone company
avid d weinberger
5 x 5 cube
draftzayn africa
daphne anne caruana galizia
national bishop for torres strait people
catchment water
notre dame fighting irish football 1985
lockheed hudson iva
englishborn
united statesman
gadaræ
trygve martin bratteli
jakobstadt
albanian national liberation front
sirkesh
cuisine of boston
still standing tv series
norske skogindustrier asa
klein charles
mpeg2 layer ii
poaching of white rhinoceroses
the henegar cente

In [110]:
# Spot checks on aliases_to_remove: each entry is (alias, expected_to_be_removed).
sanity_checks = [
    ('themselves', True),
    ('dolittle', False),
    ('us', False),
    ('s', True),
    ('is', True),
    ('also', True),
    ('in a world', True),
    ('of', True),
    ('the', True),
    ('by year', True),
    ('apoptosis', False),
    ('england', False),
]
for s, bool_val in sanity_checks:
    # The failure message shows the alias, the expectation and the actual membership.
    is_removed = s in aliases_to_remove
    assert is_removed is bool_val, f'{s} {bool_val} {s in aliases_to_remove}'

In [115]:
# debug the norm values to set different thresholds
t = 'japan'
if t in aliases_to_remove:
    print("WILL REMOVE")
else:
    print("WILL KEEP")
candidates = curr_aliases.get(t)
print(candidates)
# `types_coarse` is not defined by any cell in this notebook, so on a fresh kernel
# the lookup raised NameError; indexing curr_aliases[t] also raised KeyError for an
# alias that is not in the map. Only do the type lookup when both are available.
if candidates and "types_coarse" in globals():
    # Types of the first candidate listed for the alias.
    print(types_coarse.get_types(candidates[0][0]))
else:
    print("Skipping type lookup (alias has no candidates or types_coarse is not loaded)")
print("NORM", get_norm_value(t, verbose=True))
# Report the first bookkeeping bucket (if any) that recorded this alias.
for k in grps:
    if t in grps[k]:
        print(k)
        break

WILL KEEP
[['Q17', 72356], ['Q161652', 5851], ['Q5287', 5438], ['Q188712', 5093], ['Q184963', 4675], ['Q476215', 4653], ['Q219712', 3382], ['Q205662', 2752], ['Q1146127', 2364], ['Q170566', 2231], ['Q848647', 869], ['Q388232', 747], ['Q696251', 578], ['Q731647', 575], ['Q850204', 571], ['Q1122433', 562], ['Q234138', 498], ['Q179103', 457], ['Q831454', 408], ['Q575453', 368], ['Q130436', 359], ['Q962145', 346], ['Q736311', 340], ['Q231425', 287], ['Q603399', 271], ['Q210688', 258], ['Q533312', 249], ['Q3658577', 231], ['Q579842', 197], ['Q841337', 180]]
['<yagoGeoEntity>']
# times occurs as alias: 76417
# times occurs in text: 558866
NORM 0.1367358186041019


Remove aliases and save new candidate mapping.

In [29]:
# Filtered alias map: every alias that was not marked for removal, in the
# original iteration order and with its candidates untouched.
new_aliases = {
    alias: curr_aliases[alias]
    for alias in list(curr_aliases)
    if alias not in aliases_to_remove
}
# Re-number the surviving aliases 0..N-1 in that same order.
new_alias_idx = dict(zip(new_aliases, range(len(new_aliases))))
print(len(new_aliases), "VS", len(curr_aliases))

4218852 VS 4343182


In [30]:
# Output goes next to the entity mappings loaded at the top of the notebook,
# so the directory is expected to exist already.
new_dir = root_dir / 'entity_db/entity_mappings'
new_alias_file = new_dir / 'alias2qids_filt.json'
new_aliasidx_file = new_dir / 'alias2id_filt.json'

# Write the filtered alias map first, then its alias -> index table.
for out_path, payload in ((new_alias_file, new_aliases), (new_aliasidx_file, new_alias_idx)):
    with open(out_path, 'w') as f:
        ujson.dump(payload, f)

print(f"Saved alias mapping at {new_alias_file} and id to {new_aliasidx_file}")

Saved alias mapping at /dfs/scratch0/lorr1/projects/bootleg-data/data/squad_0310/entity_db/entity_mappings/alias2qids_filt.json and id to /dfs/scratch0/lorr1/projects/bootleg-data/data/squad_0310/entity_db/entity_mappings/alias2id_filt.json
