In [20]:
import numpy as np
from tqdm import tqdm
import os
import pickle
import re
from pathlib import Path
from collections import defaultdict
import ujson

Load the entity symbols (the source of the alias map to filter) and the Hyena type symbols.

In [82]:
from bootleg.symbols.type_symbols import TypeSymbols
from bootleg.symbols.entity_symbols import EntitySymbols
# Directory that holds the Hyena type vocab / type-assignment files.
emb_dir = "/dfs/scratch0/lorr1/projects/bootleg-data/embs"

# Entity symbols for the wiki_title_0122 data; the alias map filtered below is read from here.
entity_dump = EntitySymbols(
    load_dir="/dfs/scratch0/lorr1/projects/bootleg-data/data/wiki_title_0122/entity_db/entity_mappings"
)

# Hyena type symbols, plus the coarse variant of the same type files.
types_hy = TypeSymbols(
    entity_dump,
    emb_dir,
    max_types=3,
    type_vocab_file="hyena_vocab.json",
    type_file="hyena_types_1229.json",
)
types_coarse = TypeSymbols(
    entity_dump,
    emb_dir,
    max_types=3,
    type_vocab_file="hyena_coarse_vocab.json",
    type_file="hyena_types_coarse_1229.json",
)

Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_1229.json: 100%|██████████| 5832699/5832699 [00:16<00:00, 344451.56it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_coarse_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_coarse_1229.json: 100%|██████████| 5832699/5832699 [00:16<00:00, 349699.66it/s]


In [83]:
# Alias map taken from the loaded entity symbols. Judging by how it is used further down
# (`curr_aliases[alias][0][0]` is handed to `types_coarse.get_types`), each alias maps to a
# list of candidates whose first element is a QID.
curr_aliases = entity_dump.get_alias2qids()

Load type mappings for adding back in countries

Load the count files for all of Wikipedia. These were computed with `compute_statistics.py` (in utils/preprocessing) over the merged train, dev, and test data file.

In [84]:
# Number of times each alias phrase occurs in the text across ALL of Wikipedia.
# The `with` block closes the file handle once the JSON has been parsed.
with open('/dfs/scratch0/lorr1/projects/bootleg-data/data/all_wiki_title_0122/stats/alias_text_counts.json') as text_counts_file:
    alias_text_counts = ujson.load(text_counts_file)

# Number of times each alias occurs as an alias across ALL of Wikipedia.
with open('/dfs/scratch0/lorr1/projects/bootleg-data/data/all_wiki_title_0122/stats/alias_counts.json') as alias_counts_file:
    alias_counts = ujson.load(alias_counts_file)

In [93]:
# Anchored patterns for the type labels that should be kept (currently only the
# wordnet "person" label). No active code in this notebook consults this list.
regexes_of_types = list(map(re.compile, ("^<wordnet_person_100007846>$",)))

Simple function to find aliases to remove based on the count files above.

In [107]:
def get_norm_value(alias, verbose=False):
    """
    Ratio of how often `alias` is used as an alias to how often the phrase occurs in text.

    Reads the module-level `alias_counts` and `alias_text_counts` dictionaries.
    Returns -1 when the phrase has no entry in `alias_text_counts`; a missing
    `alias_counts` entry is treated as 0. With `verbose=True` both raw counts
    are printed first.
    """
    times_as_alias = alias_counts.get(alias, 0)
    if verbose:
        print('# times occurs as alias:', times_as_alias)
        print('# times occurs in text:', alias_text_counts.get(alias, 0))
    # No text count available -> sentinel instead of a ratio.
    if alias not in alias_text_counts:
        return -1
    return times_as_alias / alias_text_counts[alias]

def get_aliases_to_remove(curr_aliases, norm_threshold=0.017, min_seen=500, min_alias_count=10000):
    """
    Pick the aliases that are common phrases in text but rarely (or never) used as aliases,
    e.g. because they are rare or mislabeled (such as the band "themselves").

    Reads the module-level `alias_counts` / `alias_text_counts` dictionaries and
    `get_norm_value`.

    Args:
        curr_aliases: mapping from alias to its list of candidates.
        norm_threshold: aliases whose norm value is below this are considered for removal.
        min_seen: text-occurrence count used as the "enough evidence" cutoff.
        min_alias_count: aliases used as an alias at least this often are never removed
            by the ratio filter.

    Returns:
        (aliases_to_remove, cnts, grps): the set of aliases to drop, a defaultdict of
        per-bucket tallies, and a defaultdict of per-bucket alias lists. Buckets are
        "not_in_wikipedia" and "removed_filter" (both removed) plus "lt_min_seen" and
        "grt_min_alias_cnt" (both kept, recorded for inspection only).
    """
    to_drop = set()
    bucket_sizes = defaultdict(int)
    bucket_members = defaultdict(list)

    def _record(bucket, name, drop=False):
        # File `name` under `bucket`, optionally marking it for removal.
        bucket_sizes[bucket] += 1
        bucket_members[bucket].append(name)
        if drop:
            to_drop.add(name)

    for alias in tqdm(curr_aliases):
        seen_in_text = alias in alias_text_counts

        # Case 1: the alias is never used as an alias in Wikipedia.
        if alias not in alias_counts:
            # Seen in text, but too few times to make a decision -> keep.
            too_rare_to_judge = seen_in_text and alias_text_counts[alias] < min_seen
            # Otherwise keep it only when it has exactly one candidate (a fairly unique alias);
            # everything else is dropped. This includes aliases with no text count at all.
            # (A former exemption for aliases whose first candidate is a person type is not active.)
            if not too_rare_to_judge and len(curr_aliases[alias]) != 1:
                _record("not_in_wikipedia", alias, drop=True)
            continue

        # Case 2: used as an alias but absent from the text counts (max alias length and
        # weak labels can cause this) -> nothing to compare against, keep.
        if not seen_in_text:
            continue

        # Case 3: ratio filter. Only aliases that are rarely an alias relative to their
        # text frequency are examined any further.
        if not (get_norm_value(alias) < norm_threshold):
            continue

        common_in_text = alias_text_counts[alias] > min_seen
        if not common_in_text:
            # Not a common enough phrase to justify removal.
            _record("lt_min_seen", alias)
        elif alias_counts[alias] < min_alias_count:
            # Common phrase, uncommon alias -> remove.
            _record("removed_filter", alias, drop=True)
        else:
            # Low ratio, but still very frequently an alias in absolute terms -> keep.
            _record("grt_min_alias_cnt", alias)

    return to_drop, bucket_sizes, bucket_members

In [108]:
# Run the filter over the full alias map using the function's default thresholds.
aliases_to_remove, cnts, grps = get_aliases_to_remove(curr_aliases)
# Total number of aliases flagged for removal.
print(len(aliases_to_remove))
# Per-bucket tallies; note that not every bucket corresponds to a removal.
print(ujson.dumps(cnts, indent=4))

100%|██████████| 15290555/15290555 [00:20<00:00, 750835.31it/s]

88058
{
    "removed_filter":50167,
    "grt_min_alias_cnt":5,
    "lt_min_seen":57584,
    "not_in_wikipedia":37891
}





Sanity checks on the filter step. 

In [109]:
# Sample which aliases are getting removed.
# Draw WITHOUT replacement so the same alias is never printed twice, and cap the
# sample size so the cell still works when fewer than `num_to_sample` aliases were removed.
num_to_sample = 50
removed_list = list(aliases_to_remove)
sample_size = min(num_to_sample, len(removed_list))
for alias in np.random.choice(removed_list, sample_size, replace=False):
    print(alias)

july 1918
arthur gardiner
shopper
pleasant valley portland oregon
race two
multiple unit
onboard
たつのり
you cant hurry love
gowharan rural district hormozgan province
her strange desire
prophet s
to hell
snood anatomy
the texas panhandler
1963 film
なかむら ゆり
the american revolutionary war
the orphans
against new zealand
womens synchronized trampoline
jana gana mana film
たかとみ
drug stores
saint patricks school
guillotine choke
list of linyphiidae species
ibn abi talib
lets turn back the years
locksmith comics
23
aaron in islam
canio
6400
マワタリ
sanshiwu
the time
16thcentury
pusillus
singin
5th battalion
detonated
sown
arkansas house
una vez mas leslie shaw song
translate
h2ac8
may 1932
duo onry ozzborn album
duties


In [110]:
# Spot-check the filter: every entry pairs an alias with whether it is expected
# to be present in `aliases_to_remove`.
sanity_checks = [
    ('themselves', True),
    ('dolittle', False),
    ('us', False),
    ('s', True),
    ('is', True),
    ('also', True),
    ('in a world', True),
    ('of', True),
    ('the', True),
    ('by year', True),
    ('apoptosis', False),
    ('england', False),
]
for s, bool_val in sanity_checks:
    is_removed = s in aliases_to_remove
    # Message layout: alias, expected membership, actual membership.
    assert is_removed is bool_val, f'{s} {bool_val} {is_removed}'

In [115]:
# Debug the norm values to set different thresholds.
t = 'japan'
if t in aliases_to_remove:
    print("WILL REMOVE")
else:
    print("WILL KEEP")
candidates = curr_aliases.get(t)
print(candidates)
# Only look up types when the alias actually has a candidate: indexing an alias that is
# missing from the map would raise before the norm value below gets printed.
if candidates:
    print(types_coarse.get_types(candidates[0][0]))
else:
    print(f"No candidates for alias {t!r}")
print("NORM", get_norm_value(t, verbose=True))
# Report the first bookkeeping group (if any) that the alias was filed under.
for k in grps:
    if t in grps[k]:
        print(k)
        break

WILL KEEP
[['Q17', 72356], ['Q161652', 5851], ['Q5287', 5438], ['Q188712', 5093], ['Q184963', 4675], ['Q476215', 4653], ['Q219712', 3382], ['Q205662', 2752], ['Q1146127', 2364], ['Q170566', 2231], ['Q848647', 869], ['Q388232', 747], ['Q696251', 578], ['Q731647', 575], ['Q850204', 571], ['Q1122433', 562], ['Q234138', 498], ['Q179103', 457], ['Q831454', 408], ['Q575453', 368], ['Q130436', 359], ['Q962145', 346], ['Q736311', 340], ['Q231425', 287], ['Q603399', 271], ['Q210688', 258], ['Q533312', 249], ['Q3658577', 231], ['Q579842', 197], ['Q841337', 180]]
['<yagoGeoEntity>']
# times occurs as alias: 76417
# times occurs in text: 558866
NORM 0.1367358186041019


In [105]:
# No active code in `get_aliases_to_remove` files aliases under this group, so a fresh run
# shows an empty list; any non-empty output comes from an earlier version of the function.
# `.get` is used so that inspecting the group does not insert an empty key into `grps`.
grps.get("kept_person_wikidata", [])[:200]

['princeps',
 'timbaland thursday',
 '796',
 'francaise',
 '615',
 '842',
 '19871988',
 'notable works',
 '19861987',
 'music directors',
 'michel louis christophe roch gilbert motier marquis de la fayette',
 'as well as',
 'known in japan as',
 'predecessor',
 'alternatively',
 'pharaohs daughter',
 'caltex records releases',
 'television and film',
 'womens land army',
 'governor of the state of rhode island and providence plantations',
 'the cruel',
 'the reformer',
 'ov',
 '663',
 '569',
 'kim jongkun',
 '13 wins',
 'womens world championship',
 'fut',
 'previous club',
 'cien',
 '22 goals',
 'volodymyr tkachenko',
 'juushirou',
 'contributors',
 '477',
 '750',
 '594',
 'womens prison',
 'spotted',
 'maksim lepskiy',
 'sg wanna be',
 'official caucus site',
 '100s50s',
 'top score',
 'best bowling',
 'the bold',
 'volume 5',
 '399',
 '362',
 'pure opm classics',
 'the paul',
 'earl of calendar',
 'earl of calender',
 'general winstons daughter',
 'television episodes',
 'argent a l

Remove aliases and save new candidate mapping.

In [112]:
# Keep every alias (with its candidate list untouched) that was not flagged for removal.
new_aliases = {
    alias: candidates
    for alias, candidates in curr_aliases.items()
    if alias not in aliases_to_remove
}
print(len(new_aliases), "VS", len(curr_aliases))

15202497 VS 15290555


In [113]:
# Same directory the entity mappings were loaded from; the filtered map is written
# next to them under its own file name.
new_dir = '/dfs/scratch0/lorr1/projects/bootleg-data/data/wiki_title_0122/entity_db/entity_mappings'
# os.makedirs(new_dir, exist_ok=True)
new_alias_file = f'{new_dir}/alias2qids_wiki_filt.json'

# ujson serializes to a file object with `dump`; it has no `save` function, so the
# previous call raised AttributeError after the output file had already been truncated.
with open(new_alias_file, 'w') as f:
    ujson.dump(new_aliases, f)