In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os, ujson
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
from IPython.core.display import display, HTML, Markdown
from bootleg.symbols.entity_symbols import EntitySymbols
from bootleg.symbols.type_symbols import TypeSymbols
from bootleg.symbols.kg_symbols import KGSymbols
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)
# Register tqdm's pandas integration.
# NOTE(review): no pandas call is visible in this notebook — confirm this is still needed.
tqdm.pandas()
# Inject a CSS rule that sets elements with class "container" to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
# Entity database location; the entity mappings live in a sub-directory of it.
input_dir = Path('/dfs/scratch0/lorr1/projects/bootleg-data/data/korealiases_title_0122/')
entity_dump = EntitySymbols(load_dir=input_dir / "entity_db/entity_mappings")
# Alias -> candidate list. Later cells read the QID from position 0 of each candidate.
a2q = entity_dump.get_alias2qids()
# Directory holding the type vocabulary / type assignment files.
emb_dir = Path('/dfs/scratch0/lorr1/projects/bootleg-data/embs')
types_wd = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="wikidatatitle_to_typeid_1229.json", type_file="wikidata_types_1229.json")
# Alternative symbol tables, currently disabled:
# types_hy = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="hyena_vocab.json", type_file="hyena_types_1229.json")
# types_rel = TypeSymbols(entity_dump, emb_dir, max_types=50, type_vocab_file="relation_to_typeid_1229.json", type_file="kg_relation_types_1229.json")
# kg_syms = KGSymbols(entity_dump, emb_dir, "kg_adj_1229.txt")
# Read the QID -> title map inside a context manager so the file handle is closed
# immediately instead of being left open until garbage collection.
with open(input_dir / "entity_db/entity_mappings/qid2title.json") as qid2title_f:
    q2title = ujson.load(qid2title_f)
# Reverse lookup (title -> QID). If several QIDs share a title, only the last one
# encountered while iterating q2title is kept.
title2q = {v:k for k,v in q2title.items()}

Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/wikidata_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/wikidata_types_1229.json: 100%|██████████| 5832699/5832699 [00:14<00:00, 398419.52it/s]


In [30]:
# Output directory for the mined alias records; created (with parents) if missing.
save_dir = Path('/dfs/scratch0/lorr1/projects/bootleg/notebooks/guidability/saved_aug_metadata/')
save_dir.mkdir(parents=True, exist_ok=True)
# JSON file the final cell writes the results to.
save_file = save_dir / "korealiases_title_1229_football_aliases.json"

In [31]:
def idxs_types_in_cands(all_cand_types, filter_keywords):
    """Return the positions of candidates that carry a matching type.

    Args:
        all_cand_types: one iterable of type-name strings per candidate.
        filter_keywords: keywords to look for; a keyword matches a type when
            it occurs anywhere inside the type name (substring test).

    Returns:
        Set of indexes into ``all_cand_types`` whose candidate has at least
        one type containing at least one keyword.
    """
    return {
        position
        for position, type_names in enumerate(all_cand_types)
        if any(
            keyword in type_name
            for type_name in type_names
            for keyword in filter_keywords
        )
    }

def idxs_title_in_cands(all_cand_titles, filter_func):
    """Return the positions of candidate titles accepted by ``filter_func``.

    Args:
        all_cand_titles: iterable of candidate titles.
        filter_func: callable applied to each title; a truthy result selects
            that title's position.

    Returns:
        Set of indexes into ``all_cand_titles`` for the accepted titles.
    """
    return {
        position
        for position, title in enumerate(all_cand_titles)
        if filter_func(title)
    }

def is_team_title(title):
    """Tell whether ``title`` looks like a national football team title.

    The check is case-insensitive and substring based: the title must contain
    every required word and none of the excluded phrases.
    """
    lowered = title.lower()
    required = ("national", "football", "team")
    excluded = ("competition", "season", "cup", "national team nomenclature", "teamsters")
    # Reject as soon as one required word is missing.
    if any(word not in lowered for word in required):
        return False
    return not any(phrase in lowered for phrase in excluded)

In [35]:
# Keyword lists matched as substrings against candidate type names.
# `type_add_in_cands` is only referenced by the commented-out type-based selector below.
type_add_in_cands = ["airport"]
type_mistake_in_cands = ["country", "city"]

# alias -> list of records pairing a team-title candidate ("to_add") with a
# country/city-typed candidate ("mistake") of the same alias.
d = {}
for alias in tqdm(a2q):
    # Skip aliases of three characters or fewer (treated as acronyms).
    if len(alias) <= 3:
        continue
    cand_qids = [cand[0] for cand in a2q[alias]]
    all_cand_types = [types_wd.get_types(qid) for qid in cand_qids]
    all_cand_titles = [q2title[qid] for qid in cand_qids]
    team_idxs = idxs_title_in_cands(all_cand_titles, is_team_title)
    # team_idxs = idxs_types_in_cands(all_cand_types, type_add_in_cands)
    place_idxs = idxs_types_in_cands(all_cand_types, type_mistake_in_cands)
    # Both kinds of candidate must be present for this alias ...
    if not team_idxs or not place_idxs:
        continue
    # ... and no single candidate may fall into both groups.
    if not team_idxs.isdisjoint(place_idxs):
        continue
    # One record for every (mistake, to_add) combination.
    d[alias] = [
        {
            "to_add": cand_qids[team_i],
            "mistake": cand_qids[place_i],
            "to_add_title": q2title[cand_qids[team_i]],
            "mistake_title": q2title[cand_qids[place_i]],
        }
        for place_i in place_idxs
        for team_i in team_idxs
    ]

100%|██████████| 2441248/2441248 [00:18<00:00, 131868.28it/s]


In [36]:
# Print every alias followed by its list of (to_add, mistake) records.
for alias, records in d.items():
    print(alias, records)

scotland [{'to_add': 'Q34044', 'mistake': 'Q22', 'to_add_title': 'Scotland national football team', 'mistake_title': 'Scotland'}, {'to_add': 'Q1812121', 'mistake': 'Q22', 'to_add_title': 'Scotland national under-21 football team', 'mistake_title': 'Scotland'}, {'to_add': 'Q917512', 'mistake': 'Q22', 'to_add_title': "Scotland women's national football team", 'mistake_title': 'Scotland'}, {'to_add': 'Q4140015', 'mistake': 'Q22', 'to_add_title': 'Scotland national under-19 football team', 'mistake_title': 'Scotland'}, {'to_add': 'Q4127960', 'mistake': 'Q22', 'to_add_title': 'Scotland national football B team', 'mistake_title': 'Scotland'}, {'to_add': 'Q34044', 'mistake': 'Q230791', 'to_add_title': 'Scotland national football team', 'mistake_title': 'Kingdom of Scotland'}, {'to_add': 'Q1812121', 'mistake': 'Q230791', 'to_add_title': 'Scotland national under-21 football team', 'mistake_title': 'Kingdom of Scotland'}, {'to_add': 'Q917512', 'mistake': 'Q230791', 'to_add_title': "Scotland wome

In [37]:
# Persist the alias -> candidate-pair records as JSON.
# ujson mirrors the stdlib json API (`dump`/`dumps`/`load`/`loads`); `ujson.save`
# does not exist, so the previous call raised AttributeError after the output
# file had already been opened for writing and truncated.
with open(save_file, "w") as out_f:
    ujson.dump(d, out_f)