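### Filter the type list

Keep only the Wikidata types that occur more than `TYPE_FILT_STRENGTH` times in the training data, re-index the surviving types, and save the filtered mappings.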

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import json
import glob
import jsonlines
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Occurrence threshold for the type filter: a type is kept as a candidate only
# when its count over the training data is strictly greater than this value.
TYPE_FILT_STRENGTH = 100

In [3]:
# `display` is imported from IPython.display only; importing it from
# IPython.core.display is deprecated.
from IPython.display import HTML, Markdown, display

# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container { width:90% !important; }</style>"))


def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    display(Markdown(string))


# None turns off column-width truncation. The previous sentinel (-1) was
# deprecated in pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

In [5]:
def _load_json(path):
    """Parse the JSON file at ``path`` and close the handle afterwards."""
    with open(path, "r") as in_f:
        return json.load(in_f)


# These are generated by Wikidata extractor
typetitle2typeid = _load_json("/dfs/scratch0/lorr1/projects/bootleg/embs/wikidatatitle_to_typeid_1229_nofilt.json")
typetitle2typeqid = _load_json("/dfs/scratch0/lorr1/projects/bootleg/embs/wikidatatitle_to_typeqid_1229_nofilt.json")

# Inverse lookups. If two titles share the same value, the later title wins.
typeqid2typetitle = {v: k for k, v in typetitle2typeqid.items()}
typeid2typetitle = {v: k for k, v in typetitle2typeid.items()}
print(typeqid2typetitle.get("Q24258416"))

# Entity QID -> list of type ids, before any filtering.
# A positive max_types keeps only the first max_types entries per entity;
# the default of -1 leaves every list untouched.
max_types = -1
qid2types_nofilt = _load_json("/dfs/scratch0/lorr1/projects/bootleg/embs/wikidata_types_1229_nofilt.json")
if max_types > 0:
    for qid in list(qid2types_nofilt.keys()):
        qid2types_nofilt[qid] = qid2types_nofilt[qid][:max_types]

entqid2title = _load_json("/lfs/raiders8/0/lorr1/data/wiki_dump/alias_filtered_sentences/entity_db/entity_mappings/qid2title.json")
print(f"Number Types {len(typetitle2typeqid)} vs {len(typetitle2typeid)}. Number QIDs {len(entqid2title)}")

Number Types 178656 vs 178656. Number QIDs 5832699


In [8]:
# Tally how often each type id appears across all mentions in the training
# shards. A mention contributes one count for every type of its QID.
train_files = glob.glob("/lfs/raiders8/0/lorr1/data/wiki_dump/alias_filtered_sentences/*.jsonl")

type_cnt = defaultdict(int)

for shard_path in tqdm(train_files):
    with jsonlines.open(shard_path, "r") as reader:
        for document in reader:
            for sentence in document["sentences"]:
                for mention_qid in sentence["qids"]:
                    # QIDs without a type entry contribute nothing.
                    for type_id in qid2types_nofilt.get(mention_qid, []):
                        type_cnt[type_id] += 1

100%|██████████| 255/255 [16:54<00:00,  3.98s/it]


In [9]:
# Keep the type ids whose training count strictly exceeds the threshold.
candidate_types = {
    type_id
    for type_id, occurrences in type_cnt.items()
    if occurrences > TYPE_FILT_STRENGTH
}

print(len(type_cnt), "VS", len(candidate_types))

69239 VS 23413


In [12]:
# Spot-check whether a few type titles made it into candidate_types.
for check_title in ('season', 'calendar month', 'cocktail'):
    print(typetitle2typeid[check_title] in candidate_types)

True
True
True


In [15]:

# to_add = set()
# for typeid in typeid2typename_wd:
#     qid_type = typeid2typename_wd[typeid]
#     typename = typename2title_wd.get(qid_type, qid_type)
#     if "season" in typename:
#         to_add.add(typeid)
#     if "month" in typename:
#         to_add.add(typeid)
#     if "alcohol" in typename:
#         to_add.add(typeid)
#     if "beverage" in typename:
#         to_add.add(typeid)
        
# print(len(to_add))
# candidate_types = candidate_types.union(to_add)

# Coverage of the candidate type set over every entity in entqid2title.
total_qids = len(entqid2title)
no_qid = 0                   # entities with no types at all
no_qid_in_set = 0            # entities whose types are all outside candidate_types
qids_not_in_set = Counter()  # type id -> number of uncovered entities carrying it
qid_no_types = Counter()     # QIDs of the entities with no types
for ent_qid in tqdm(entqid2title):
    ent_types = set(qid2types_nofilt.get(ent_qid, []))
    if not ent_types:
        no_qid += 1
        qid_no_types[ent_qid] += 1
    elif candidate_types.isdisjoint(ent_types):
        no_qid_in_set += 1
        qids_not_in_set.update(ent_types)

100%|██████████| 5832699/5832699 [00:32<00:00, 180991.27it/s]


In [16]:
# Report coverage of the candidate type set.
# The 20 most frequent type ids among uncovered entities, computed once.
top_missing_types = qids_not_in_set.most_common(20)

print(f"No mapping {no_qid} ({no_qid/total_qids}), Has type {total_qids-no_qid} ({(total_qids-no_qid)/total_qids}), No type in set {no_qid_in_set} ({no_qid_in_set/total_qids})")
print(f"Final coverage {(total_qids - no_qid - no_qid_in_set)/total_qids}")
# qids_not_in_set is keyed by type id, so its length is a number of types.
print(f"Number of types not in set {len(qids_not_in_set)}")
print(f"Sample of no Type QIDs {list(qid_no_types.keys())[:15]}")
print(top_missing_types)
# The list is empty when every typed entity is covered; skip the lookup then.
if top_missing_types:
    print(typeid2typetitle[top_missing_types[0][0]])
print(list(typeid2typetitle.keys())[:5])
print([typeid2typetitle[type_id] for type_id, _ in top_missing_types])

No mapping 583765 (0.10008488351619035), Has type 5248934 (0.8999151164838096), No type in set 94723 (0.01623999455483645)
Final coverage 0.8836751219289732
Number of qids not in set 31002
Sample of no Type QIDs ['Q7316648', 'Q18348307', 'Q65954309', 'Q5462977', 'Q23879719', 'Q18206520', 'Q7234383', 'Q5185891', 'Q7662682', 'Q5358134', 'Q60790508', 'Q23017291', 'Q5603486', 'Q30674365', 'Q5020661']
[(6077, 1945), (310, 1214), (2305, 1052), (38377, 821), (4624, 652), (20136, 509), (7334, 442), (7989, 341), (2866, 300), (27222, 245), (27508, 231), (27554, 206), (17185, 195), (26189, 193), (21075, 190), (19332, 183), (27625, 176), (21234, 168), (20562, 166), (6687, 165)]
canton of France
[0, 1, 2, 3, 4]
['canton of France', 'nation at the World Championships in Athletics', 'khutor', 'Wiktionary redirect', 'canton of France (until 2015)', 'Wikimedia list of songs by performer', 'township of Nebraska', 'tennis qualification event', 'quarter/commune of Cambodia', 'Australian rules football tea

In [18]:
# Reindex types
# Surviving type ids are renumbered consecutively from 1, in ascending order
# of their old id. (Presumably 0 is reserved, e.g. for padding - TODO confirm.)
oldtypeid2newtypeid = {}
typetitle2newtypeid = {}
typetitle2newtypeqid = {}
typeq2newtypeid = {}
for i, old_typeid in enumerate(sorted(candidate_types), start=1):
    oldtypeid2newtypeid[old_typeid] = i
    # Bootleg ID -> type wikidata QID
    type_title = typeid2typetitle[old_typeid]
    type_qid = typetitle2typeqid[type_title]
    if type_title in typetitle2newtypeid:
        # Title already taken: disambiguate by appending the type QID.
        print("SWAP", type_title)
        type_title += f"_{type_qid}"
        assert type_title not in typetitle2newtypeid
    assert type_qid not in typeq2newtypeid
    typetitle2newtypeid[type_title] = i
    typeq2newtypeid[type_qid] = i
    typetitle2newtypeqid[type_title] = type_qid
# Report how many types were remapped. The old message printed the running
# index, which was one past the last id assigned.
print(f"Remapping {len(oldtypeid2newtypeid)} types")

Remapping 23413 types


In [21]:
# Write the filtered, re-indexed type data next to the unfiltered inputs.
orig_file = "/dfs/scratch0/lorr1/projects/bootleg/embs/wikidata_types_1229_nofilt.json"
new_file = "/dfs/scratch0/lorr1/projects/bootleg/embs/wikidata_types_1229.json"
new_typetitle2id_file = "/dfs/scratch0/lorr1/projects/bootleg/embs/wikidatatitle_to_typeid_1229.json"
new_typetitle2qid_file = "/dfs/scratch0/lorr1/projects/bootleg/embs/wikidatatitle_to_typeqid_1229.json"

# json.dump is the serializer; the json module has no `save` function.
with open(new_typetitle2id_file, "w") as out_f:
    json.dump(typetitle2newtypeid, out_f)

with open(new_typetitle2qid_file, "w") as out_f:
    json.dump(typetitle2newtypeqid, out_f)

print(f"Saving {len(candidate_types)} fine types")

# Read the input fully before opening the output, so a failed read does not
# leave a truncated output file behind.
with open(orig_file, "r") as in_f:
    orig_qid2type = json.load(in_f)

# Every entity in entqid2title gets an entry. Entities with no surviving type
# map to an empty list.
new_qid2type = {}
for qid in tqdm(entqid2title.keys()):
    types = orig_qid2type.get(qid, [])
    new_qid2type[qid] = sorted(
        oldtypeid2newtypeid[t] for t in candidate_types.intersection(types)
    )
lines_out = len(new_qid2type)

with open(new_file, "w") as out_f:
    json.dump(new_qid2type, out_f)

print("DONE")

Saving 23413 fine types


100%|██████████| 5832699/5832699 [01:09<00:00, 83633.32it/s] 


DONE
