In [2]:
import json
import os
from tqdm import tqdm
from multiprocessing.pool import Pool

In [2]:
# Root directories of the two English Wikipedia dumps being compared:
# the 2022-06-20 dump ("train") and the 2023-03-01 dump ("sup").
# NOTE(review): only the train path ends in "output/" — confirm both directory layouts.
train_data_dir = "/harddisk/data/nlp_data/kb/wikipedia/20220620/enwiki-20220620/output/"
sup_data_dir = "/harddisk/data/nlp_data/kb/wikipedia/20230301/enwiki-20230301"

In [15]:
# Read the redirect table ("redirect.json") stored next to each dump.
train_redirect_path = os.path.join(train_data_dir, "redirect.json")
with open(train_redirect_path) as f:
    train_redirect_map = json.load(f)

sup_redirect_path = os.path.join(sup_data_dir, "redirect.json")
with open(sup_redirect_path) as f:
    sup_redicrect_map = json.load(f)

In [3]:
# Read the title map ("title2id.json") stored next to each dump.
train_title_path = os.path.join(train_data_dir, "title2id.json")
with open(train_title_path) as f:
    train_title_map = json.load(f)

sup_title_path = os.path.join(sup_data_dir, "title2id.json")
with open(sup_title_path) as f:
    sup_title_map = json.load(f)

In [4]:
# Number of entries in each title map, before redirects are resolved.
len(train_title_map), len(sup_title_map)

(16646038, 17063901)

In [17]:
# Canonicalise every title: a title listed in the redirect map is replaced by
# its redirect target, any other title is kept as is. Duplicates collapse in the set.
train_title_redirect = {
    train_redirect_map.get(title, title) for title in train_title_map
}
sup_title_redirect = {
    sup_redicrect_map.get(title, title) for title in sup_title_map
}

In [18]:
# Number of distinct titles per dump after mapping titles through the redirect maps.
len(train_title_redirect), len(sup_title_redirect)

(6545405, 6655001)

In [19]:
# Titles present in the newer ("sup") dump but absent from the training dump.
sub_titles = sup_title_redirect - train_title_redirect

In [20]:
# Drop every title that starts with "List of"; print the size before and after.
print(len(sub_titles))
sub_titles = {
    title
    for title in sub_titles
    if not title.startswith("List of")
}
print(len(sub_titles))

180539
176504


In [24]:
sup_article_path = "/harddisk/data/nlp_data/kb/wikipedia/20230301/enwiki-20230301/blocks.ann"
# Full path of every entry found directly inside the article directory.
files = [
    os.path.join(sup_article_path, entry_name)
    for entry_name in os.listdir(sup_article_path)
]

In [25]:
def init():
    """Pool initializer, run once in each worker process.

    The body holds only a ``global`` declaration with no assignment, so calling
    it neither creates nor modifies anything.
    NOTE(review): presumably the workers see ``sup_redicrect_map`` (and
    ``sub_titles``) because the pool forks the notebook process — confirm the
    multiprocessing start method.
    """
    global sup_redicrect_map

def run(file):
    """Copy the lines of one annotation file whose title is in ``sub_titles``.

    Every line of ``file`` is parsed as JSON and its ``title`` is mapped through
    ``sup_redicrect_map`` when it appears there. Lines whose resulting title is
    in ``sub_titles`` are written unchanged to the output file, which lives in
    the subset directory and is named after the input's base name with
    ``.ann`` replaced by ``_filtered.ann``.
    """
    output_data_dir = "/harddisk/data/nlp_data/kb/wikipedia/subset"
    filtered_name = file.split("/")[-1].replace(".ann", "_filtered.ann")
    with open(file) as f, open(os.path.join(output_data_dir, filtered_name), "w") as fo:
        for line in f:
            title = json.loads(line)['title']
            # Fall back to the title itself when it is not a redirect.
            if sup_redicrect_map.get(title, title) in sub_titles:
                fo.write(line)


print("Preprocessing files...")
with Pool(64, initializer=init) as pool:
    # imap_unordered yields results lazily and has no length, so tqdm needs an
    # explicit total to show a percentage and an ETA instead of a bare counter.
    for output in tqdm(pool.imap_unordered(run, files), total=len(files)):
        pass

Preprocessing files...


2272it [07:35,  4.99it/s]


# Downstream processing

In [1]:
from pymongo import MongoClient
from tqdm import tqdm
import json
import os
from multiprocessing.pool import Pool

In [3]:
# Handles to the 'kg' and 'raw' collections of the 'wikidata-20230301' database.
mongodb_config = dict(host='9.109.142.31', port=27017)
dbname = 'wikidata-20230301'
new_client = MongoClient(**mongodb_config)
new_database = new_client[dbname]
new_kg_collection = new_database['kg']
new_raw_collection = new_database['raw']

In [4]:
# Handles to the 'kg' and 'raw' collections of the 'wikidata' database on a
# second server. Note that `mongodb_config` and `dbname` are rebound here.
mongodb_config = dict(host='10.12.192.31', port=27017)
dbname = "wikidata"
old_client = MongoClient(**mongodb_config)
old_database = old_client[dbname]
old_kg_collection = old_database['kg']
old_raw_collection = old_database['raw']

In [6]:
data_file = "/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug_eval/corpus"
# The corpus holds one JSON value per line; parse them in file order.
with open(data_file) as f:
    data = [json.loads(line) for line in f]

In [7]:
# Load the 2023-03-01 sitelinks mapping inside a context manager so the file
# handle is closed deterministically instead of being left to the garbage collector.
with open("/harddisk/data/nlp_data/kb/wikidata/20230301/mapping/sitelinks.enwiki.title.json") as f:
    new_mapping = json.load(f)
# Reverse lookup (value -> key). When several keys share a value, the key
# iterated last wins.
new_inverse_mapping = {value: key for key, value in new_mapping.items()}

In [8]:
# Load the 2021-05-20 mapping inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("/harddisk/data/nlp_data/kb/wikidata/20210520/mapping/qid2sitelinks.enwiki.title.json") as f:
    old_mapping = json.load(f)

In [9]:
# Keys that exist in the new mapping but not in the old one.
new_ents = new_mapping.keys() - old_mapping.keys()

In [10]:
# Keep only the candidates that have no document at all in the old raw collection.
new_ent_strict = []
for new_ent in tqdm(new_ents):
    # Pure existence check: project just `_id` so the (potentially large)
    # document body is not transferred for each of the candidates.
    if old_raw_collection.find_one({"id": new_ent}, projection={"_id": 1}) is None:
        new_ent_strict.append(new_ent)

100%|██████████| 698673/698673 [05:29<00:00, 2122.37it/s]


In [14]:
# Candidates missing from the old mapping vs. those also missing from the old raw collection.
len(new_ents), len(new_ent_strict)

(698673, 496627)

In [15]:
# Persist the strict list of new entity ids as a single JSON array.
new_entity_qid_path = "/harddisk/user/keminglu/evaluation_corpus/wiki_eval/new_entity_qid.json"
with open(new_entity_qid_path, "w") as f:
    json.dump(new_ent_strict, f)

In [16]:
def run(record):
    """Label every entity of one corpus record with an ``ood`` tag.

    ``record['targets']`` is a JSON string containing an ``entities`` list.
    Each entity receives an ``'ood'`` field:

    * ``'in'``    – default; the title is not in ``new_inverse_mapping`` or its
      id is not in ``new_ent_strict``.
    * ``'ood_m'`` – the id is in ``new_ent_strict`` and its document in
      ``new_raw_collection`` has a non-empty ``descriptions`` field.
    * ``'ood'``   – the id is in ``new_ent_strict`` but no non-empty
      ``descriptions`` could be found (including when the document itself is
      missing).

    Args:
        record: dict with a ``'targets'`` JSON string; it is updated in place.

    Returns:
        The same ``record`` object with ``'targets'`` re-serialised.
    """
    targets = json.loads(record['targets'])
    for ent in targets['entities']:
        ent_type = 'in'
        if ent['title'] in new_inverse_mapping:
            qid = new_inverse_mapping[ent['title']]
            if qid in new_ent_strict:
                info = new_raw_collection.find_one({"id": qid})
                # find_one returns None when nothing matches, and a document
                # may lack 'descriptions'; both cases count as "no description"
                # instead of crashing the worker.
                descriptions = info.get('descriptions') if info else None
                ent_type = 'ood_m' if descriptions else 'ood'
        ent['ood'] = ent_type
    record['targets'] = json.dumps(targets)
    return record

In [17]:
def init():
    """Pool initializer: prepare per-worker state used by ``run``.

    Runs once in every worker process. It rebinds ``new_ent_strict`` to a
    frozenset in that process, because ``run`` evaluates
    ``qid in new_ent_strict`` for each linked entity and a membership test on
    the original list is a linear scan over every stored id. Only the worker's
    binding changes when this runs as a pool initializer; calling it directly
    in the notebook process would rebind the notebook's variable too.

    NOTE(review): ``new_raw_collection`` belongs to a MongoClient created
    before the workers were started. PyMongo documents that client instances
    must not be shared across ``fork()`` — consider creating the client here.
    """
    global new_inverse_mapping
    global new_ent_strict
    global new_raw_collection
    # Idempotent: a frozenset passed to frozenset() yields an equal frozenset.
    new_ent_strict = frozenset(new_ent_strict)

print("Preprocessing files...")
# Own the progress bar with a context manager: it is closed (and its final
# state flushed) when the loop ends, so the last refresh cannot spill into the
# output of a later cell.
with tqdm(total=len(data)) as pbar:
    with Pool(64, initializer=init) as pool:
        with open(os.path.join("/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug_eval", "corpus_w_oom"), "w") as f:
            # Results arrive in completion order, not input order.
            for output in pool.imap_unordered(run, data):
                f.write(json.dumps(output) + "\n")
                pbar.update(1)

  0%|          | 0/319298 [00:00<?, ?it/s]

Preprocessing files...


100%|█████████▉| 319282/319298 [30:58<00:00, 160.49it/s]

In [18]:
# Reload the labelled corpus: one JSON value per line, in file order.
corpus_w_oom_path = "/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug_eval/corpus_w_oom"
with open(corpus_w_oom_path) as f:
    data = [json.loads(line) for line in f]

100%|██████████| 319298/319298 [31:15<00:00, 160.49it/s]

In [20]:
def _ood_entities(record):
    """Return the entities of ``record`` whose ``"ood"`` label is not ``"in"``."""
    entities = json.loads(record["targets"])["entities"]
    return [ent for ent in entities if ent["ood"] != "in"]


# Keep the records that contain at least one such entity, in their original order.
oom_data = [record for record in data if _ood_entities(record)]

In [21]:
# The first tenth (rounded down) becomes the dev split, the rest the test split;
# the order of `oom_data` is preserved.
dev_size = len(oom_data) // 10
dev_oom_data, test_oom_data = oom_data[:dev_size], oom_data[dev_size:]
print(len(dev_oom_data), len(test_oom_data))

2710 24393


In [None]:
# Write each split as JSON lines; the test split is written first, then dev.
for split_path, split_records in (
    ("/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug_eval/corpus_filtered_test", test_oom_data),
    ("/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug_eval/corpus_filtered_dev", dev_oom_data),
):
    with open(split_path, "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in split_records)

# Mark OOO types and relations

In [2]:
import json

In [5]:
# Load the type and relation inventories saved with the training data. Context
# managers close each file handle as soon as it has been parsed.
with open("/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug/train_types.json") as f:
    train_types = json.load(f)
with open("/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug/train_relations.json") as f:
    train_relations = json.load(f)

In [26]:
# Load the rephrased dev split: one JSON value per line, in file order.
dev_prompt_rephrased_path = "/harddisk/user/keminglu/pretrained_data_processed/wikipedia_with_mention_wo_title_simplified_aug_eval/corpus_filtered_dev_prompt_rephrased"
with open(dev_prompt_rephrased_path) as f:
    data = [json.loads(line) for line in f]

In [23]:
# Keep only the samples whose 'aug_type' is 'aug_default'.
# NOTE(review): this cell's execution count (23) is lower than that of the cell
# which loads `data` (26), so the saved outputs below may have been computed on
# unfiltered data — confirm by re-running the notebook top to bottom.
data = [sample for sample in data if sample['aug_type'] == 'aug_default']

In [27]:
# Flatten the 'type' lists of every entity in every sample into one list;
# entities without a 'type' key contribute nothing. A nested comprehension
# avoids sum(lists, []), which re-copies the accumulated list at every step.
all_types = [
    entity_type
    for sample in data
    for each in json.loads(sample['targets'])['entities']
    for entity_type in each.get('type', [])
]

In [28]:
# Number of distinct types in `all_types` that do not occur in `train_types`.
len(set(all_types) - set(train_types))

109