-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_relations.py
88 lines (77 loc) · 2.54 KB
/
get_relations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import argparse
import json
import random
import uuid
from tqdm import tqdm
from pathlib import Path
parser = argparse.ArgumentParser()
parser.add_argument('--title-path')
parser.add_argument('--triplet-path')
parser.add_argument('--vocab-path')
parser.add_argument('--out-dir', default='IndoLAMA')
random.seed(42)
LAMA_RELATIONS = [
'P19', 'P20', 'P279', 'P37', 'P413',
'P166', 'P449', 'P69', 'P47', 'P138',
'P364', 'P54', 'P463', 'P101', 'P1923',
'P106', 'P527', 'P102', 'P530', 'P176',
'P27', 'P407', 'P30', 'P178', 'P1376',
'P131', 'P1412', 'P108', 'P136', 'P17',
'P39', 'P264', 'P276', 'P937', 'P140',
'P1303', 'P127', 'P103', 'P190', 'P1001',
'P31', 'P495', 'P159', 'P36', 'P740', 'P361'
]
if __name__ == '__main__':
args = parser.parse_args()
print('| Fetching common vocabulary')
vocab = set()
with open(args.vocab_path, 'r') as f:
for line in f.readlines():
vocab.add(line.strip('\n'))
titles = {}
single_tokens = set()
with open(args.title_path, 'r') as f:
for row in f:
terms = row.split()
k = terms[0]
v = terms[1:]
titles[k] = ' '.join(v)
if titles[k].lower() in vocab:
single_tokens.add(k)
print('| Number of single token entities: {}'.format(len(single_tokens)))
valid_relations = {}
with open(args.triplet_path, 'r') as f:
for row in f:
head, relation, tail = row.split()
if tail in single_tokens:
if relation not in valid_relations:
valid_relations[relation] = []
valid_relations[relation].append((head, tail))
print('| Number of single token objects per LAMA relations')
for i in LAMA_RELATIONS:
print(' -) {} ({})'.format(i, len(valid_relations.get(i, []))))
for relation, entities in valid_relations.items():
if relation not in LAMA_RELATIONS:
continue
print('| Collecting valid relations for {}'.format(relation))
indolama = []
for ent in tqdm(entities):
head = ent[0]
tail = ent[1]
relation_dict = {
'uuid': str(uuid.uuid4()),
'sub_uri': head,
'obj_uri': tail,
'sub_label': titles[head],
'obj_label': titles[tail],
'predicate_id': relation
}
indolama.append(relation_dict)
indolama_sample = random.sample(indolama, k=min(1000, len(indolama)))
filename = '{}.jsonl'.format(relation)
filename = os.path.join(args.out_dir, filename)
Path(args.out_dir).mkdir(parents=True, exist_ok=True)
with open(filename, 'w+') as f:
for rel in indolama_sample:
f.write(json.dumps(rel) + '\n')