In [1]:
import sys
sys.path.append("../")
from inference_server.utils import Trie

import pickle
import json
from transformers import AutoTokenizer
from tqdm import tqdm

In [2]:
# BART tokenizer: used further down to turn the token ids stored in the
# existing trie back into text.
bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

In [2]:
# BLOOM tokenizer: every trie written by this notebook is expressed in its token ids.
bloom_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")

In [4]:
# Probe: how does BLOOM split a JSON fragment around quotes, colons and commas?
bloom_tokenizer.tokenize(', "description": "none",')

[',', 'Ġ"description', '":', 'Ġ"n', 'one"', ',']

In [4]:
# Location of the pickled trie dictionary that is loaded and converted below.
trie_path = "/harddisk/user/keminglu/evaluation_corpus/resources/kilt_titles_trie_dict.pkl"

In [5]:
# Load the pickled trie dictionary. The `with` block guarantees the file handle
# is closed; a bare open() inside pickle.load() leaves it to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open(trie_path, "rb") as f:
    trie_dict = pickle.load(f)

In [6]:
# Rebuild a Trie object from its plain-dictionary form.
trie = Trie.load_from_dict(trie_dict)

In [7]:
# Token id prepended to every converted entry; the spot check after this cell
# shows it decodes to the BLOOM token 'Ġ"title":'.
TITLE_KEY_TOKEN_ID = 243001

terms = []
# Using the bar as a context manager closes it when the loop ends (or fails),
# so the final progress state is flushed instead of being left stale.
with tqdm(total=len(trie)) as pbar:
    for each in trie:
        # Drop the first and last token of each stored sequence
        # (presumably BART's special start/end tokens — TODO confirm).
        tokens = bart_tokenizer.convert_ids_to_tokens(each)[1:-1]
        term = bart_tokenizer.convert_tokens_to_string(tokens)
        # Re-tokenize the recovered text with BLOOM, wrapped as ` "<text>",`.
        bloom_term = [TITLE_KEY_TOKEN_ID] + bloom_tokenizer.convert_tokens_to_ids(
            bloom_tokenizer.tokenize(' "' + term + '",')
        )
        terms.append(bloom_term)
        pbar.update(1)

100%|█████████▉| 5903280/5903530 [08:40<00:00, 12139.50it/s]

In [9]:
# Spot-check one converted entry by mapping its ids back to BLOOM tokens.
bloom_tokenizer.convert_ids_to_tokens(terms[1024])

['Ġ"title":', 'Ġ"A', 'ch', 'ou', 'ffe', '"', ',']

100%|██████████| 5903530/5903530 [08:56<00:00, 12139.50it/s]

In [10]:
# Build a trie over the BLOOM token-id sequences collected in `terms`.
bloom_trie = Trie(terms)

building trie:: 100%|██████████| 5903530/5903530 [01:20<00:00, 73778.91it/s] 


In [17]:
# Query the trie with a full id sequence.
# NOTE(review): what the result means depends on Trie.get, which is defined in
# inference_server.utils and not shown here — presumably the ids allowed to
# follow the given prefix; confirm.
bloom_trie.get([243001, 10101, 236440, 12136, 1512, 36291, 15])

[]

In [18]:
# Persist only the plain-dictionary form of the trie; it is read back with
# Trie.load_from_dict.
with open("/harddisk/user/keminglu/evaluation_corpus/resources/kilt_titles_trie_dict_bloom.pkl", "wb") as out_file:
    pickle.dump(bloom_trie.trie_dict, out_file)

In [19]:
# Read the BLOOM trie dictionary back from disk. The `with` block closes the
# file handle deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open("/harddisk/user/keminglu/evaluation_corpus/resources/kilt_titles_trie_dict_bloom.pkl", "rb") as f:
    trie_dict = pickle.load(f)

In [20]:
# Rebuild a Trie from the dictionary that was just read back from disk.
# Note: this rebinds `trie`, which earlier held the trie loaded from `trie_path`.
trie = Trie.load_from_dict(trie_dict)

In [61]:
# For each basic type, encode four snippets — the type as the first, second,
# third or only item of the "type" list — and keep a slice of each encoding.
# NOTE(review): the slice offsets are hard-coded; they presumably trim the tokens
# produced by the "placeholder" items and the closing brace — confirm.
output = []
for type_name in ["organization", "person", "location", "miscellaneous"]:
    as_first = bloom_tokenizer.encode(f' "type": ["{type_name}", "placeholder", "placeholder"]}}', add_special_tokens=False)
    as_second = bloom_tokenizer.encode(f' "type": ["placeholder", "{type_name}", "placeholder"]}}', add_special_tokens=False)
    as_third = bloom_tokenizer.encode(f' "type": ["placeholder", "placeholder", "{type_name}"]}}', add_special_tokens=False)
    as_only = bloom_tokenizer.encode(f' "type": ["{type_name}"]}}', add_special_tokens=False)
    output.extend([as_first[0:-8], as_second[4:-4], as_third[8:-1], as_only[0:-1]])

type_trie = Trie(output)

building trie:: 100%|██████████| 16/16 [00:00<00:00, 34344.35it/s]


In [63]:
# Probe: how does BLOOM tokenize a space followed by an opening bracket?
bloom_tokenizer.tokenize(" [")

['Ġ[']

: 

In [46]:
# Dump every token-id sequence stored in the type trie, one per line.
for sequence in type_trie:
    print(sequence)

[116220, 34955, 169753, 5, 15]
[116220, 34955, 169753, 9568]
[116220, 34955, 39667, 5, 15]
[116220, 34955, 39667, 9568]
[116220, 34955, 17394, 5, 15]
[116220, 34955, 17394, 9568]
[116220, 34955, 201154, 905, 67309, 5, 15]
[116220, 34955, 201154, 905, 67309, 9568]
[15, 567, 169753, 5, 15]
[15, 567, 169753, 9568]
[15, 567, 17394, 5, 15]
[15, 567, 17394, 9568]
[15, 19970, 6326, 5, 15]
[15, 19970, 6326, 9568]
[15, 13209, 12037, 905, 67309, 5, 15]
[15, 13209, 12037, 905, 67309, 9568]


In [47]:
# Is token id 15 a top-level key of the trie dictionary?
# (Several of the sequences printed above start with 15.)
15 in type_trie.trie_dict

True

In [44]:
# Write the plain-dictionary form of the basic-type trie to disk.
with open("/harddisk/user/keminglu/evaluation_corpus/resources/basic_types_trie_dict_bloom.pkl", "wb") as out_file:
    pickle.dump(type_trie.trie_dict, out_file)

In [49]:
# Parse the UFET test file, one JSON object per line. The `with` block closes the
# file handle; iterating the handle directly avoids materializing readlines() first.
with open("/harddisk/user/keminglu/evaluation_corpus/processed_benchmarks/entity_typing/ufet_test.json") as f:
    ufet_data = [json.loads(line) for line in f]

In [50]:
# Distinct labels found under each sample's 'true' key. The set comprehension
# flattens in a single pass; sum(list_of_lists, []) re-copies the accumulated
# list for every sample, which is quadratic in the total number of labels.
ufet_types = list({label for sample in ufet_data for label in sample['true']})

In [52]:
# Number of distinct type labels collected.
len(ufet_types)

1682

In [53]:
# Same construction as for the basic types, applied to every UFET label:
# encode the label as the first, second, third or only list item and keep a
# slice of each encoding.
# NOTE(review): the slice offsets are hard-coded and presumably assume a fixed
# token count for the "placeholder" items — confirm.
output = []
for type_name in ufet_types:
    leading = bloom_tokenizer.encode(f' "type": ["{type_name}", "placeholder", "placeholder"]}}', add_special_tokens=False)
    output.append(leading[0:-8])
    middle = bloom_tokenizer.encode(f' "type": ["placeholder", "{type_name}", "placeholder"]}}', add_special_tokens=False)
    output.append(middle[4:-4])
    trailing = bloom_tokenizer.encode(f' "type": ["placeholder", "placeholder", "{type_name}"]}}', add_special_tokens=False)
    output.append(trailing[8:-1])
    single = bloom_tokenizer.encode(f' "type": ["{type_name}"]}}', add_special_tokens=False)
    output.append(single[0:-1])

type_trie = Trie(output)

building trie:: 100%|██████████| 6728/6728 [00:00<00:00, 398380.42it/s]


In [57]:
# Distinct final token ids across all sequences stored in the trie.
{sequence[-1] for sequence in type_trie}

{15, 9568, 217017, 230106}

In [55]:
# Write the plain-dictionary form of the UFET type trie to disk.
with open("/harddisk/user/keminglu/evaluation_corpus/resources/ufet_types_trie_dict_bloom.pkl", "wb") as out_file:
    pickle.dump(type_trie.trie_dict, out_file)

In [3]:
def generate_type_trie(types):
    """Build a Trie of BLOOM token-id sequences for the given type names.

    For each name, four JSON-like snippets are encoded with the module-level
    ``bloom_tokenizer`` (without special tokens): the name as the first,
    second, or third item of a three-item "type" list, and as the sole item.
    A fixed slice of each encoding is kept and all slices are inserted into a
    ``Trie``.

    NOTE(review): the slice offsets are hard-coded; they presumably trim the
    tokens produced by the "placeholder" items and the closing brace, which
    assumes a fixed tokenization of those parts — confirm.

    Args:
        types: iterable of type names, interpolated into the snippets.

    Returns:
        A ``Trie`` built from the collected token-id sequences.
    """
    def encode(text):
        return bloom_tokenizer.encode(text, add_special_tokens=False)

    sequences = []
    for type_name in types:
        # (snippet, slice of its encoding to keep), in insertion order.
        variants = (
            (f' "type": ["{type_name}", "placeholder", "placeholder"]}}', slice(0, -8)),
            (f' "type": ["placeholder", "{type_name}", "placeholder"]}}', slice(4, -4)),
            (f' "type": ["placeholder", "placeholder", "{type_name}"]}}', slice(8, -1)),
            (f' "type": ["{type_name}"]}}', slice(0, -1)),
        )
        for snippet, keep in variants:
            sequences.append(encode(snippet)[keep])
    return Trie(sequences)

In [35]:
# Parse the gold file, one JSON object per line; keep the file open only while reading.
with open("/harddisk/user/keminglu/evaluation_corpus/processed_benchmarks/ner/crossner_science_gold.jsonl") as f:
    data = [json.loads(line) for line in f]

# Distinct values found at index 1 of every item under each sample's 'true' key
# (presumably the entity type of a (mention, type) pair — TODO confirm).
# The set comprehension flattens in one pass; sum(list_of_lists, []) is quadratic,
# and `types` is no longer bound to an intermediate list of lists first.
types = list({each[1] for sample in data for each in sample['true']})

In [36]:
# Show the distinct raw type labels found in the gold file.
types

['organisation',
 'award',
 'event',
 'university',
 'academicjournal',
 'protein',
 'enzyme',
 'location',
 'chemicalcompound',
 'country',
 'chemicalelement',
 'discipline',
 'theory',
 'person',
 'scientist',
 'misc',
 'astronomicalobject']

In [39]:
# Raw label -> human-readable label. Labels without an entry are kept unchanged.
mapping = {
    "organisation": "organization",
    "programlang": "programming language",
    "misc": "miscellaneous",
    "literarygenre": "literary genre",
    "musicalinstrument": "musical instrument",
    "musicalartist": "musical artist",
    "musicgenre": "music genre",
    "politicalparty": "political party",
    "academicjournal": "academic journal",
    "chemicalcompound": "chemical compound",
    "chemicalelement": "chemical element",
    "astronomicalobject": "astronomical object",
}
# dict.get with the label itself as the fallback leaves unmapped labels as they are.
processed_types = [mapping.get(raw_label, raw_label) for raw_label in types]

In [40]:
# Show the labels after applying `mapping`.
processed_types

['organization',
 'award',
 'event',
 'university',
 'academic journal',
 'protein',
 'enzyme',
 'location',
 'chemical compound',
 'country',
 'chemical element',
 'discipline',
 'theory',
 'person',
 'scientist',
 'miscellaneous',
 'astronomical object']

In [41]:
# Build the type trie for the mapped labels.
type_trie = generate_type_trie(processed_types)

building trie:: 100%|██████████| 68/68 [00:00<00:00, 111020.89it/s]


In [42]:
# Write the plain-dictionary form of the CrossNER science type trie to disk.
with open("/harddisk/user/keminglu/evaluation_corpus/resources/crossner_science_types_trie_dict_bloom.pkl", "wb") as out_file:
    pickle.dump(type_trie.trie_dict, out_file)