In [1]:
import pickle
import torch
from transformers import AutoModel, AutoConfig, AutoTokenizer

In [2]:
# Load the pretrained BERT encoder, its configuration and its tokenizer from
# one checkpoint name so the three objects cannot drift apart.
bert_checkpoint = 'bert-base-cased'
model = AutoModel.from_pretrained(bert_checkpoint)
config = AutoConfig.from_pretrained(bert_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

In [3]:
# Encode one word, pad it to the tokenizer's maximum length and add a batch
# axis so the tensor has shape (1, max_length).
encoded = torch.as_tensor(tokenizer.encode("Hello", padding='max_length')).unsqueeze(0)

# Almost every position is padding here. Without an explicit mask the model
# attends to all of them, which contaminates the hidden states of the real
# tokens and the pooled output.
attention_mask = (encoded != tokenizer.pad_token_id).long()

# Pure inference / shape inspection: no autograd graph is needed.
with torch.no_grad():
    outputs = model(input_ids=encoded, attention_mask=attention_mask)

# Index positionally instead of tuple-unpacking the return value: this works
# both when the model returns a plain tuple and when it returns a dict-like
# ModelOutput (unpacking the latter would yield its string keys, not tensors).
unpooled_output, pooled_output = outputs[0], outputs[1]
print(encoded.shape, unpooled_output.shape, pooled_output.shape)

torch.Size([1, 512]) torch.Size([1, 512, 768]) torch.Size([1, 768])


In [9]:
# Show the id the tokenizer assigns to its padding token (the value used to
# fill sequences when padding is requested).
tokenizer.pad_token_id

0

In [47]:
# Check which ids the tokenizer produces for curly braces. encode() also wraps
# the result in special tokens (presumably [CLS]/[SEP] - compare with a
# decode() round trip to confirm).
tokenizer.encode("{ }")

[101, 196, 198, 102]

In [51]:
# Token vocabulary for the CFQ data, unpickled from disk.
with open('../data/cfq/tok-vocab.pickle', 'rb') as tok_file:
    tok_vocab = pickle.load(tok_file)

# Relation vocabulary, stored alongside the token vocabulary.
with open('../data/cfq/rel-vocab.pickle', 'rb') as rel_file:
    rel_vocab = pickle.load(rel_file)

In [66]:
# Round-trip a run of ASCII punctuation through encode()/decode() to see how
# the tokenizer re-spaces the symbols.
test_str = "! \" # $ % & ' ( ) * + , - . / ~"

# Reference line: the five-space prefix is as wide as "[CLS]", so the original
# characters presumably line up under the decoded ones for comparison.
print("     ", test_str, sep="")

punctuation_ids = tokenizer.encode(test_str)
print(tokenizer.decode(punctuation_ids))

     ! " # $ % & ' ( ) * + , - . / ~
[CLS]! " # $ % &'( ) * +, -. / ~ [SEP]


In [20]:
import numpy as np

# Load the token and relation vocabularies. Context managers close the file
# handles as soon as unpickling finishes instead of leaving them to the
# garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../data/cfq/tok-vocab.pickle", "rb") as vocab_file:
    tok_vocab = pickle.load(vocab_file)
with open("../data/cfq/rel-vocab.pickle", "rb") as vocab_file:
    rel_vocab = pickle.load(vocab_file)

# np.load on an .npz archive returns a lazy, file-backed mapping of arrays.
# Both archives are intentionally left open because later cells read from them.
x = np.load('../data/cfq/splits/random_split.npz')
data = np.load('../data/cfq/data.npz')

In [28]:
# Summarise every array stored in the archive: its name, a (truncated) view of
# its contents, and its length.
for key in data:
    # `data` comes from np.load on an .npz file, which reads an array from the
    # archive on each lookup; fetch it once and reuse it for both columns.
    values = data[key]
    print(key, values, len(values))

n_tok [13 21 13 ... 20 20 21] 239357
seq [62  5 10 ...  8 71 26] 4620808
isconcept [False False False ...  True False False] 4142094
isvariable [False False False ... False  True  True] 4142094
n [4 3 4 ... 5 5 4] 239357
tok [71 90  5 ... 26  5 87] 1246586
n_idx [1 1 1 ... 1 1 1] 1246586
idx [12  8  1 ... 20 15 17] 1246670
src [0 0 0 ... 0 0 1] 1436893
dst [2 1 1 ... 1 1 2] 1436893
m [ 4  7  4 ... 13 13  7] 239357
rel [ 5  9 26 ... 26 27 19] 1436893


In [49]:
# One example sequence of token ids; each id is an index into tok_vocab[0]
# (the lookup happens in the decoding cell that follows).
arr = [
    62, 5, 38, 117, 16, 69, 117, 125, 117, 89,
    117, 92, 69, 37, 126, 8, 12, 8, 71,
]

In [50]:
# Map the example ids back to CFQ vocabulary strings, append one extra entity
# token, then round-trip the joined text through the BERT tokenizer to see how
# it splits the CFQ-specific tokens.
example_tokens = [tok_vocab[0][token_id] for token_id in arr]
example_tokens.append('ns:m.02zsn')
example_text = " ".join(example_tokens)
tokenizer.decode(tokenizer.encode(example_text))

'[CLS] Did M1 write, executive produce, direct, edit, and produce a film [SEP] ns : film. film [SEP]? x0 ns : m. 02zsn [SEP]'

In [46]:
# Display the first element of the unpickled token vocabulary: the list of
# token strings, whose positions are the ids used in the encoded sequences.
tok_vocab[0]

['Chinese',
 'parent',
 'nationality',
 'producer',
 'gender',
 'M1',
 'played',
 'male',
 '[SEP]',
 'M3',
 "'s",
 'ns:m.059j2',
 'ns:film.film',
 'ns:m.0d060g',
 'ns:film.editor',
 'production',
 'executive',
 'ns:m.0f8l9c',
 '?x4',
 'character',
 'ns:film.film_costumer_designer',
 'was',
 'Italian',
 'found',
 'What',
 'writer',
 '?x1',
 '?x3',
 'ns:m.06mkj',
 'editor',
 'play',
 'M9',
 'Who',
 'sibling',
 'ns:m.07ssc',
 'ns:m.0d05w3',
 'employed',
 'a',
 'write',
 'wrote',
 'country',
 'Was',
 'art',
 'German',
 'by',
 'M8',
 'female',
 'M5',
 'ns:film.film_distributor',
 'ns:m.0d0vqn',
 'M7',
 'distributed',
 'ns:m.03_3d',
 'French',
 'ns:film.actor',
 'employer',
 'did',
 'ns:film.film_art_director',
 'ns:m.0345h',
 'of',
 'acquire',
 'prequel',
 'Did',
 'director',
 'Dutch',
 'ns:film.director',
 'sequel',
 'costume',
 'child',
 'produce',
 'ns:m.02zsn',
 '?x0',
 '?x2',
 'ns:m.03rjj',
 'M2',
 'Swedish',
 'designer',
 'American',
 '?x5',
 'founder',
 'founded',
 'Which',
 'star',
