In [48]:
import pickle
import torch
from transformers import AutoModel, AutoConfig, AutoTokenizer

In [30]:
# Single source of truth for the checkpoint name, so the model, its config and
# its tokenizer cannot drift apart when a different checkpoint is tried.
MODEL_NAME = 'bert-base-cased'

# All three objects are resolved from the same checkpoint identifier.
model = AutoModel.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [41]:
# Tokenize through the tokenizer's __call__ API so the attention mask is built
# alongside the ids: "Hello" is padded out to the maximum length, and without
# the mask the encoder would attend to every padding position as real input.
batch = tokenizer("Hello", padding='max_length', return_tensors='pt')
encoded = batch['input_ids']  # already carries the leading batch dimension

# Inference only, so there is no need to record an autograd graph.
with torch.no_grad():
    outputs = model(input_ids=encoded, attention_mask=batch['attention_mask'])

# Index positionally instead of tuple-unpacking the call result: this works
# whether the model returns a plain tuple or a dict-like ModelOutput (where
# unpacking would yield the field names rather than the tensors).
unpooled_output, pooled_output = outputs[0], outputs[1]
print(encoded.shape, unpooled_output.shape)

torch.Size([1, 512]) torch.Size([1, 512, 768])


In [47]:
# Probe how the tokenizer encodes curly braces; the bare last expression is
# what the notebook displays as the cell result.
brace_probe = "{ }"
tokenizer.encode(brace_probe)

[101, 196, 198, 102]

In [51]:
def _read_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while loading, so
    # this must only be pointed at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Token vocabulary first, then relation vocabulary (same order as before).
tok_vocab = _read_pickle('../data/cfq/tok-vocab.pickle')
rel_vocab = _read_pickle('../data/cfq/rel-vocab.pickle')

In [55]:
# Round-trip every entry of tok_vocab[0] through the tokenizer and print the
# original next to the decoded text, to see how tokenization alters each symbol.
# The loop-local names are deliberately distinct from `encoded`, which the
# forward-pass cell binds to the input-id tensor; reusing that name here
# silently replaced the tensor with a plain list of ids.
for token in tok_vocab[0]:
    token_ids = tokenizer.encode(token)
    print(token, tokenizer.decode(token_ids))

Chinese [CLS] Chinese [SEP]
parent [CLS] parent [SEP]
nationality [CLS] nationality [SEP]
producer [CLS] producer [SEP]
gender [CLS] gender [SEP]
M1 [CLS] M1 [SEP]
played [CLS] played [SEP]
male [CLS] male [SEP]
[SEP] [CLS] [SEP] [SEP]
M3 [CLS] M3 [SEP]
's [CLS]'s [SEP]
ns:m.059j2 [CLS] ns : m. 059j2 [SEP]
ns:film.film [CLS] ns : film. film [SEP]
ns:m.0d060g [CLS] ns : m. 0d060g [SEP]
ns:film.editor [CLS] ns : film. editor [SEP]
production [CLS] production [SEP]
executive [CLS] executive [SEP]
ns:m.0f8l9c [CLS] ns : m. 0f8l9c [SEP]
?x4 [CLS]? x4 [SEP]
character [CLS] character [SEP]
ns:film.film_costumer_designer [CLS] ns : film. film _ costumer _ designer [SEP]
was [CLS] was [SEP]
Italian [CLS] Italian [SEP]
found [CLS] found [SEP]
What [CLS] What [SEP]
writer [CLS] writer [SEP]
?x1 [CLS]? x1 [SEP]
?x3 [CLS]? x3 [SEP]
ns:m.06mkj [CLS] ns : m. 06mkj [SEP]
editor [CLS] editor [SEP]
play [CLS] play [SEP]
M9 [CLS] M9 [SEP]
Who [CLS] Who [SEP]
sibling [CLS] sibling [SEP]
ns:m.07ssc [CLS] n

In [56]:
# Round-trip every entry of rel_vocab[0] through the tokenizer and print the
# original next to the decoded text, to see how tokenization alters each
# relation string.
# The loop-local names are deliberately distinct from `encoded`, which the
# forward-pass cell binds to the input-id tensor; reusing that name here
# silently replaced the tensor with a plain list of ids.
for relation in rel_vocab[0]:
    relation_ids = tokenizer.encode(relation)
    print(relation, tokenizer.decode(relation_ids))

!= [CLS]! = [SEP]
^ns:people.person.gender [CLS] ^ ns : people. person. gender [SEP]
^ns:people.person.nationality [CLS] ^ ns : people. person. nationality [SEP]
a [CLS] a [SEP]
ns:business.employer.employees/ns:business.employment_tenure.person [CLS] ns : business. employer. employees / ns : business. employment _ tenure. person [SEP]
ns:film.actor.film/ns:film.performance.character [CLS] ns : film. actor. film / ns : film. performance. character [SEP]
ns:film.actor.film/ns:film.performance.film [CLS] ns : film. actor. film / ns : film. performance. film [SEP]
ns:film.cinematographer.film [CLS] ns : film. cinematographer. film [SEP]
ns:film.director.film [CLS] ns : film. director. film [SEP]
ns:film.editor.film [CLS] ns : film. editor. film [SEP]
ns:film.film.cinematography [CLS] ns : film. film. cinematography [SEP]
ns:film.film.costume_design_by [CLS] ns : film. film. costume _ design _ by [SEP]
ns:film.film.directed_by [CLS] ns : film. film. directed _ by [SEP]
ns:film.film.distrib

In [66]:
# Space-separated ASCII punctuation, used to see how decode() re-spaces symbols.
test_str = "! \" # $ % & ' ( ) * + , - . / ~"

# Indent the raw string by the width of the "[CLS]" prefix so the two printed
# lines can be compared column by column.
cls_pad = " " * len("[CLS]")
print(cls_pad + test_str)

punct_ids = tokenizer.encode(test_str)
print(tokenizer.decode(punct_ids))

     ! " # $ % & ' ( ) * + , - . / ~
[CLS]! " # $ % &'( ) * +, -. / ~ [SEP]


 








	























 
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~

































 
¡
¢
£
¤
¥
¦
§
¨
©
ª
«
¬
­
®
¯
°
±
²
³
´
µ
¶
·
¸
¹
º
»
¼
½
¾
¿
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
×
Ø
Ù
Ú
Û
Ü
Ý
Þ
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
÷
ø
ù
ú
û
ü
ý
þ
ÿ
