In [1]:
from transformers import MarianMTModel, MarianTokenizer
import torch
# Load the English->German Marian checkpoint and its tokenizer; downloaded
# files are cached under the local `.embeddings` directory.
MARIAN_CHECKPOINT = 'Helsinki-NLP/opus-mt-en-de'
HF_CACHE_DIR = '.embeddings'
model = MarianMTModel.from_pretrained(MARIAN_CHECKPOINT, cache_dir=HF_CACHE_DIR)
tokenizer = MarianTokenizer.from_pretrained(MARIAN_CHECKPOINT, cache_dir=HF_CACHE_DIR)




In [3]:
# One-sentence parallel example: English source and its German reference.
src_text = ['I am going to tokyo by bus']
tgt_text = ['Ich fahre mit dem Bus nach Tokio']
# Tokenize source and target together. As the displayed result shows, the batch
# carries `input_ids`, `attention_mask`, `decoder_input_ids` and
# `decoder_attention_mask` tensors.
tokenized = tokenizer.prepare_translation_batch(src_text, tgt_text)
# Bare expression so the notebook displays the batch.
tokenized

{'input_ids': tensor([[  38,  121,  751,   12,   12, 6076,  166,   54, 4843,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'decoder_input_ids': tensor([[  105, 29049,    30,    57,  3726,    96, 31597,     0]]), 'decoder_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:
def shift_tokens_right(input_ids, pad_token_id):
    """Rotate each row one position to the right around its last non-pad token.

    The last token that is not ``pad_token_id`` (usually <eos>) is copied to
    column 0, and every other column receives the token that was previously
    to its left. The input tensor is left untouched.

    Args:
        input_ids: 2-D tensor of token ids, one sequence per row.
        pad_token_id: id used for padding; it is excluded when locating the
            last real token of each row.

    Returns:
        A new tensor with the same shape as ``input_ids``.
    """
    # Count the non-pad tokens per row; the last real token sits at count - 1.
    non_pad_per_row = input_ids.ne(pad_token_id).sum(dim=1)
    last_token_idx = (non_pad_per_row - 1).unsqueeze(-1)
    wrapped_token = input_ids.gather(1, last_token_idx).squeeze()

    shifted = input_ids.clone()
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = wrapped_token
    return shifted


In [5]:
def decode_forward_pass(model, tokenizer, input_ids, decoder_input_ids=None):
    """Run one forward pass and decode the top-scoring token at every position.

    Args:
        model: seq2seq model whose first forward output is the logits tensor.
        tokenizer: tokenizer used to turn token ids back into text.
        input_ids: encoder input ids.
        decoder_input_ids: optional decoder inputs; ``None`` is forwarded
            as-is, leaving the choice of decoder inputs to the model.

    Returns:
        A list with one decoded string per row of the logits.
    """
    with torch.no_grad():  # inference only: do not record an autograd graph
        logits = model(input_ids, decoder_input_ids=decoder_input_ids)[0]
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in logits.argmax(dim=-1)]


# 1) Free-running generation from the source sentence.
translated = model.generate(tokenized['input_ids'], temperature=0.8)
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
print(tgt_text)


# 2) Single forward pass with no decoder inputs supplied.
tgt_text2 = decode_forward_pass(model, tokenizer, tokenized['input_ids'])
print(tgt_text2)


# 3) Forward pass fed the target ids exactly as the tokenizer produced them.
tgt_text3 = decode_forward_pass(
    model, tokenizer, tokenized['input_ids'],
    decoder_input_ids=tokenized['decoder_input_ids'],
)
print(tgt_text3)


# 4) Same as 3) but with the final target token dropped.
tgt_text4 = decode_forward_pass(
    model, tokenizer, tokenized['input_ids'],
    decoder_input_ids=tokenized['decoder_input_ids'][:, :-1].contiguous(),
)
print(tgt_text4)


# 5) Target ids shifted one position right (last non-pad token wrapped to the front).
decoder_input_ids = shift_tokens_right(tokenized['decoder_input_ids'], tokenizer.pad_token_id)
tgt_text5 = decode_forward_pass(
    model, tokenizer, tokenized['input_ids'],
    decoder_input_ids=decoder_input_ids,
)
print(tgt_text5)


['Ich fahre mit dem Bus nach Tokyo']
['Ich werde go tokykyo mit ']
['Bus To']
['Bus To']
['Ich fahre mit dem Bus nach Tokio']


In [6]:
# Show the tokenizer's pad id; this is the value handed to shift_tokens_right
# as `pad_token_id`.
tokenizer.pad_token_id

58100

In [7]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
# Replace the active model/tokenizer pair with BART-large; downloaded files
# are cached under the local `.embeddings` directory.
BART_CHECKPOINT = 'facebook/bart-large'
HF_CACHE_DIR = '.embeddings'
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT, cache_dir=HF_CACHE_DIR)
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT, cache_dir=HF_CACHE_DIR)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1525.0, style=ProgressStyle(description…




In [18]:
# Masked-infilling example: source and target are the same sentence containing
# a single <mask> token.
src_text = ['who <mask> a 8 star rating with over 8 reviews in " fonte " ?']
tgt_text = ['who <mask> a 8 star rating with over 8 reviews in " fonte " ?']

# Encode the source batch once and read both fields from that single result
# instead of tokenizing the same text twice.
encoded_src = tokenizer.batch_encode_plus(src_text)
encoded_input_ids = encoded_src['input_ids']
encoded_input_attention_masks = encoded_src['attention_mask']
# Only the token ids of the target are needed.
encoded_output_ids = tokenizer.batch_encode_plus(tgt_text)['input_ids']

In [21]:
# Pack the encoded id lists into tensors under the keys the decode cell reads.
tokenized = {
    'input_ids': torch.tensor(encoded_input_ids),
    'attention_mask': torch.tensor(encoded_input_attention_masks),
    'decoder_input_ids': torch.tensor(encoded_output_ids),
}
# Print encoder and decoder ids for a visual comparison.
for shown_key in ('input_ids', 'decoder_input_ids'):
    print(tokenized[shown_key])

tensor([[    0,    54, 50264,    10,   290,   999,   691,    19,    81,   290,
          6173,    11,    22,   856, 13757,    22, 17487,     2]])
tensor([[    0,    54, 50264,    10,   290,   999,   691,    19,    81,   290,
          6173,    11,    22,   856, 13757,    22, 17487,     2]])


In [20]:
def decode_forward_pass(model, tokenizer, input_ids, decoder_input_ids=None):
    """Run one forward pass and decode the top-scoring token at every position.

    Args:
        model: seq2seq model whose first forward output is the logits tensor.
        tokenizer: tokenizer used to turn token ids back into text.
        input_ids: encoder input ids.
        decoder_input_ids: optional decoder inputs; ``None`` is forwarded
            as-is, leaving the choice of decoder inputs to the model.

    Returns:
        A list with one decoded string per row of the logits.
    """
    with torch.no_grad():  # inference only: do not record an autograd graph
        logits = model(input_ids, decoder_input_ids=decoder_input_ids)[0]
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in logits.argmax(dim=-1)]


# 1) Sampled generation. Sampling is stochastic, so fix the RNG seed to make
#    the printed result reproducible across re-runs of this cell.
torch.manual_seed(0)
translated = model.generate(
    tokenized['input_ids'],
    bad_words_ids=None,
    attention_mask=tokenized['attention_mask'],
    decoder_start_token_id=0,
    min_length=5,
    max_length=20,
    num_beams=1,
    top_k=0,
    top_p=0.9,
    early_stopping=True,
    num_return_sequences=1,
    repetition_penalty=1.0,
    no_repeat_ngram_size=0,
    do_sample=True,
    temperature=1.0,  # if temperature==0, we do not sample
    use_cache=False,
)
tgt_text = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=False) for t in translated]
print(tgt_text)


# 2) Single forward pass with no decoder inputs supplied.
tgt_text2 = decode_forward_pass(model, tokenizer, tokenized['input_ids'])
print(tgt_text2)


# 3) Forward pass fed the target ids exactly as encoded.
tgt_text3 = decode_forward_pass(
    model, tokenizer, tokenized['input_ids'],
    decoder_input_ids=tokenized['decoder_input_ids'],
)
print(tgt_text3)


# 4) Same as 3) but with the final target token dropped.
tgt_text4 = decode_forward_pass(
    model, tokenizer, tokenized['input_ids'],
    decoder_input_ids=tokenized['decoder_input_ids'][:, :-1].contiguous(),
)
print(tgt_text4)


# 5) Target ids shifted one position right; print the ids before and after the
#    shift so the wrap-around of the last non-pad token is visible.
print(tokenized['decoder_input_ids'])
decoder_input_ids = shift_tokens_right(tokenized['decoder_input_ids'], tokenizer.pad_token_id)
print(decoder_input_ids)
tgt_text5 = decode_forward_pass(
    model, tokenizer, tokenized['input_ids'],
    decoder_input_ids=decoder_input_ids,
)
print(tgt_text5)


['WhoDo you have a 8 star rating with over 8 reviews in " fonte " ?']
['WhoWho has the fan star rating with over 8 reviews in " fonte "?']
['Who who. 8 star rating with over 8 reviews in " fonte "?']
['Who who. 8 star rating with over 8 reviews in " fonte "?']
tensor([[    0,    54, 50264,    10,   290,   999,   691,    19,    81,   290,
          6173,    11,    22,   856, 13757,    22, 17487,     2]])
tensor([[    2,     0,    54, 50264,    10,   290,   999,   691,    19,    81,
           290,  6173,    11,    22,   856, 13757,    22, 17487]])
['WhoWho has the fan star rating with over 8 reviews in " fonte "?']


In [106]:
# Show the pad id of whichever tokenizer is currently bound to `tokenizer`;
# this is the value handed to shift_tokens_right as `pad_token_id`.
tokenizer.pad_token_id

1

In [1]:
import ujson

In [3]:
# Read the JSON file inside a context manager so the handle is closed as soon
# as parsing finishes instead of being left to the garbage collector.
with open('./bootleg_material/emb_data/wikidata_types_0905.json') as types_file:
    wikidata_types_0905 = ujson.load(types_file)

In [6]:
# Read the JSON file inside a context manager so the handle is closed as soon
# as parsing finishes instead of being left to the garbage collector.
with open('./bootleg_material/emb_data/wikidata_to_typeid_0905.json') as typeid_file:
    wikidata_to_typeid_0905 = ujson.load(typeid_file)

In [7]:
# Read the JSON file inside a context manager so the handle is closed as soon
# as parsing finishes instead of being left to the garbage collector.
with open('./bootleg_material/emb_data/wikidatatitle_to_typeid_0905.json') as title_typeid_file:
    wikidatatitle_to_typeid_0905 = ujson.load(title_typeid_file)

In [9]:
# Re-key the table: every value of `wikidata_to_typeid_0905` becomes a key,
# mapped to whatever `wikidatatitle_to_typeid_0905` stores under the same
# original key. A key missing from the second table raises KeyError.
# NOTE(review): the variable name suggests QID values, but the lookup table's
# name suggests it holds type ids - confirm against the file contents.
bootlegTypeID_to_wikidataTypeQID = {
    type_id: wikidatatitle_to_typeid_0905[source_key]
    for source_key, type_id in wikidata_to_typeid_0905.items()
}

    

In [11]:
# Write through a context manager so the file is flushed and closed before the
# cell returns, rather than whenever the handle happens to be collected.
with open('./bootleg_material/emb_data/bootlegTypeID_to_wikidataTypeQID.json', 'w') as out_file:
    ujson.dump(bootlegTypeID_to_wikidataTypeQID, out_file)

In [19]:
from collections import defaultdict

# Load the mapping file inside a context manager so the handle is closed
# deterministically, then invert it (values become keys). If two keys share a
# value, the one iterated last wins.
with open('./bootleg_material/emb_data/wikidataqid_to_bootlegtypeid.json') as qid_file:
    wikidataTypeQID_to_bootlegTypeID = ujson.load(qid_file)
bootlegTypeID_to_wikidataTypeQID = {v: k for k, v in wikidataTypeQID_to_bootlegTypeID.items()}

# For every entry of `wikidata_types_0905`, translate each listed id through
# the inverted mapping. An id absent from the mapping raises KeyError.
# NOTE(review): presumably keys are entity QIDs and values are lists of
# bootleg type ids - confirm against the data files.
entityQID_to_wikidataTypeQID = defaultdict(list)
for k, v in wikidata_types_0905.items():
    entityQID_to_wikidataTypeQID[k] = [bootlegTypeID_to_wikidataTypeQID[val] for val in v]


In [20]:
# Write through a context manager so the file is flushed and closed before the
# cell returns, rather than whenever the handle happens to be collected.
with open('./bootleg_material/emb_data/entityQID_to_wikidataTypeQID.json', 'w') as out_file:
    ujson.dump(entityQID_to_wikidataTypeQID, out_file)