In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from os.path import basename, dirname
from shutil import copyfile

import json
from collections import OrderedDict

import torch
import sentencepiece as spm

from fairseq.models.transformer import TransformerModel
from fairseq.tasks.translation import TranslationTask
from fairseq.hub_utils import GeneratorHubInterface

from transformers import WEIGHTS_NAME, logging

In [3]:
# Work from the parent of the notebook's directory so that the relative paths
# used below resolve (presumably the local `da` package as well — confirm).
# The start directory is recorded the first time this cell runs, which makes
# the cell idempotent: re-running it no longer climbs one more level up on
# every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [5]:
from transformers import FSMTForConditionalGeneration
from da.hugging_fseq_tokenizer import HuggingFseqTokenizer
from da.convert_fsmt_simple_checkpoint_to_pytorch import convert_fsmt_simple_checkpoint_to_pytorch

In [24]:
from types import MethodType

from da.greedy_search_interpret import greedy_search_interpret

# Convert checkpoint

In [10]:
# Paths for the conversion; all of them are relative to the current working
# directory.
exp_path = "experiments/concat"
fseq_checkpoint_path = exp_path + "/checkpoint_best.pt"  # checkpoint to convert
save_dir = exp_path + "/hf"  # where the converted files are written
# Extra inputs handed to the converter: data directory and SentencePiece model.
data_path = "data-prep/bin-data-en-et-base/"
spm_model_file = "data-prep/preproc-models/syscl-en-et.model"

In [11]:
# Run the converter; it writes config.json, pytorch_model.bin and vocab.txt
# into save_dir (see the printed log below).
convert_fsmt_simple_checkpoint_to_pytorch(fseq_checkpoint_path, save_dir, data_path, spm_model_file)

Writing results to experiments/concat/hf
Generating experiments/concat/hf/config.json
Generating experiments/concat/hf/pytorch_model.bin
Generating experiments/concat/hf/vocab.txt
Conversion is done!


In [12]:
# Load the converted model and its tokenizer back from save_dir.
model_hf = FSMTForConditionalGeneration.from_pretrained(save_dir)
tokenizer_hf = HuggingFseqTokenizer.from_pretrained(save_dir)

# Test

### Single sentence

In [16]:
# We start with a single sentence so that the output contains no padding to handle.

In [17]:
# Source sentence used for the single-sentence translation check.
src = "i go to school every day (no)."

In [28]:
# Tokenize the source sentence into PyTorch tensors. The attention mask is
# requested explicitly and token type ids are left out.
single_encode_kwargs = {
    "padding": "longest",
    "return_tensors": "pt",
    "return_token_type_ids": False,
    "return_attention_mask": True,
}
sentence = tokenizer_hf.encode_plus(src, **single_encode_kwargs)

In [29]:
# Display the encoded inputs (input ids and attention mask).
sentence

{'input_ids': tensor([[ 302,  449,    8, 4543,  668,  428,   16, 1048,  149,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [32]:
# Round-trip check: decoding the input ids should give back the source text.
tokenizer_hf.batch_decode(sentence['input_ids'])

['i go to school every day (no).']

In [33]:
# Monkey-patch `greedy_search` on the model instance (not on the class)

In [34]:
# Bind greedy_search_interpret to this model instance and install it as the
# instance's `greedy_search`; the class itself is left untouched.
# NOTE(review): presumably generate() dispatches to self.greedy_search when
# num_beams=1 and do_sample=False — confirm for the installed transformers
# version.
bound_greedy_search = MethodType(greedy_search_interpret, model_hf)
model_hf.greedy_search = bound_greedy_search

In [35]:
# Greedy decoding (one beam, no sampling) of the single sentence, also
# requesting hidden states and attentions in the result.
res = model_hf.generate(
    **sentence,
    output_hidden_states=True,
    output_attentions=True,
    do_sample=False,
    num_beams=1,
)

In [38]:
# List the fields available in the generation result.
res.keys()

dict_keys(['output_ids', 'encoder_hidden_states', 'encoder_attentions', 'decoder_hidden_states', 'decoder_attentions', 'decoder_cross_attentions', 'logits'])

In [39]:
# Decode the generated ids back to text.
tokenizer_hf.batch_decode(res['output_ids'])

['Ma käin koolis iga päev (ei).']

### Batch

In [40]:
# Sentences of different lengths for the batched check.
# NOTE: this rebinds `src` from the single string used above to a list.
src = ["my name is Max.", "my name is Alex.", "i go to school every day (no)"]

In [41]:
# Tokenize all sentences together, padding to the longest one in the batch;
# the attention mask is requested explicitly and token type ids are left out.
batch_encode_kwargs = {
    "padding": "longest",
    "return_tensors": "pt",
    "return_token_type_ids": False,
    "return_attention_mask": True,
}
batch = tokenizer_hf.batch_encode_plus(src, **batch_encode_kwargs)

In [43]:
# Same greedy decoding settings as for the single sentence, applied to the
# padded batch.
r2 = model_hf.generate(
    **batch,
    output_hidden_states=True,
    output_attentions=True,
    num_beams=1,
    do_sample=False,
)

In [48]:
# List the fields available in the batched generation result.
r2.keys()

dict_keys(['output_ids', 'encoder_hidden_states', 'encoder_attentions', 'decoder_hidden_states', 'decoder_attentions', 'decoder_cross_attentions', 'logits'])

In [51]:
# Decode the batch while keeping special tokens, so the padding appended to
# the shorter outputs stays visible.
tokenizer_hf.batch_decode(r2['output_ids'], skip_special_tokens=False)

['minu nimi on Max.<pad><pad><pad><pad>',
 'minu nimi on Alex.<pad><pad><pad><pad>',
 'Ma käin koolis iga päev (ei)']