In [1]:
import os
from os.path import basename, dirname
from shutil import copyfile

import json
from collections import OrderedDict

import torch
import sentencepiece as spm

from fairseq.models.transformer import TransformerModel
from fairseq.tasks.translation import TranslationTask
from fairseq.hub_utils import GeneratorHubInterface

from transformers import WEIGHTS_NAME, logging
from transformers.configuration_fsmt import FSMTConfig
from transformers.modeling_fsmt import FSMTForConditionalGeneration

In [2]:
os.chdir("..")

In [3]:
from da.hugging_fseq_tokenizer import HuggingFseqTokenizer
from da.convert_fsmt_simple_checkpoint_to_pytorch import convert_fsmt_simple_checkpoint_to_pytorch

# Convert checkpoint

In [4]:
exp_path = "experiments/concat"
fseq_checkpoint_path = f"{exp_path}/checkpoint_best.pt"
save_dir = f"{exp_path}/hf"
data_path = "data-prep/bin-data-en-et-base/"
spm_model_file = "data-prep/preproc-models/syscl-en-et.model"

In [5]:
convert_fsmt_simple_checkpoint_to_pytorch(
    fseq_checkpoint_path, 
    save_dir, 
    data_path, 
    spm_model_file
)

Writing results to experiments/concat_base/hf
Generating experiments/concat_base/hf/config.json
Generating experiments/concat_base/hf/pytorch_model.bin
Generating experiments/concat_base/hf/vocab.txt
Conversion is done!


In [6]:
tokenizer_hf = HuggingFseqTokenizer.from_pretrained(save_dir)
model_hf = FSMTForConditionalGeneration.from_pretrained(save_dir)

# Test

### Single sentence

In [53]:
src = ["my name is Max."]

In [54]:
batch = tokenizer_hf.batch_encode_plus(
    src,
    padding="longest", 
    return_tensors="pt",
    return_token_type_ids=False,
    return_attention_mask=True
)

In [55]:
out_batch = model_hf.generate(**batch,
                       return_dict=True,
                       output_hidden_states=True,
                       output_attentions=True)

In [56]:
out_batch

tensor([[   2,  283, 1713,    9, 8570,    5,    2]])

In [57]:
tokenizer_hf.batch_decode(out_batch)

['minu nimi on Max.']

### Batch

In [58]:
src = ["my name is Max.", "my name is Alex.", "i go to school every day (no)"]

In [59]:
batch = tokenizer_hf.batch_encode_plus(
    src,
    padding="longest", 
    return_tensors="pt",
    return_token_type_ids=False,
    return_attention_mask=True
)

In [60]:
model_hf.generate??

In [61]:
out_batch = model_hf.generate(**batch,
                       return_dict=True,
                       output_hidden_states=True,
                       output_attentions=True)

In [62]:
out_batch

tensor([[    2,   283,  1713,     9,  8570,     5,     2,     1,     1,     1,
             1],
        [    2,   283,  1713,     9, 12102,     5,     2,     1,     1,     1,
             1],
        [    2,    87,  3302,   138, 17849,   257,  2686,    16,  5682,    15,
             2]])

In [63]:
tokenizer_hf.batch_decode(out_batch)

['minu nimi on Max.<pad><pad><pad><pad>',
 'minu nimi on Alex.<pad><pad><pad><pad>',
 'Ma käin koolis iga päev (ei)']

In [64]:
out_batch = model_hf.forward(**batch,
                       return_dict=True,
                       output_hidden_states=True,
                       output_attentions=True)

In [65]:
out_batch.keys()

odict_keys(['logits', 'decoder_hidden_states', 'decoder_attentions', 'encoder_last_hidden_state', 'encoder_hidden_states', 'encoder_attentions'])