In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from os.path import basename, dirname
from shutil import copyfile

import json
from collections import OrderedDict

import torch

from fairseq.models.transformer import TransformerModel
from fairseq.tasks.translation import TranslationTask
from fairseq.hub_utils import GeneratorHubInterface

from transformers import WEIGHTS_NAME, logging

In [3]:
# Switch the working directory to the parent of the directory the kernel was
# started in -- presumably so that the relative "experiments/..." paths used
# further down resolve (TODO confirm the intended project root).
#
# The starting directory is captured only on the first execution. A bare
# os.chdir("..") is not idempotent: every re-run of the cell in the same kernel
# would climb one level higher and silently break all relative paths.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [4]:
#from transformers import FSMTForConditionalGeneration
from da.fsmt.modeling_fsmt import FSMTForConditionalGeneration
from da.fsmt.tokenization_fsmt import FSMTTokenizer
from da.fsmt.convert_fsmt_original_pytorch_checkpoint_to_pytorch import convert_fsmt_checkpoint_to_pytorch

In [5]:
from types import MethodType

from da.greedy_search_interpret import greedy_search_interpret

# Convert checkpoint

In [29]:
# Domains that each have their own experiment folder and checkpoint to convert.
domain_names = ["Europarl", "OpenSubtitles", "JRC-Acquis", "EMEA"]

# No SentencePiece model file is handed to the converter.
# (Path kept for reference: "data-prep/preproc-models/syscl-en-et.model")
spm_model_file = None

for main_name in domain_names:
    # Everything for one domain lives under a single experiment folder.
    exp_path = f"experiments/en_et_{main_name}_ft"
    fseq_checkpoint_path = f"{exp_path}/checkpoint100.pt"  # checkpoint to convert
    save_dir = f"{exp_path}/hf"  # the converter reports writing its results here

    # The "-ft" data directory is used; the un-suffixed directory
    # (experiments/bin-data-en-et-{main_name}/) is an alternative not used here.
    data_path = f"experiments/bin-data-en-et-{main_name}-ft/"

    convert_fsmt_checkpoint_to_pytorch(fseq_checkpoint_path, save_dir, data_path, spm_model_file)

Writing results to experiments/en_et_Europarl_ft/hf
Generating experiments/en_et_Europarl_ft/hf/config.json
Generating experiments/en_et_Europarl_ft/hf/pytorch_model.bin
Generating experiments/en_et_Europarl_ft/hf/vocab.txt
Conversion is done!
Writing results to experiments/en_et_OpenSubtitles_ft/hf
Generating experiments/en_et_OpenSubtitles_ft/hf/config.json
Generating experiments/en_et_OpenSubtitles_ft/hf/pytorch_model.bin
Generating experiments/en_et_OpenSubtitles_ft/hf/vocab.txt
Conversion is done!
Writing results to experiments/en_et_JRC-Acquis_ft/hf
Generating experiments/en_et_JRC-Acquis_ft/hf/config.json
Generating experiments/en_et_JRC-Acquis_ft/hf/pytorch_model.bin
Generating experiments/en_et_JRC-Acquis_ft/hf/vocab.txt
Conversion is done!
Writing results to experiments/en_et_EMEA_ft/hf
Generating experiments/en_et_EMEA_ft/hf/config.json
Generating experiments/en_et_EMEA_ft/hf/pytorch_model.bin
Generating experiments/en_et_EMEA_ft/hf/vocab.txt
Conversion is done!


In [30]:
# Convert checkpoint60.pt of the en_et_concat60 experiment; the converted
# files are written to the experiment's hf/ sub-folder.
exp_path = "experiments/en_et_concat60"  # no placeholders, so a plain literal
fseq_checkpoint_path = f"{exp_path}/checkpoint60.pt"
save_dir = f"{exp_path}/hf"
data_path = "experiments/bin-data-en-et-base"
spm_model_file = None  # no SentencePiece model file is passed

convert_fsmt_checkpoint_to_pytorch(fseq_checkpoint_path, save_dir, data_path, spm_model_file)

Writing results to experiments/en_et_concat60/hf
Generating experiments/en_et_concat60/hf/config.json
Generating experiments/en_et_concat60/hf/pytorch_model.bin
Generating experiments/en_et_concat60/hf/vocab.txt
Conversion is done!


In [31]:
# Convert checkpoint101.pt of the en_et_concat101 experiment, reading from the
# same "base" data directory as the concat60 conversion.
exp_path = "experiments/en_et_concat101"  # no placeholders, so a plain literal
fseq_checkpoint_path = f"{exp_path}/checkpoint101.pt"
save_dir = f"{exp_path}/hf"  # output folder for the converted files
data_path = "experiments/bin-data-en-et-base"
spm_model_file = None  # no SentencePiece model file is passed

convert_fsmt_checkpoint_to_pytorch(fseq_checkpoint_path, save_dir, data_path, spm_model_file)

Writing results to experiments/en_et_concat101/hf
Generating experiments/en_et_concat101/hf/config.json
Generating experiments/en_et_concat101/hf/pytorch_model.bin
Generating experiments/en_et_concat101/hf/vocab.txt
Conversion is done!


# Test

In [None]:
# Load the converted model and its tokenizer from `save_dir`.
# NOTE(review): `save_dir` is not assigned in this cell; it holds whatever the
# most recently executed conversion cell left behind, so the model under test
# depends on execution order.
model_hf = FSMTForConditionalGeneration.from_pretrained(save_dir)
# A SentencePiece model is not passed to the tokenizer; the disabled option was
# spm_model=f"{save_dir}/spm_model.spm".
tokenizer_hf = FSMTTokenizer.from_pretrained(save_dir)

### Single sentence

In [10]:
# Override greedy_search on this one instance (the class is left untouched)
# with the project's greedy_search_interpret, bound so it receives model_hf
# as `self`.
patched_greedy_search = MethodType(greedy_search_interpret, model_hf)
model_hf.greedy_search = patched_greedy_search

In [11]:
# we work on single sentences to avoid handling padding on the output

In [12]:
# Raw (unsegmented) form of the test sentence. The following cell reassigns
# `src`, so this value is not the one that gets encoded when the cells run in
# order.
src = "So be it."

In [13]:
# The same sentence written as space-separated pieces, with "▁" in front of the
# word-initial pieces -- presumably the subword segmentation the tokenizer
# expects as input (TODO confirm against the preprocessing pipeline).
src = "▁So ▁be ▁it ."

In [14]:
# Encode the single sentence into PyTorch tensors. Token-type ids and the
# attention mask are switched off, so only the input ids are requested.
encode_kwargs = {
    "padding": "longest",
    "return_tensors": "pt",
    "return_token_type_ids": False,
    "return_attention_mask": False,
}
sentence = tokenizer_hf.encode_plus(src, **encode_kwargs)

In [15]:
# Display the encoding; the recorded output holds a single 'input_ids' tensor.
sentence

{'input_ids': tensor([[690,  23,  46,   5,   2]])}

In [16]:
# Round-trip check: turn the input ids back into text. In the recorded output
# the "▁" markers and the trailing </s> are still present in the decoded string.
tokenizer_hf.batch_decode(sentence['input_ids'])

['▁So ▁be ▁it. </s>']

In [17]:
# Extra outputs requested from generation: hidden states and attentions.
interpret_outputs = dict(output_hidden_states=True, output_attentions=True)
# Decoding setup: sampling off and a single beam.
greedy_decoding = dict(do_sample=False, num_beams=1)

# return_dict=True is not passed (it was disabled in the call).
res = model_hf.generate(**sentence, **interpret_outputs, **greedy_decoding)

In [18]:
# The generation result is indexed by key here, i.e. it is a mapping-like
# object rather than a bare tensor of token ids.
res['output_ids']

tensor([[   2, 2858,  185,    5,    2]])

In [19]:
# Decode the generated ids to text. Special tokens are not stripped: the
# recorded output starts and ends with </s>.
tokenizer_hf.batch_decode(res['output_ids'])

['</s> ▁Olgu ▁nii. </s>']

### Batch

In [27]:
# Disabled batch example: every line of this cell is commented out, so running
# it does nothing. Unlike the single-sentence cells, this variant pads to the
# longest sentence in the batch and requests the attention mask.
#
# src = ["my name is Max.", "my name is Alex.", "i go to school every day (no)"] # should be bpe split

# batch = tokenizer_hf.batch_encode_plus(
#     src,
#     padding="longest", 
#     return_tensors="pt",
#     return_token_type_ids=False,
#     return_attention_mask=True
# )

# r2 = model_hf.generate(**batch,
#                        output_hidden_states=True,
#                        output_attentions=True,
#                        num_beams=1,
#                        do_sample=False)

# tokenizer_hf.batch_decode(r2['output_ids'], skip_special_tokens=False)