<a href="https://colab.research.google.com/github/LesterLian/mBERT-fused-M2M100/blob/main/generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive first: saving models directly on Drive is recommended so that
# nothing is lost if the Colab connection drops.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! mkdir -p /content/drive/MyDrive/Multilingual_Project
% cd /content/drive/MyDrive/Multilingual_Project

/content/drive/MyDrive/Multilingual_Project


In [3]:
# Prepare modified transformers
# ! wget https://github.com/huggingface/transformers/archive/refs/tags/v4.4.2.zip
# ! unzip v4.4.2.zip
# ! cp -rf transformers-4.4.2/src/transformers/ ./
# ! git clone https://github.com/LesterLian/mBERT-fused-M2M100.git
# ! cp -rf mBERT-fused-M2M100/transformers/* ./transformers/
! rm -rf transformers-4.4.2 v4.4.2.zip mBERT-fused-M2M100

In [None]:
#Transformers and sentencepiece are installed
!pip install transformers
!pip install sentencepiece

In [5]:
# Fused Model Definition
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Model, M2M100Tokenizer, BertModel, BertTokenizer

class FusedM2M(M2M100ForConditionalGeneration):
    """M2M-100 generation model whose encoder layers receive BERT attention outputs.

    The wrapper holds a BERT model and an M2M-100 generation model. When
    ``bert_input`` is given, the constructor runs BERT on it and stores the i-th
    BERT attention output on the i-th M2M encoder layer as
    ``bert_attention_output``. ``forward`` is delegated to the wrapped M2M model.

    NOTE(review): the ``embedding_input`` keyword and the ``attention_outputs``
    result field used below are presumably provided by the modified
    ``transformers`` copy prepared earlier in this notebook; confirm before
    running against a stock install.
    """

    def __init__(self, bert: BertModel, m2m: M2M100ForConditionalGeneration, path: str = None, bert_input=None):
        """Wrap ``m2m`` and optionally inject BERT attention outputs into its encoder.

        Args:
            bert: BERT model used to compute the attention outputs.
            m2m: M2M-100 model exposing ``.config``, ``.model`` and
                ``.base_model`` (the tutorial passes an
                ``M2M100ForConditionalGeneration``). Its encoder layers are
                modified in place.
            path: optional path to a state dict that is loaded into ``m2m``.
            bert_input: optional tokenized BERT input (keyword arguments for
                ``bert``). When falsy, no BERT pass is performed.

        Raises:
            IndexError: if BERT returns more attention outputs than ``m2m`` has
                encoder layers.
        """
        super().__init__(m2m.config)
        self.bert = bert
        self.m2m = m2m
        # Share the wrapped model's submodules instead of the freshly built ones.
        self.model = m2m.model
        self.base_model = m2m.base_model
        self.fuse_layer_path = path
        self.bert_input = bert_input

        if self.bert_input:
            # Get BERT embedding
            bert_output = self.bert(**bert_input).last_hidden_state
            # Get BERT attention outputs
            attention_outputs = self.bert(**bert_input, embedding_input=bert_output).attention_outputs
            # Pass in BERT attention outputs to M2M layers (indexing keeps a layer-count
            # mismatch loud instead of silently dropping outputs).
            encoder_layers = self.m2m.model.encoder.layers
            for layer_index, attention_output in enumerate(attention_outputs):
                encoder_layers[layer_index].bert_attention_output = attention_output

        # Load fuse layer. This is independent of ``bert_input``, so a given
        # checkpoint is never silently ignored.
        if self.fuse_layer_path:
            # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
            # map_location="cpu" lets GPU-saved checkpoints load on CPU-only runtimes;
            # load_state_dict then copies the values into the existing parameters.
            m2m.load_state_dict(torch.load(self.fuse_layer_path, map_location="cpu"))

    def forward(self, *args, **kwargs):
        """Delegate the forward pass, unchanged, to the wrapped M2M model."""
        return self.m2m(*args, **kwargs)


In [6]:
# Tutorial: translate one sentence with plain M2M-100, then with the BERT-fused model

M2M_CHECKPOINT = "facebook/m2m100_418M"

hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
zh_text = "生活就像一盒巧克力。"
print('original sentence:', hi_text, zh_text, sep='\n')

m2m = M2M100ForConditionalGeneration.from_pretrained(M2M_CHECKPOINT)
m2m_tokenizer = M2M100Tokenizer.from_pretrained(M2M_CHECKPOINT)

# Chinese -> English: declare the source language on the tokenizer and request
# English output through forced_bos_token_id.
m2m_tokenizer.src_lang = "zh"
m2m_input = m2m_tokenizer(zh_text, return_tensors="pt")
english_lang_id = m2m_tokenizer.get_lang_id("en")

# Baseline translation. This must run before FusedM2M is built, because that
# constructor attaches BERT attention outputs to this same m2m's encoder layers.
generated_tokens = m2m.generate(**m2m_input, forced_bos_token_id=english_lang_id)
print('M2M result:', m2m_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True), sep='\n')

# BERT side: tokenize the same sentence for the multilingual BERT encoder.
# bert_type: 'bert-base-multilingual-cased' or 'bert-large-multilingual-cased'
bert_type = 'bert-base-multilingual-cased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_type)
bert = BertModel.from_pretrained(bert_type)
bert_input = bert_tokenizer(zh_text, return_tensors='pt')

# Fused model generation
fused_model = FusedM2M(bert, m2m, bert_input=bert_input)
generated_tokens = fused_model.generate(**m2m_input, forced_bos_token_id=english_lang_id)
print('Fused model result:', m2m_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True), sep='\n')

original sentence:
जीवन एक चॉकलेट बॉक्स की तरह है।
生活就像一盒巧克力。


Some weights of M2M100ForConditionalGeneration were not initialized from the model checkpoint at facebook/m2m100_418M and are newly initialized: ['model.encoder.layers.0.fuse_layer.weight', 'model.encoder.layers.0.fuse_layer.bias', 'model.encoder.layers.1.fuse_layer.weight', 'model.encoder.layers.1.fuse_layer.bias', 'model.encoder.layers.2.fuse_layer.weight', 'model.encoder.layers.2.fuse_layer.bias', 'model.encoder.layers.3.fuse_layer.weight', 'model.encoder.layers.3.fuse_layer.bias', 'model.encoder.layers.4.fuse_layer.weight', 'model.encoder.layers.4.fuse_layer.bias', 'model.encoder.layers.5.fuse_layer.weight', 'model.encoder.layers.5.fuse_layer.bias', 'model.encoder.layers.6.fuse_layer.weight', 'model.encoder.layers.6.fuse_layer.bias', 'model.encoder.layers.7.fuse_layer.weight', 'model.encoder.layers.7.fuse_layer.bias', 'model.encoder.layers.8.fuse_layer.weight', 'model.encoder.layers.8.fuse_layer.bias', 'model.encoder.layers.9.fuse_layer.weight', 'model.encoder.layers.9.fuse_layer.b

M2M result:
['Life is like a box of chocolate.']
Fused model result:
['Life is like a box of chocolate.']
