In [59]:
import torch
from transformers import SeamlessM4TTokenizer, SeamlessM4TTokenizerFast

In [60]:
# Pick the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [61]:
# Checkpoint whose tokenizer files are loaded by the cells below.
version = "facebook/hf-seamless-m4t-medium"

# Language codes for the source / target side of the translation pair.
src_lang, tgt_lang = "eng", "cmn"

# One English sentence and its expected Mandarin translation, reused as the
# running example throughout the notebook.
example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
expected_translation_cmn = "联合国秘书长表示叙利亚问题没有军事解决方案"

# SeamlessM4TTokenizer

In [62]:
# Load the slow (Python) tokenizer for the checkpoint, configured with the
# source and target language codes defined above.
tokenizer = SeamlessM4TTokenizer.from_pretrained(
    version,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


SeamlessM4TTokenizer(name_or_path='facebook/hf-seamless-m4t-medium', vocab_size=256000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'additional_special_tokens': ['<pad>', '<unk>', '<s>', '</s>', '__ace__', '__ace_Latn__', '__acm__', '__acq__', '__aeb__', '__afr__', '__ajp__', '__aka__', '__amh__', '__apc__', '__arb__', '__ars__', '__ary__', '__arz__', '__asm__', '__ast__', '__awa__', '__ayr__', '__azb__', '__azj__', '__bak__', '__bam__', '__ban__', '__bel__', '__bem__', '__ben__', '__bho__', '__bjn__', '__bjn_Latn__', '__bod__', '__bos__', '__bug__', '__bul__', '__cat__', '__ceb__', '__ces__', '__cjk__', '__ckb__', '__crh__', '__cym__', '__dan__', '__deu__', '__dik__', '__dyu__', '__dzo__', '__ell__', '__eng__', '__epo__', '__est__', '__eus__', '__ewe__', '__fao__', '__pes__', '__fij__', '__fin__', '__fon_

## special ids and tokens

In [63]:
# Show every special token id, then the matching token strings, one per line.
print(
    tokenizer.all_special_ids,
    tokenizer.all_special_tokens,
    sep="\n",
)

[2, 3, 1, 0, 256001, 256002, 256003, 256004, 256005, 256006, 256007, 256008, 256009, 256010, 256011, 256012, 256013, 256014, 256015, 256016, 256017, 256018, 256019, 256020, 256021, 256022, 256023, 256024, 256025, 256026, 256027, 256028, 256029, 256030, 256031, 256032, 256033, 256034, 256035, 256036, 256037, 256038, 256039, 256040, 256041, 256042, 256043, 256044, 256045, 256046, 256047, 256048, 256049, 256050, 256051, 256052, 256053, 256054, 256055, 256056, 256057, 256058, 256059, 256060, 256061, 256062, 256063, 256064, 256065, 256066, 256067, 256068, 256069, 256070, 256071, 256072, 256073, 256074, 256075, 256076, 256077, 256078, 256079, 256080, 256081, 256082, 256083, 256084, 256085, 256086, 256087, 256088, 256089, 256090, 256091, 256092, 256093, 256094, 256095, 256096, 256097, 256098, 256099, 256100, 256101, 256102, 256103, 256104, 256105, 256106, 256107, 256108, 256109, 256110, 256111, 256112, 256113, 256114, 256115, 256116, 256117, 256118, 256119, 256120, 256121, 256122, 256123, 256

## tokenize(sequence) 对文本进行分词

In [64]:
# Split the example sentence into sub-word tokens (no ids, no special tokens yet).
tokens = tokenizer.tokenize(example_english_phrase)
tokens

['▁UN',
 '▁Chief',
 '▁Say',
 's',
 '▁There',
 '▁Is',
 '▁No',
 '▁Milit',
 'ary',
 '▁S',
 'olution',
 '▁in',
 '▁Syria']

## convert_tokens_to_ids 将分词后的token映射为数字

In [65]:
# Map each sub-word token to its integer vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens=tokens)
ids

[16297,
 134408,
 8165,
 248066,
 14734,
 950,
 1135,
 105721,
 3573,
 83,
 27352,
 108,
 49486]

## encode = tokenize + convert_tokens_to_ids

In [66]:
# Without special tokens, encode() yields the same ids as
# tokenize() followed by convert_tokens_to_ids().
print(
    tokenizer.encode(
        text=example_english_phrase,
        add_special_tokens=False,
    )
)

# With special tokens, extra ids are added around the sentence.
ids = tokenizer.encode(
    text=example_english_phrase,
    add_special_tokens=True,
)
ids

[16297, 134408, 8165, 248066, 14734, 950, 1135, 105721, 3573, 83, 27352, 108, 49486]


[3,
 256200,
 16297,
 134408,
 8165,
 248066,
 14734,
 950,
 1135,
 105721,
 3573,
 83,
 27352,
 108,
 49486,
 3]

In [67]:
# Look up the token strings behind two specific special-token ids.
tokenizer.convert_ids_to_tokens(
    ids=[256047, 3],
)

['__eng__', '</s>']

## batch_encode_plus = batch encode

In [68]:
# Encode a one-element batch; the result holds `input_ids` and `attention_mask`.
tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[example_english_phrase],
    add_special_tokens=True,
    padding=True,
)

{'input_ids': [[3, 256200, 16297, 134408, 8165, 248066, 14734, 950, 1135, 105721, 3573, 83, 27352, 108, 49486, 3]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## convert_ids_to_tokens 将数字映射为token

In [69]:
# Full round trip from ids to tokens, special tokens included.
print(tokenizer.convert_ids_to_tokens(ids=ids))

# Same conversion with the special tokens filtered out.
tokens = tokenizer.convert_ids_to_tokens(
    ids=ids,
    skip_special_tokens=True,
)
tokens

['</s>', '__cmn__', '▁UN', '▁Chief', '▁Say', 's', '▁There', '▁Is', '▁No', '▁Milit', 'ary', '▁S', 'olution', '▁in', '▁Syria', '</s>']


['▁UN',
 '▁Chief',
 '▁Say',
 's',
 '▁There',
 '▁Is',
 '▁No',
 '▁Milit',
 'ary',
 '▁S',
 'olution',
 '▁in',
 '▁Syria']

## convert_tokens_to_string 将token转换为string

In [70]:
# Join the sub-word tokens back into a plain sentence.
tokenizer.convert_tokens_to_string(tokens=tokens)

'UN Chief Says There Is No Military Solution in Syria'

## decode = convert_ids_to_tokens + convert_tokens_to_string

In [71]:
# Decode the ids twice: first keeping special tokens, then dropping them.
print(
    tokenizer.decode(ids),
    tokenizer.decode(ids, skip_special_tokens=True),
    sep="\n",
)

</s> __cmn__ UN Chief Says There Is No Military Solution in Syria</s>
UN Chief Says There Is No Military Solution in Syria


## batch_decode = batch decode

In [72]:
# Batch variant of decode(): first keeping special tokens, then dropping them.
print(
    tokenizer.batch_decode([ids]),
    tokenizer.batch_decode([ids], skip_special_tokens=True),
    sep="\n",
)

['</s> __cmn__ UN Chief Says There Is No Military Solution in Syria</s>']
['UN Chief Says There Is No Military Solution in Syria']


## tokenizer([sequence])

In [73]:
# Encode one source sentence together with its target translation in a single
# call; the target side is returned under the `labels` key.
inputs = tokenizer(
    text=example_english_phrase,
    text_target=expected_translation_cmn,
    padding=True,                # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    return_length=True,          # also return a `length` entry
    return_attention_mask=True,  # also return `attention_mask`
    return_tensors="pt",
)

# Print, one per line: the returned keys, the input ids, the attention mask
# (1 for real tokens, 0 for padding) and the length reported by the tokenizer.
print(
    inputs.keys(),
    inputs["input_ids"],
    inputs["attention_mask"],
    inputs["length"],
    sep="\n",
)

dict_keys(['input_ids', 'attention_mask', 'length', 'labels'])
tensor([[256047,  16297, 134408,   8165, 248066,  14734,    950,   1135, 105721,
           3573,     83,  27352,    108,  49486,      3,      0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
tensor([16])


In [74]:
# Encode a batch of two identical source sentences with their targets.
# `text_target` gets one entry per source sentence so that `labels` has the
# same batch size as `input_ids` (a single target string would give only one
# label row for two inputs).
inputs = tokenizer(
    text=[example_english_phrase] * 2,
    text_target=[expected_translation_cmn] * 2,
    padding=True,                # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    return_length=True,          # also return a `length` entry
    return_attention_mask=True,  # also return `attention_mask`
    return_tensors="pt",
)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"])  # 1 for real tokens, 0 for padding
print(inputs["length"])          # sequence length reported by the tokenizer

dict_keys(['input_ids', 'length', 'attention_mask', 'labels'])
tensor([[256047,  16297, 134408,   8165, 248066,  14734,    950,   1135, 105721,
           3573,     83,  27352,    108,  49486,      3,      0],
        [256047,  16297, 134408,   8165, 248066,  14734,    950,   1135, 105721,
           3573,     83,  27352,    108,  49486,      3,      0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
tensor([15, 15])


# SeamlessM4TTokenizerFast

In [75]:
# Load the fast (Rust-backed) tokenizer for the same checkpoint.
# The language codes are passed explicitly, exactly as for the slow tokenizer
# above; without them the tokenizer falls back to its default languages, which
# would not match the eng -> cmn example pair used in this notebook.
tokenizer: SeamlessM4TTokenizerFast = SeamlessM4TTokenizerFast.from_pretrained(
    version,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


SeamlessM4TTokenizerFast(name_or_path='facebook/hf-seamless-m4t-medium', vocab_size=256001, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'additional_special_tokens': ['<pad>', '<unk>', '<s>', '</s>', '__ace__', '__ace_Latn__', '__acm__', '__acq__', '__aeb__', '__afr__', '__ajp__', '__aka__', '__amh__', '__apc__', '__arb__', '__ars__', '__ary__', '__arz__', '__asm__', '__ast__', '__awa__', '__ayr__', '__azb__', '__azj__', '__bak__', '__bam__', '__ban__', '__bel__', '__bem__', '__ben__', '__bho__', '__bjn__', '__bjn_Latn__', '__bod__', '__bos__', '__bug__', '__bul__', '__cat__', '__ceb__', '__ces__', '__cjk__', '__ckb__', '__crh__', '__cym__', '__dan__', '__deu__', '__dik__', '__dyu__', '__dzo__', '__ell__', '__eng__', '__epo__', '__est__', '__eus__', '__ewe__', '__fao__', '__pes__', '__fij__', '__fin__', '__f

In [76]:
# Encode a batch of two identical source sentences with their targets.
# `text_target` gets one entry per source sentence so that `labels` has the
# same batch size as `input_ids` (a single target string would give only one
# label row for two inputs).
inputs = tokenizer(
    text=[example_english_phrase] * 2,
    text_target=[expected_translation_cmn] * 2,
    padding=True,                # padding strategy: True, 'longest', 'max_length' or 'do_not_pad'
    return_length=True,          # also return a `length` entry
    return_attention_mask=True,  # also return `attention_mask`
    return_tensors="pt",
)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"])  # 1 for real tokens, 0 for padding
print(inputs["length"])          # sequence length reported by the tokenizer

dict_keys(['input_ids', 'attention_mask', 'length', 'labels'])
tensor([[256047,  16297, 134408,   8165, 248066,  14734,    950,   1135, 105721,
           3573,     83,  27352,    108,  49486,      3,      0],
        [256047,  16297, 134408,   8165, 248066,  14734,    950,   1135, 105721,
           3573,     83,  27352,    108,  49486,      3,      0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
tensor([16, 16])
