In [6]:
from datasets import Dataset,load_dataset
from train_tokenizer import Config,get_all_sentences,get_or_train_tokenizer
config = Config()
config.datasource = 'Helsinki-NLP/opus-100'
config.lang_src =  'en'
config.lang_tgt =  'zh'
ds_raw = load_dataset(f"{config.datasource}", f"{config.lang_src}-{config.lang_tgt}",)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
ds_raw

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [3]:
from train_HFtokenizer import train_bbpe_tokenizer
tokenizer = train_bbpe_tokenizer(input_ds=ds_raw['train'],vocab_size=30000,lang=config.lang_tgt)




Tokenizer saved to bbpe_tokenizer_zh.json


In [8]:
text = get_all_sentences(ds_raw['train'],lang=config.lang_tgt)


In [10]:
text_iter= next(text)
text_iter

'减轻酸... 酸痛的药 减轻酸痛的药'

In [11]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("bbpe_tokenizer_zh.json")

In [12]:
tokenizer.get_vocab_size()

30000

In [13]:
text_iter

'减轻酸... 酸痛的药 减轻酸痛的药'

In [18]:
tokens_zh = tokenizer.encode(text_iter)
print(f"ids:{tokens_zh.ids}")
print(f"type_ids:{tokens_zh.type_ids}") # type_ids一般用于区分句子类型。例如，在BERT中，type_ids用于区分句子对中的两个句子，分别标记为0或1。在这个例子中，所有的type_ids都是0，表明这是一个单独的句子。
print(f"tokens:{tokens_zh.tokens}")
print(f"offsets:{tokens_zh.offsets}")

ids:[4501, 8796, 650, 175, 8796, 3014, 213, 1588, 175, 4501, 8796, 3014, 213, 1588]
type_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tokens:['åĩıè½»', 'éħ¸', '...', 'Ġ', 'éħ¸', 'çĹĽ', 'çļĦ', 'èį¯', 'Ġ', 'åĩıè½»', 'éħ¸', 'çĹĽ', 'çļĦ', 'èį¯']
offsets:[(0, 2), (2, 3), (3, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 14), (14, 15), (15, 16), (16, 17), (17, 18)]


In [15]:
tokens_zh = tokenizer.decode(tokens_zh.ids)
tokens_zh

'减轻酸... 酸痛的药 减轻酸痛的药'

In [16]:
id = [3609]
id2token = tokenizer.decode(id)
id2token

'他说'

In [None]:
def decode(self, ids):
     # given ids (list of integers), return Python string
    part_bytes = []
    for idx in ids:
        if idx in self.vocab:    # idx -> bytes   
            part_bytes.append(self.vocab[idx])  # 遍历ids中idx 在vocab找到对应的token utf-8表示
        elif idx in self.inverse_special_tokens: # 如果是特殊token对应的idx 在倒置的special token 字典里查找
            part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8")) # 然后在utf-8编码转化为token
        else:
            raise ValueError(f"invalid token id: {idx}")
    text_bytes = b"".join(part_bytes)
    text = text_bytes.decode("utf-8", errors="replace") # 对text_bytes进行utf-8解码形成token
    return text

In [23]:
text_bytes = b"".join(part_bytes)
text = text_bytes.decode("utf-8", errors="replace") # 对text_bytes进行utf-8解码形成token

'åĩıè½»éħ¸...Ġéħ¸çĹĽçļĦèį¯Ġåĩıè½»éħ¸çĹĽçļĦèį¯'

### 使用无改动hf tokenizer进行BPE训练 
- 该方法针对en数据集适用

In [3]:
from train_HFtokenizer import train_tokenizer

In [4]:
tokenizer = train_tokenizer(input_ds=ds_raw['train'],lang=config.lang_src,save_path='BPEtokenizer_en.json')




Tokenizer saved to BPEtokenizer_en.json


In [5]:
text = get_all_sentences(ds_raw['train'],lang=config.lang_src)


In [8]:
text= next(text)

In [9]:
res_test_en = tokenizer.encode(text)
print(f"ids:{res_test_en.ids}")
print(f"type_ids:{res_test_en.type_ids}") # type_ids一般用于区分句子类型。例如，在BERT中，type_ids用于区分句子对中的两个句子，分别标记为0或1。在这个例子中，所有的type_ids都是0，表明这是一个单独的句子。
print(f"tokens:{res_test_en.tokens}")
print(f"offsets:{res_test_en.offsets}")

ids:[479, 310, 189, 3841, 15, 1915, 278, 10977, 272, 15, 556, 310, 5556, 272, 189, 438, 4057, 15]
type_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tokens:['ĠIt', "'s", 'Ġa', 'Ġchallenge', '.', 'ĠGod', 'Ġis', 'Ġchallenging', 'Ġyou', '.', 'ĠHe', "'s", 'Ġcalling', 'Ġyou', 'Ġa', 'Ġch', 'ump', '.']
offsets:[(0, 2), (2, 4), (5, 6), (7, 16), (16, 17), (18, 21), (22, 24), (25, 36), (37, 40), (40, 41), (42, 44), (44, 46), (47, 54), (55, 58), (59, 60), (61, 63), (63, 66), (66, 67)]


In [12]:
print("tokens length:", len(res_test_en.tokens))
print("ids length:", len(res_test_en.ids))
print(f"compression ratio: {len(res_test_en.tokens) / len(res_test_en.ids):.2f}X")

tokens length: 18
ids length: 18
compression ratio: 1.00X


In [13]:
tokenizer = train_tokenizer(input_ds=ds_raw['train'],lang=config.lang_tgt,save_path='BPEtokenizer_zh.json')




Tokenizer saved to BPEtokenizer_zh.json


In [26]:
text_zh = get_all_sentences(ds_raw['train'],lang=config.lang_tgt)

In [28]:
text_zh1 = next(text_zh)

In [29]:
text_zh1

'减轻酸... 酸痛的药 减轻酸痛的药'

In [30]:
res_test_zh = tokenizer.encode(text_zh1)
print(f"ids:{res_test_zh.ids}")
print(f"type_ids:{res_test_zh.type_ids}") 
print(f"tokens:{res_test_zh.tokens}")
print(f"offsets:{res_test_zh.offsets}")

ids:[175, 4634, 9024, 656, 175, 9024, 3070, 213, 1612, 175, 4634, 9024, 3070, 213, 1612]
type_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tokens:['Ġ', 'åĩıè½»', 'éħ¸', '...', 'Ġ', 'éħ¸', 'çĹĽ', 'çļĦ', 'èį¯', 'Ġ', 'åĩıè½»', 'éħ¸', 'çĹĽ', 'çļĦ', 'èį¯']
offsets:[(0, 0), (0, 2), (2, 3), (3, 6), (7, 7), (7, 8), (8, 9), (9, 10), (10, 11), (12, 12), (12, 14), (14, 15), (15, 16), (16, 17), (17, 18)]
