In [1]:
from importlib.metadata import version

# Report the installed versions of the libraries this notebook depends on.
for label, dist_name in (
    ("torch version:", "torch"),
    ("datasets version:", "datasets"),
    ("tokenizers version:", "tokenizers"),
):
    # print() separates its arguments with one space, e.g. "torch version: 2.4.0".
    print(label, version(dist_name))

torch version: 2.4.0
datasets version: 3.1.0
tokenizers version: 0.19.1


In [2]:
from datasets import Dataset,load_dataset
from train_tokenizer import Config,get_all_sentences,get_or_train_tokenizer
# Corpus identifier and language pair used by the rest of the notebook.
config = Config()
config.datasource = 'Helsinki-NLP/opus-100'
config.lang_src = 'en'
config.lang_tgt = 'zh'

# The second argument is the language-pair string built from the config ("en-zh" here).
ds_raw = load_dataset(
    config.datasource,
    f"{config.lang_src}-{config.lang_tgt}",
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Inspect the loaded dataset object; as the last expression its repr becomes the cell output.
ds_raw

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [None]:
from train_tokenizer import get_or_train_tokenizer
# One tokenizer per language, both built from the training split.
# NOTE(review): judging by its name, get_or_train_tokenizer reuses a saved tokenizer
# when one exists and trains otherwise — confirm in train_tokenizer.
tokenizer_src = get_or_train_tokenizer(
    config=config,
    ds=ds_raw['train'],
    lang=config.lang_src,
)
tokenizer_tgt = get_or_train_tokenizer(
    config=config,
    ds=ds_raw['train'],
    lang=config.lang_tgt,
)

In [5]:
def get_pair_data(ds: Dataset):
    """Lazily yield the ``'translation'`` entry of every record in ``ds``.

    Args:
        ds: Iterable of mapping-like records, each indexable by the
            ``'translation'`` key.

    Yields:
        The value stored under ``'translation'`` for each record, in
        iteration order. Nothing is read from ``ds`` until the generator
        is advanced.

    Raises:
        KeyError: When a record without a ``'translation'`` key is reached
            (for dict-like records).
    """
    yield from (record['translation'] for record in ds)

In [6]:
# get_pair_data is a generator function, so `single_data` is a lazy stream of
# translation pairs over the training split — not a single pair, despite the name.
single_data = get_pair_data(ds=ds_raw['train'])
# Pull the first pair; as the last expression it is displayed as the cell output.
next(single_data)

{'en': 'Sixty-first session', 'zh': '第六十一届会议'}

In [15]:
# NOTE(review): `single_data` is a shared generator, so every execution of this cell
# consumes one more pair — which sentence lands in `sentences` depends on how many
# times the cell has been run since the generator was created.
sentences = next(single_data)
# Encode only the English side with the source-language tokenizer.
res_test_en = tokenizer_src.encode(sentences['en'])

In [16]:
# Dump the fields of the English encoding, one per line.
# type_ids are generally used to tell sentence types apart: in BERT, for example,
# they mark the two sentences of a sentence pair as 0 and 1. Here every type_id
# is 0, which indicates a single sentence.
print(
    f"ids:{res_test_en.ids}",
    f"type_ids:{res_test_en.type_ids}",
    f"tokens:{res_test_en.tokens}",
    f"offsets:{res_test_en.offsets}",
    sep="\n",
)

ids:[55, 13, 23, 12, 1522, 6, 545, 20, 5829, 22, 6, 128, 13, 23, 2461, 22, 12, 0, 6]
type_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tokens:['It', "'", 's', 'a', 'challenge', '.', 'God', 'is', 'challenging', 'you', '.', 'He', "'", 's', 'calling', 'you', 'a', '[UNK]', '.']
offsets:[(0, 2), (2, 3), (3, 4), (5, 6), (7, 16), (16, 17), (18, 21), (22, 24), (25, 36), (37, 40), (40, 41), (42, 44), (44, 45), (45, 46), (47, 54), (55, 58), (59, 60), (61, 66), (66, 67)]


In [17]:
# Show the Chinese side of the current pair, then encode it with the target tokenizer.
print(sentences['zh'])
res_test_zh = tokenizer_tgt.encode(sentences['zh'])

# type_ids are generally used to tell sentence types apart: in BERT, for example,
# they mark the two sentences of a sentence pair as 0 and 1. Here every type_id
# is 0, which indicates a single sentence.
# NOTE(review): the recorded output of this cell maps each clause around the comma
# to a single [UNK] token — presumably the tokenizer only splits on whitespace and
# punctuation, which does not segment Chinese text; verify in train_tokenizer.
print(
    f"ids:{res_test_zh.ids}",
    f"type_ids:{res_test_zh.type_ids}",
    f"tokens:{res_test_zh.tokens}",
    f"offsets:{res_test_zh.offsets}",
    sep="\n",
)

上帝在挑战你，他说你是笨蛋
ids:[0, 4, 0]
type_ids:[0, 0, 0]
tokens:['[UNK]', '，', '[UNK]']
offsets:[(0, 6), (6, 7), (7, 13)]
