### 导入德英预训练分词器

In [2]:
from transformers import AutoTokenizer

# Hub id of the pretrained German->English Marian checkpoint whose tokenizer is reused below.
pretrained_checkpoint = "Helsinki-NLP/opus-mt-de-en"
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)



### 导入数据

In [2]:
from data_prepare.data_load import train_data_raw, test_data_raw, valid_data_raw
import pandas as pd

# Wrap each raw split in its own DataFrame (one statement per split).
train_df = pd.DataFrame(train_data_raw)
test_df = pd.DataFrame(test_data_raw)
vad_df = pd.DataFrame(valid_data_raw)

### 提取训练集、测试集和验证集中的 src(德语)和 trg(英语)

In [3]:
def build_src_trg(df: pd.DataFrame, max_rows: int = 100000):
    """Flatten the nested ``translation`` column into ``de`` and ``en`` columns.

    Args:
        df: Frame with a ``translation`` column whose cells can be indexed
            with the keys ``'de'`` and ``'en'``.
        max_rows: Number of leading rows to keep (passed to ``DataFrame.head``).

    Returns:
        A new frame limited to the first ``max_rows`` rows, with ``de`` and
        ``en`` appended and ``translation`` removed. ``df`` is not modified.
    """
    # Truncate first so the per-row extraction only runs on rows that are kept,
    # instead of processing the whole dataset and discarding most of it.
    subset = df.head(max_rows)
    pairs = subset['translation']
    return subset.assign(
        de=pairs.apply(lambda pair: pair['de']),
        en=pairs.apply(lambda pair: pair['en']),
    ).drop(columns=['translation'])

In [4]:
# Apply the same flattening step to every split, in train / test / valid order.
train_src_trg, test_src_trg, vad_src_trg = map(build_src_trg, (train_df, test_df, vad_df))

### 微调分词器

In [5]:
# For each split: all German sentences followed by all English sentences, as one flat list.
train_corpus = pd.concat([train_src_trg['de'], train_src_trg['en']]).tolist()
test_corpus = pd.concat([test_src_trg['de'], test_src_trg['en']]).tolist()
valid_corpus = pd.concat([vad_src_trg['de'], vad_src_trg['en']]).tolist()

In [6]:
# Build the candidate vocabulary: tokenize every sentence of every split and
# collect all resulting tokens into one flat list (duplicates are kept so they
# can be counted afterwards).
from collections import Counter

new_tokens = [
    piece
    for sentence in train_corpus + test_corpus + valid_corpus
    for piece in tokenizer.tokenize(sentence)
]

In [7]:
# Frequency table: token -> number of occurrences across all collected tokens.
counter = Counter()
counter.update(new_tokens)

In [8]:
# Keep only tokens seen strictly more often than this threshold.
min_token_freq = 10
new_vocab = [token for token in counter if counter[token] > min_token_freq]

In [3]:
# Register the frequent tokens with the tokenizer. `add_tokens` returns the
# number of tokens it actually added.
num_added = tokenizer.add_tokens(new_vocab)

# The former `tokenizer.train_from(new_vocab, epochs=5)` call was removed:
# Transformers tokenizers have no `train_from` method, so it raised
# AttributeError. Retraining via `train_new_from_iterator` is only offered by
# fast tokenizers, and the tokenizer saved by this notebook reports
# `is_fast=False`.
# NOTE(review): any model used with this tokenizer presumably needs
# `model.resize_token_embeddings(len(tokenizer))` when num_added > 0 — confirm.
num_added

NameError: name 'tokenizer' is not defined

In [10]:
# Persist the tokenizer files; the returned tuple of written paths is the cell output.
tokenizer_output_dir = "./tokenizer"
tokenizer.save_pretrained(tokenizer_output_dir)

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/source.spm',
 './tokenizer/target.spm',
 './tokenizer/added_tokens.json')

### 将语句编码为 token id 序列,并进行填充和截断

In [11]:
def tokenizer_pad(row: str, max_length: int = 512):
    """Encode one sentence with the notebook-level ``tokenizer``.

    The function is applied element-wise to the ``de`` / ``en`` columns, so
    ``row`` is a single cell value (a sentence), not a whole Series.

    Args:
        row: The text to encode.
        max_length: Length every encoding is padded or truncated to.

    Returns:
        Whatever the ``tokenizer`` call returns; callers in this notebook read
        its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer(
        row,
        padding="max_length",  # always pad up to max_length
        truncation=True,       # cut inputs longer than max_length
        max_length=max_length,
        return_tensors='pt',   # request PyTorch tensors
    )

In [12]:
def get_data(df: pd.DataFrame):
    """Attach token ids and attention masks for the ``de`` and ``en`` columns.

    Args:
        df: Frame with ``de`` (source) and ``en`` (target) text columns.

    Returns:
        A new frame with four extra columns, in this order: ``src_input``,
        ``trg_input``, ``src_mask``, ``trg_mask``. ``df`` is not modified.
    """
    # Encode each column exactly once and reuse the result for both the ids and
    # the mask; tokenizing is the expensive step and was previously done twice
    # per column.
    src_encoded = df['de'].apply(tokenizer_pad)
    trg_encoded = df['en'].apply(tokenizer_pad)
    return df.assign(
        src_input=src_encoded.apply(lambda enc: enc['input_ids']),
        trg_input=trg_encoded.apply(lambda enc: enc['input_ids']),
        src_mask=src_encoded.apply(lambda enc: enc['attention_mask']),
        trg_mask=trg_encoded.apply(lambda enc: enc['attention_mask']),
    )

In [13]:
train_data, test_data, valid_data = get_data(train_src_trg), get_data(test_src_trg), get_data(vad_src_trg)

In [36]:
import pickle as pkl
from typing import Literal
from pathlib import Path

def extract_src_trg_w_mask(df: pd.DataFrame, data_type: Literal['train', 'test', 'valid']):
    """Pickle the encoded source/target columns and their masks for one split.

    Four files are written under ``./data_input``, named
    ``{data_type}_src.pkl``, ``{data_type}_trg.pkl``,
    ``{data_type}_src_mask.pkl`` and ``{data_type}_trg_mask.pkl``.

    Args:
        df: Frame with ``src_input``, ``trg_input``, ``src_mask`` and
            ``trg_mask`` columns.
        data_type: Split name used as the file-name prefix.

    Raises:
        AssertionError: If ``data_type`` is not one of the three split names.
        KeyError: If one of the required columns is missing.
    """
    assert data_type in ['train', 'test', 'valid'], "Data type should be 'train', 'test' or 'valid'"

    # File-name suffix -> column, looked up before anything is created on disk.
    columns_by_suffix = {
        'src': df['src_input'],
        'trg': df['trg_input'],
        'src_mask': df['src_mask'],
        'trg_mask': df['trg_mask'],
    }

    Path('./data_input').mkdir(exist_ok=True)

    for suffix, column in columns_by_suffix.items():
        with open(f'./data_input/{data_type}_{suffix}.pkl', 'wb') as fp:
            pkl.dump(column, fp)

In [37]:
# Export every split, in train / test / valid order.
for split_name, split_frame in (('train', train_data), ('test', test_data), ('valid', valid_data)):
    extract_src_trg_w_mask(split_frame, split_name)

In [2]:
from transformers import MarianTokenizer

# Reload the tokenizer from the directory it was saved to; the cell displays its repr.
saved_tokenizer_dir = './tokenizer'
MarianTokenizer.from_pretrained(saved_tokenizer_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


MarianTokenizer(name_or_path='./tokenizer', vocab_size=58101, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken(",", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	3: AddedToken(".", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	4: AddedToken("▁the", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	5: AddedToken("▁in", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	6: AddedToken("s", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
